/* This Source Code Form is subject to the terms of the Mozilla Public
* License, v. 2.0. If a copy of the MPL was not distributed with this
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
package com.mozilla.telemetry.views
import com.mozilla.telemetry.utils.UDFs._
import com.mozilla.telemetry.utils.getOrCreateSparkSession
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.rogach.scallop._
/**
 * Batch job object for the retention view; extends BatchJobBase.
 *
 * NOTE(review): in this listing several constructs are visibly incomplete
 * (unclosed parentheses/braces, empty right-hand sides). The comments below
 * describe only what the visible lines show — confirm against the full file.
 */
object RetentionView extends BatchJobBase {
// Command-line options for the job, declared with Scallop.
class Conf(args: Array[String]) extends ScallopConf(args) {
// Required string options: --date, --input and --bucket.
val date = opt[String]("date", descr = "Run date for this job", required = true)
val input = opt[String]("input", descr = "Source for parquet data", required = true)
val bucket = opt[String]("bucket", descr = "output bucket", required = true)
// Optional output prefix; falls back to "retention/v1" when not supplied.
val prefix = opt[String]("prefix",
descr = "output prefix",
required = false,
default = Some("retention/v1")
// Optional integer option with a default of 13.
// NOTE(review): no explicit option-name argument is visible in this opt[Int]( call — confirm.
val hllBits = opt[Int](
descr = "Number of bits to use for hll. 13 bits is 8192 bytes with an error of 0.0115. Defaults to 13.",
required = false,
default = Some(13)
// String entries used by transform both as grouping keys and as select expressions.
// NOTE(review): the list contents are not visible in this listing.
val dimensions: List[String] = List(
// String entries appended to the expression list in transform after the dimensions.
// NOTE(review): the list contents are not visible in this listing.
val metrics: List[String] = List(
/**
 * Produces a DataFrame from `dataframe`, with grouping keyed on `dimensions`.
 *
 * @param dataframe input DataFrame
 * @param hllBits   value interpolated as the second argument of the `hll_create` expression
 * @return the resulting DataFrame
 */
def transform(dataframe: DataFrame, hllBits: Int): DataFrame = {
// Expression list: an `hll_create(client_id, <hllBits>)` expression aliased `hll`,
// followed by every dimension and then every metric.
// presumably hll_create is a SQL function registered via the imported UDFs — verify.
val expr = List(s"hll_create(client_id, $hllBits) as hll") ++ dimensions ++ metrics
// Group by all dimensions; `dimensions.head` throws if the list is empty.
// NOTE(review): the receiver of this .groupBy call is not visible here — confirm.
.groupBy(dimensions.head, dimensions.tail:_*)
// Entry point: parses arguments, obtains a Spark session and runs transform.
def main(args: Array[String]) {
val conf = new Conf(args)
// Session is requested under the name "Retention".
val spark = getOrCreateSparkSession("Retention")
// NOTE(review): the right-hand sides of `date` and `df` are not visible in this listing.
val date =
val df =
val result = transform(df, conf.hllBits())
// Stop the session only when shouldStopContextAtEnd(spark) says so.
if (shouldStopContextAtEnd(spark)) { spark.stop() }
