# Chapter 7 - Aggregation - 

# Setup

In [1]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.DataFrame

import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

In [2]:
%%html
<!-- To left align the HTML components in Markdown -->
<style>
table {float:left}
</style>

### Spark parition control based on core availability

In [3]:
val NUM_CORES = 2
val NUM_PARTITIONS = 2

lazy val spark: SparkSession = SparkSession.builder()
    .master("local")
    .appName("flight")
    .getOrCreate()

spark.conf.set("spark.default.parallelism", 8)
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
/*
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.driver.memory", "6g")
spark.conf.set("spark.executor.memory", "2g")
spark.conf.set("spark.master", "spark://masa:7077")
*/
import spark.implicits._

NUM_CORES = 2
NUM_PARTITIONS = 2
spark = <lazy>


<lazy>

In [4]:
val configMap = spark.conf.getAll.foreach(println)

(spark.serializer,org.apache.spark.serializer.KryoSerializer)
(spark.driver.host,172.17.0.1)
(spark.eventLog.enabled,true)
(spark.driver.port,46137)
(spark.hadoop.validateOutputSpecs,True)
(spark.repl.class.uri,spark://172.17.0.1:46137/classes)
(spark.jars,file:/home/oonisim/.local/share/jupyter/kernels/apache_toree_scala/lib/toree-assembly-0.3.0-incubating.jar)
(spark.repl.class.outputDir,/tmp/spark-70df602a-bc19-469d-806a-8ade41985168/repl-67091ce4-e778-456c-91e2-dae5be477de5)
(spark.app.name,flight)
(spark.driver.memory,2g)
(spark.executor.instances,2)
(spark.history.fs.logdirectory,hdfs://oonisim:8020/logs_spark)
(spark.default.parallelism,8)
(spark.executor.id,driver)
(spark.submit.deployMode,client)
(spark.master,local)
(spark.executor.memory,4g)
(spark.eventLog.dir,hdfs://oonisim:8020/logs_spark)
(spark.executor.cores,4)
(spark.app.id,local-1576109311900)
(spark.sql.shuffle.partitions,4)


configMap: Unit = ()


## Constants

# Tools

### Elapsed time profiler

In [5]:
import scala.collection.mutable.ListBuffer

val timing = new StringBuffer
val times = new ListBuffer[Long]()

def clear(): Unit = {
    timing.setLength(0)
    times.clear
}
def average(): Long = {
    times.reduce(_+_) / times.length
}

/**
@param label Description about the run
@code code to execute
@return execution
*/
def _timed[T](label: String, code: => T): T = {
    val start = System.currentTimeMillis()
    val result = code
    val stop = System.currentTimeMillis()
    timing.append(s"Processing $label took ${stop - start} ms.\n")
    times.append(stop - start)
    result
}

val timed = _timed _

timing = 
times = ListBuffer()
timed = > Nothing) => Nothing = <function2>


clear: ()Unit
average: ()Long
_timed: [T](label: String, code: => T)T


> Nothing) => Nothing = <function2>

In [6]:
// To flush out error: missing argument list for method timed
println("")

<console>:56: error: missing argument list for method _timed
Unapplied methods are only converted to functions when a function type is expected.
You can make this conversion explicit by writing `_timed _` or `_timed(_,_)` instead of `_timed`.
       _timed
       ^
lastException: Throwable = null


### Save to file

In [7]:
def save(df: DataFrame) = {
    df.coalesce(1)
    .write
    .format("csv")
    .mode(SaveMode.Overwrite)
    .option("header", "true")
    .save(RESULT_DIR)
}

Name: Unknown Error
Message: <console>:59: error: not found: value RESULT_DIR
           .save(RESULT_DIR)
                 ^

StackTrace: 

# Dataframe

In [8]:
val PROTOCOL="file://"
val HOME_DIR="/home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe"
val df = spark.read.format("csv")
  .option("header", "true")
  .option("inferSchema", "true")
  .load(PROTOCOL + HOME_DIR + "/data/retail-data/all/*.csv")
  .coalesce(5)
df.cache()
df.createOrReplaceTempView("dfTable")

PROTOCOL = file://
HOME_DIR = /home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe
df = [InvoiceNo: string, StockCode: string ... 6 more fields]


[InvoiceNo: string, StockCode: string ... 6 more fields]

In [9]:
df.show(3, true)

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
|   536365|   85123A|WHITE HANGING HEA...|       6|12/1/2010 8:26|     2.55|     17850|United Kingdom|
|   536365|    71053| WHITE METAL LANTERN|       6|12/1/2010 8:26|     3.39|     17850|United Kingdom|
|   536365|   84406B|CREAM CUPID HEART...|       8|12/1/2010 8:26|     2.75|     17850|United Kingdom|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+
only showing top 3 rows



In [10]:
val dfWithDate = df.withColumn(
    "date", 
    to_date(col("InvoiceDate"),"MM/d/yyyy H:mm")
)
dfWithDate.createOrReplaceTempView("dfWithDate")

dfWithDate = [InvoiceNo: string, StockCode: string ... 7 more fields]


[InvoiceNo: string, StockCode: string ... 7 more fields]

# Window
Spark supports three kinds of window functions: ranking functions, analytic functions, and aggregate functions. The frame specification (the rowsBetween statement) states which rows will be included in the frame based on its reference to the current input row. 

In [11]:
val windowSpec = Window
  .partitionBy("CustomerId", "date")
  .orderBy(desc("Quantity"))
//  .rowsBetween(Window.unboundedPreceding, Window.currentRow)

windowSpec = org.apache.spark.sql.expressions.WindowSpec@5fe51dcf


org.apache.spark.sql.expressions.WindowSpec@5fe51dcf

In [12]:
val purchaseDenseRank = dense_rank().over(windowSpec)
val purchaseRank = rank().over(windowSpec)

purchaseDenseRank = DENSE_RANK() OVER (PARTITION BY CustomerId, date ORDER BY Quantity DESC NULLS LAST unspecifiedframe$())
purchaseRank = RANK() OVER (PARTITION BY CustomerId, date ORDER BY Quantity DESC NULLS LAST unspecifiedframe$())


RANK() OVER (PARTITION BY CustomerId, date ORDER BY Quantity DESC NULLS LAST unspecifiedframe$())

In [13]:
dfWithDate.where("CustomerId IS NOT NULL").orderBy("CustomerId")
  .select(
    col("CustomerId"),
    col("date"),
    col("Quantity"),
    purchaseRank.alias("quantityRank"),
    purchaseDenseRank.alias("quantityDenseRank")
).show()

+----------+----------+--------+------------+-----------------+
|CustomerId|      date|Quantity|quantityRank|quantityDenseRank|
+----------+----------+--------+------------+-----------------+
|     12346|2011-01-18|   74215|           1|                1|
|     12346|2011-01-18|  -74215|           2|                2|
|     12347|2010-12-07|      36|           1|                1|
|     12347|2010-12-07|      30|           2|                2|
|     12347|2010-12-07|      24|           3|                3|
|     12347|2010-12-07|      12|           4|                4|
|     12347|2010-12-07|      12|           4|                4|
|     12347|2010-12-07|      12|           4|                4|
|     12347|2010-12-07|      12|           4|                4|
|     12347|2010-12-07|      12|           4|                4|
|     12347|2010-12-07|      12|           4|                4|
|     12347|2010-12-07|      12|           4|                4|
|     12347|2010-12-07|      12|        

## Rollup
An aggregation across multiple groups. 

In [14]:
val dfNoNull = dfWithDate.drop()
dfNoNull.createOrReplaceTempView("dfNoNull")

dfNoNull = [InvoiceNo: string, StockCode: string ... 7 more fields]


[InvoiceNo: string, StockCode: string ... 7 more fields]

### Total & Sub Total

In [15]:
val rolledUpDF = dfNoNull.rollup("Date", "Country").agg(sum("Quantity"))
  .selectExpr("Date", "Country", "`sum(Quantity)` as total_quantity")
  .orderBy("Date")
rolledUpDF.show()

+----------+--------------+--------------+
|      Date|       Country|total_quantity|
+----------+--------------+--------------+
|      null|          null|       5176450|
|2010-12-01|United Kingdom|         23949|
|2010-12-01|       Germany|           117|
|2010-12-01|     Australia|           107|
|2010-12-01|          EIRE|           243|
|2010-12-01|        France|           449|
|2010-12-01|        Norway|          1852|
|2010-12-01|   Netherlands|            97|
|2010-12-01|          null|         26814|
|2010-12-02|          null|         21023|
|2010-12-02|          EIRE|             4|
|2010-12-02|United Kingdom|         20873|
|2010-12-02|       Germany|           146|
|2010-12-03|United Kingdom|         10439|
|2010-12-03|          null|         14830|
|2010-12-03|         Italy|           164|
|2010-12-03|       Germany|           170|
|2010-12-03|   Switzerland|           110|
|2010-12-03|       Belgium|           528|
|2010-12-03|        Poland|           140|
+----------

rolledUpDF = [Date: date, Country: string ... 1 more field]


[Date: date, Country: string ... 1 more field]

## Cube
Disect the entire data with all the combinations of specified columns.

In [16]:
// Sum of quantity with all (date, country) combination as well as over all dates.
dfNoNull.cube("Date", "Country").agg(sum(col("Quantity")))
    .where(col("Country") === "Denmark")
    .select("Date", "Country", "sum(Quantity)")
    .orderBy(expr("grouping_id()").desc)
    .show()

+----------+-------+-------------+
|      Date|Country|sum(Quantity)|
+----------+-------+-------------+
|      null|Denmark|         8188|
|2011-02-16|Denmark|          224|
|2011-06-23|Denmark|          358|
|2011-08-10|Denmark|          338|
|2011-09-26|Denmark|          230|
|2011-09-29|Denmark|          202|
|2011-10-11|Denmark|          136|
|2011-05-11|Denmark|          368|
|2011-11-17|Denmark|         1025|
|2010-12-09|Denmark|          454|
|2011-03-17|Denmark|          879|
|2011-06-20|Denmark|          393|
|2011-07-25|Denmark|          241|
|2011-08-09|Denmark|           -9|
|2011-09-21|Denmark|         1176|
|2011-10-17|Denmark|          -37|
|2011-11-30|Denmark|          512|
|2011-06-09|Denmark|          793|
|2011-10-07|Denmark|          637|
|2011-10-21|Denmark|           95|
+----------+-------+-------------+
only showing top 20 rows



### Grouping ID
Grouping ID	Description  
0: Aggregatino at combination of each columns (date, country)
1. Aggregation at the 1st column (date)
2. Aggregation at the 2nd column (ountry)
3. Total aggregation regardless columns.

In [17]:
dfNoNull
    .cube("Date", "Country")
    .agg(
        grouping_id(),
        sum(col("Quantity"))
    )
    .where(col("Country") === "Denmark")
    .select("Date", "Country", "grouping_id()", "sum(Quantity)")
    .orderBy(expr("grouping_id()").desc)
    .show(5)

+----------+-------+-------------+-------------+
|      Date|Country|grouping_id()|sum(Quantity)|
+----------+-------+-------------+-------------+
|      null|Denmark|            2|         8188|
|2011-10-11|Denmark|            0|          136|
|2011-06-23|Denmark|            0|          358|
|2011-08-10|Denmark|            0|          338|
|2011-09-26|Denmark|            0|          230|
+----------+-------+-------------+-------------+
only showing top 5 rows



In [18]:
dfNoNull
    .cube("Date", "Country")
    .agg(
        grouping_id(),
        sum(col("Quantity"))
    )
    .where(col("Date") === "2011-10-11")
    .select("Date", "Country", "grouping_id()", "sum(Quantity)")
    .orderBy(expr("grouping_id()").desc)
    .show(5)

+----------+-------+-------------+-------------+
|      Date|Country|grouping_id()|sum(Quantity)|
+----------+-------+-------------+-------------+
|2011-10-11|   null|            1|        20091|
|2011-10-11|Belgium|            0|          162|
|2011-10-11| France|            0|         2788|
|2011-10-11|Germany|            0|          567|
|2011-10-11|   EIRE|            0|         2206|
+----------+-------+-------------+-------------+
only showing top 5 rows



## Pivot

## User-Defined Aggregation Functions

In [19]:
import org.apache.spark.sql.expressions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.Row

class BoolAnd extends UserDefinedAggregateFunction {
  def inputSchema: org.apache.spark.sql.types.StructType =
    StructType(StructField("value", BooleanType) :: Nil)
  def bufferSchema: StructType = StructType(
    StructField("result", BooleanType) :: Nil
  )
  def dataType: DataType = BooleanType
  def deterministic: Boolean = true
  def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = true
  }
  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    buffer(0) = buffer.getAs[Boolean](0) && input.getAs[Boolean](0)
  }
  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getAs[Boolean](0) && buffer2.getAs[Boolean](0)
  }
  def evaluate(buffer: Row): Any = {
    buffer(0)
  }
}

defined class BoolAnd


In [20]:
val ba = new BoolAnd
spark.udf.register("booland", ba)
import org.apache.spark.sql.functions._
spark.range(1)
  .selectExpr("explode(array(TRUE, TRUE, TRUE)) as t")
  .selectExpr("explode(array(TRUE, FALSE, TRUE)) as f", "t")
  .select(ba(col("t")), expr("booland(f)"))
  .show()

+----------+----------+
|booland(t)|booland(f)|
+----------+----------+
|      true|     false|
+----------+----------+



ba = BoolAnd@5eceb18b


BoolAnd@5eceb18b