# Chapter 21 - Streaming  Aggregation
Run aggregation on streaming.

# Setup

In [1]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.DataFrame

import org.apache.spark.sql.streaming.Trigger

import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

In [2]:
%%html
<!-- To left align the HTML components in Markdown -->
<style>
table {float:left}
</style>

### Spark parition control based on core availability

In [3]:
val NUM_CORES = 4
val NUM_PARTITIONS = 3

lazy val spark: SparkSession = SparkSession.builder()
    .appName("streaming")
    .getOrCreate()

spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
/*
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.driver.memory", "6g")
spark.conf.set("spark.executor.memory", "2g")
spark.conf.set("spark.master", "spark://oonisim:7077")
*/
import spark.implicits._

NUM_CORES = 4
NUM_PARTITIONS = 3
spark = <lazy>


<lazy>

In [4]:
val configMap = spark.conf.getAll.foreach(println)

(spark.serializer,org.apache.spark.serializer.KryoSerializer)
(spark.driver.host,172.17.0.1)
(spark.eventLog.enabled,true)
(spark.driver.port,40563)
(spark.hadoop.validateOutputSpecs,True)
(spark.repl.class.uri,spark://172.17.0.1:40563/classes)
(spark.jars,file:/home/oonisim/.local/share/jupyter/kernels/apache_toree_scala/lib/toree-assembly-0.3.0-incubating.jar)
(spark.repl.class.outputDir,/tmp/spark-802b15dd-23c3-445d-b645-f32c29b463c9/repl-9298f6a6-1b0d-4ace-b4cd-d186b70da1f2)
(spark.app.name,streaming)
(spark.driver.memory,3g)
(spark.executor.instances,2)
(spark.history.fs.logdirectory,hdfs://oonisim:8020/logs_spark)
(spark.default.parallelism,12)
(spark.executor.id,driver)
(spark.submit.deployMode,client)
(spark.master,yarn)
(spark.ui.filters,org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter)
(spark.executor.memory,4g)
(spark.eventLog.dir,hdfs://oonisim:8020/logs_spark)
(spark.executor.cores,4)
(spark.driver.appUIAddress,http://172.17.0.1:4040)
(spark.org.apache.hadoop.yar

configMap: Unit = ()


## Constants

# Constant

In [5]:
val PROTOCOL="file://"
val DATA_DIR="/home/oonisim/home/repositories/git/oonisim/spark-programs/Streaming/data"

PROTOCOL = file://
DATA_DIR = /home/oonisim/home/repositories/git/oonisim/spark-programs/Streaming


/home/oonisim/home/repositories/git/oonisim/spark-programs/Streaming

# Main

## Schema from Dataframe
Retrieve the data schema from DataFrame.

Structured Streaming does not let you perform schema inference without explicitly enabling it. You can enable schema inference for this by setting the configuration spark.sql.streaming.schemaInference to true. 

In [6]:
val static = spark.read.json(PROTOCOL + DATA_DIR + "/activity-data/")
val dataSchema = static.schema

static = [Arrival_Time: bigint, Creation_Time: bigint ... 8 more fields]
dataSchema = StructType(StructField(Arrival_Time,LongType,true), StructField(Creation_Time,LongType,true), StructField(Device,StringType,true), StructField(Index,LongType,true), StructField(Model,StringType,true), StructField(User,StringType,true), StructField(gt,StringType,true), StructField(x,DoubleType,true), StructField(y,DoubleType,true), StructField(z,DoubleType,true))


StructType(StructField(Arrival_Time,LongType,true), StructField(Creation_Time,LongType,true), StructField(Device,StringType,true), StructField(Index,LongType,true), StructField(Model,StringType,true), StructField(User,StringType,true), StructField(gt,StringType,true), StructField(x,DoubleType,true), StructField(y,DoubleType,true), StructField(z,DoubleType,true))

In [7]:
dataSchema.foreach(println)

StructField(Arrival_Time,LongType,true)
StructField(Creation_Time,LongType,true)
StructField(Device,StringType,true)
StructField(Index,LongType,true)
StructField(Model,StringType,true)
StructField(User,StringType,true)
StructField(gt,StringType,true)
StructField(x,DoubleType,true)
StructField(y,DoubleType,true)
StructField(z,DoubleType,true)


In [8]:
static.show(3)

+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
| Arrival_Time|      Creation_Time|  Device|Index| Model|User|   gt|           x|           y|           z|
+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
|1424686735090|1424686733090638193|nexus4_1|   18|nexus4|   g|stand| 3.356934E-4|-5.645752E-4|-0.018814087|
|1424686735292|1424688581345918092|nexus4_2|   66|nexus4|   g|stand|-0.005722046| 0.029083252| 0.005569458|
|1424686735500|1424686733498505625|nexus4_1|   99|nexus4|   g|stand|   0.0078125|-0.017654419| 0.010025024|
+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
only showing top 3 rows



# Input Stream

In [9]:
val streaming = spark
    .readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .json(PROTOCOL + DATA_DIR + "/activity-data")

streaming = [Arrival_Time: bigint, Creation_Time: bigint ... 8 more fields]


[Arrival_Time: bigint, Creation_Time: bigint ... 8 more fields]

# Ouput Stream

In [10]:
val deviceModelStats = streaming
    .cube("gt", "model").avg()
    .drop("avg(Arrival_time)")
    .drop("avg(Creation_Time)")
    .drop("avg(Index)")
    .writeStream
    .queryName("device_counts")
    .format("memory")
    .outputMode("complete")
    .trigger(Trigger.ProcessingTime(1000))
    .start()

deviceModelStats = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@3fa2d26c


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@3fa2d26c

In [11]:
spark.streams.active

Array(org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@3fa2d26c)

In [12]:
while(deviceModelStats.isActive){
    spark.sql("SELECT * FROM device_counts").show(false)
    Thread.sleep(3000)
}

+---+-----+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|
+---+-----+------+------+------+
+---+-----+------+------+------+

+---+-----+------+------+------+
| gt|model|avg(x)|avg(y)|avg(z)|
+---+-----+------+------+------+
+---+-----+------+------+------+

+----------+------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+
|      null|nexus4|1.966600070764429...|-0.00625396643444789|-0.01041804405820...|
|stairsdown|nexus4|0.019154391360635455|-0.03176165816067268| 0.11931218905160704|
|     stand|nexus4|-8.48276359097096...|1.633235242677092...|-2.53727045751175...|
|      bike|  null|0.021842310709212158|-0.00921752557466...|-0.08129998643162427|
|      bike|nexus4|0.021842310709212158|-0.00921752557466...|-0.08129998643162427|
|      null|  null|1.966600070764429...|-0.00625396643444789|-0.010418

+----------+------+--------------------+--------------------+--------------------+
|        gt| model|              avg(x)|              avg(y)|              avg(z)|
+----------+------+--------------------+--------------------+--------------------+
|      null|nexus4|3.039784181153491...|-0.00627429121078...|-0.01007703072928...|
|stairsdown|nexus4| 0.02134421704386846|-0.03347021100214...| 0.12025228147479891|
|     stand|nexus4|-2.33770092167197...|2.221805099707795...|2.148314301834639E-5|
|      bike|  null|0.022021064927427263|-0.00857823575364...|-0.08239714732207502|
|      bike|nexus4|0.022021064927427263|-0.00857823575364...|-0.08239714732207502|
|      null|  null|3.039784181153491...|-0.00627429121078...|-0.01007703072928...|
|stairsdown|  null| 0.02134421704386846|-0.03347021100214...| 0.12025228147479891|
|       sit|  null|-5.23942285199871E-4|1.952689407138996E-4|-2.53405830516196...|
|     stand|  null|-2.33770092167197...|2.221805099707795...|2.148314301834641...|
|   

Name: java.lang.InterruptedException
Message: sleep interrupted
StackTrace:   at java.lang.Thread.sleep(Native Method)

## Wait for Streaming termination

In [13]:
// Must specify to wait for the termination of the query using activityQuery.awaitTermination() 
// to prevent the driver process from exiting while the query is active.
// This function will block the thread.
deviceModelStats.awaitTermination()

lastException = null


Name: org.apache.spark.sql.streaming.StreamingQueryException
Message: Query device_counts [id = 1e90c9ce-db90-4200-9851-d1540e183c89, runId = b9ae7752-d3e1-4e06-82a5-3163cc705496] terminated with exception: Job 61 cancelled because SparkContext was shut down
StackTrace:   at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:297)
  at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:193)
Caused by: org.apache.spark.SparkException: Job 61 cancelled because SparkContext was shut down
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:932)
  at org.apache.spark.scheduler.DAGScheduler$$anonfun$cleanUpAfterSchedulerStop$1.apply(DAGScheduler.scala:930)
  at scala.collection.mutable.HashSet.foreach(HashSet.scala:78)
  at org.apache.spark.scheduler.DAGScheduler.cleanUpAfterSchedulerStop(DAGScheduler.sca