# Chapter 21 - Streaming 
Read from a file stream. 
readStream.option("maxFilesPerTrigger", 1) simulates record streaming.

# Setup

In [21]:
import org.apache.spark.mllib.linalg._
import org.apache.spark.mllib.linalg.distributed._
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row

import org.apache.spark.sql.expressions._
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.sql.SaveMode
import org.apache.spark.sql.DataFrame

import org.apache.spark.sql.streaming.Trigger

import java.time.temporal.ChronoUnit
import java.time.{Period, LocalDate, Instant}
import java.sql.Timestamp

lastException: Throwable = null


In [22]:
%%html
<!-- To left align the HTML components in Markdown -->
<style>
table {float:left}
</style>

### Spark parition control based on core availability

In [23]:
val NUM_CORES = 4
val NUM_PARTITIONS = 3

lazy val spark: SparkSession = SparkSession.builder()
    .appName("streaming")
    .getOrCreate()

spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
/*
spark.conf.set("spark.sql.shuffle.partitions", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.default.parallelism", NUM_CORES * NUM_PARTITIONS)
spark.conf.set("spark.driver.memory", "6g")
spark.conf.set("spark.executor.memory", "2g")
spark.conf.set("spark.master", "spark://oonisim:7077")
*/
import spark.implicits._

NUM_CORES = 4
NUM_PARTITIONS = 3
spark = <lazy>


<lazy>

In [24]:
val configMap = spark.conf.getAll.foreach(println)

(spark.serializer,org.apache.spark.serializer.KryoSerializer)
(spark.driver.host,172.17.0.1)
(spark.eventLog.enabled,true)
(spark.driver.port,44625)
(spark.hadoop.validateOutputSpecs,True)
(spark.repl.class.uri,spark://172.17.0.1:44625/classes)
(spark.jars,file:/home/oonisim/.local/share/jupyter/kernels/apache_toree_scala/lib/toree-assembly-0.3.0-incubating.jar)
(spark.repl.class.outputDir,/tmp/spark-b5225f8c-4c7b-4e59-a90f-12a2ddd79937/repl-023d4f8e-18d1-4dcd-a330-6257d3dd9e4e)
(spark.app.name,streaming)
(spark.driver.memory,3g)
(spark.executor.instances,2)
(spark.history.fs.logdirectory,hdfs://oonisim:8020/logs_spark)
(spark.default.parallelism,12)
(spark.executor.id,driver)
(spark.ui.proxyBase,/proxy/application_1576122573478_0014)
(spark.submit.deployMode,client)
(spark.master,yarn)
(spark.ui.filters,org.apache.hadoop.yarn.server.webproxy.amfilter.AmIpFilter)
(spark.executor.memory,4g)
(spark.eventLog.dir,hdfs://oonisim:8020/logs_spark)
(spark.executor.cores,4)
(spark.driver.appUIA

configMap: Unit = ()


## Constants

# Tools

### Elapsed time profiler

In [25]:
import scala.collection.mutable.ListBuffer

val timing = new StringBuffer
val times = new ListBuffer[Long]()

def clear(): Unit = {
    timing.setLength(0)
    times.clear
}
def average(): Long = {
    times.reduce(_+_) / times.length
}

/**
@param label Description about the run
@code code to execute
@return execution
*/
def _timed[T](label: String, code: => T): T = {
    val start = System.currentTimeMillis()
    val result = code
    val stop = System.currentTimeMillis()
    timing.append(s"Processing $label took ${stop - start} ms.\n")
    times.append(stop - start)
    result
}

val timed = _timed _

timing = 
times = ListBuffer()
timed = > Nothing) => Nothing = <function2>


clear: ()Unit
average: ()Long
_timed: [T](label: String, code: => T)T


> Nothing) => Nothing = <function2>

In [26]:
// To flush out error: missing argument list for method timed
println("")

<console>:86: error: missing argument list for method _timed
Unapplied methods are only converted to functions when a function type is expected.
You can make this conversion explicit by writing `_timed _` or `_timed(_,_)` instead of `_timed`.
       _timed
       ^
lastException: Throwable = null


### Save to file

In [27]:
def save(df: DataFrame) = {
    df.coalesce(1)
    .write
    .format("csv")
    .mode(SaveMode.Overwrite)
    .option("header", "true")
    .save(RESULT_DIR)
}

Name: Unknown Error
Message: <console>:89: error: not found: value RESULT_DIR
           .save(RESULT_DIR)
                 ^

StackTrace: 

# Constant

In [28]:
val PROTOCOL="file://"
val DATA_DIR="/home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe"

PROTOCOL = file://
DATA_DIR = /home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe


/home/oonisim/home/repositories/git/oonisim/spark-programs/Dataframe

# Main

## Schema from Dataframe
Retrieve the data schema from DataFrame.

Structured Streaming does not let you perform schema inference without explicitly enabling it. You can enable schema inference for this by setting the configuration spark.sql.streaming.schemaInference to true. 

In [29]:
val static = spark.read.json(PROTOCOL + DATA_DIR + "/data/activity-data/")
val dataSchema = static.schema

static = [Arrival_Time: bigint, Creation_Time: bigint ... 8 more fields]
dataSchema = StructType(StructField(Arrival_Time,LongType,true), StructField(Creation_Time,LongType,true), StructField(Device,StringType,true), StructField(Index,LongType,true), StructField(Model,StringType,true), StructField(User,StringType,true), StructField(gt,StringType,true), StructField(x,DoubleType,true), StructField(y,DoubleType,true), StructField(z,DoubleType,true))


StructType(StructField(Arrival_Time,LongType,true), StructField(Creation_Time,LongType,true), StructField(Device,StringType,true), StructField(Index,LongType,true), StructField(Model,StringType,true), StructField(User,StringType,true), StructField(gt,StringType,true), StructField(x,DoubleType,true), StructField(y,DoubleType,true), StructField(z,DoubleType,true))

In [30]:
dataSchema.foreach(println)

StructField(Arrival_Time,LongType,true)
StructField(Creation_Time,LongType,true)
StructField(Device,StringType,true)
StructField(Index,LongType,true)
StructField(Model,StringType,true)
StructField(User,StringType,true)
StructField(gt,StringType,true)
StructField(x,DoubleType,true)
StructField(y,DoubleType,true)
StructField(z,DoubleType,true)


In [31]:
static.show(3)

+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
| Arrival_Time|      Creation_Time|  Device|Index| Model|User|   gt|           x|           y|           z|
+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
|1424686735090|1424686733090638193|nexus4_1|   18|nexus4|   g|stand| 3.356934E-4|-5.645752E-4|-0.018814087|
|1424686735292|1424688581345918092|nexus4_2|   66|nexus4|   g|stand|-0.005722046| 0.029083252| 0.005569458|
|1424686735500|1424686733498505625|nexus4_1|   99|nexus4|   g|stand|   0.0078125|-0.017654419| 0.010025024|
+-------------+-------------------+--------+-----+------+----+-----+------------+------------+------------+
only showing top 3 rows



# Stream

In [32]:
val streaming = spark
    .readStream
    .schema(dataSchema)
    .option("maxFilesPerTrigger", 1)
    .json(PROTOCOL + DATA_DIR + "/data/activity-data")

streaming = [Arrival_Time: bigint, Creation_Time: bigint ... 8 more fields]


[Arrival_Time: bigint, Creation_Time: bigint ... 8 more fields]

In [33]:
// Transformation only. No action yet
val activityCounts = streaming
    .groupBy("gt")
    .count()

activityCounts = [gt: string, count: bigint]


[gt: string, count: bigint]

## Memory sink
write to a memory sink which keeps an in-memory table of the results.

In [34]:
val activityQuery = activityCounts
    .writeStream
    .queryName("activity_counts")
    .format("memory")
    .outputMode("complete")
    .trigger(Trigger.ProcessingTime(1000))
    .start()


/* This does not work.
val activityQuery = activityCounts
    .writeStream
    .queryName("activity_counts")
    .format("memory")
    .outputMode("complete")
    .start()
*/

activityQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@3febf919


org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@3febf919

In [35]:
spark.streams.active

Array(org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@3febf919)

In [36]:
while(activityQuery.isActive){
    spark.sql("SELECT * FROM activity_counts").show()
    Thread.sleep(3000)
}

+---+-----+
| gt|count|
+---+-----+
+---+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|      null|31341|
|       sit|36932|
|  stairsup|31369|
|stairsdown|28087|
|      bike|32387|
|      walk|39763|
|     stand|34155|
+----------+-----+

+----------+-----+
|        gt|count|
+----------+-----+
|      null|62679|
|       sit|73861|
|  stairsup|62739|
|stairsdown|56173|
|      bike|64775|
|      walk|79529|
|     stand|68312|
+----------+-----+

+----------+------+
|        gt| count|
+----------+------+
|      null| 94019|
|       sit|110784|
|  stairsup| 94119|
|stairsdown| 84267|
|      bike| 97163|
|      walk|119293|
|     stand|102464|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|      null|125361|
|       sit|147710|
|  stairsup|125490|
|stairsdown|112357|
|      bike|129553|
|      walk|159058|
|     stand|136618|
+----------+------+

+----------+------+
|        gt| count|
+----------+------+
|      null|167143|
|       s

## Wait for Streaming termination

In [37]:
// Must specify to wait for the termination of the query using activityQuery.awaitTermination() 
// to prevent the driver process from exiting while the query is active.
// This function will block the thread.
activityQuery.awaitTermination()

Name: org.apache.spark.sql.streaming.StreamingQueryException
Message: Query activity_counts [id = e548fdf8-b721-4e16-b80e-3944342a13af, runId = 38a964ea-fd75-43e5-b36a-5f3e716ed4c8] terminated with exception: SparkContext has been shutdown
StackTrace:   at org.apache.spark.sql.execution.streaming.StreamExecution.org$apache$spark$sql$execution$streaming$StreamExecution$$runStream(StreamExecution.scala:297)
  at org.apache.spark.sql.execution.streaming.StreamExecution$$anon$1.run(StreamExecution.scala:193)
Caused by: java.lang.IllegalStateException: SparkContext has been shutdown
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2053)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2082)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2101)
  at org.apache.spark.SparkContext.runJob(SparkContext.scala:2126)
  at org.apache.spark.rdd.RDD$$anonfun$collect$1.apply(RDD.scala:945)
  at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scal