# Consume a Topic to Observe its Data

## Common Definitions
We define a set of parameters that describe our current environment.

In [ ]:
// Name of the Kafka topic this notebook subscribes to.
val sourceTopic: String = "sensor-processed"
// host:port of the Kafka bootstrap server (this address points at a local setup).
val kafkaBootstrapServer: String = "172.17.0.2:9092"

sourceTopic: String = sensor-processed
kafkaBootstrapServer: String = 172.17.0.2:9092


# Read and Visualize a Stream from Kafka

In [ ]:
// Streaming DataFrame backed by the Kafka source: subscribes to `sourceTopic` on
// `kafkaBootstrapServer`, with "startingOffsets" set to "latest".
val rawData = {
  val kafkaOptions = Map(
    "kafka.bootstrap.servers" -> kafkaBootstrapServer,
    "subscribe"               -> sourceTopic,
    "startingOffsets"         -> "latest"
  )
  sparkSession.readStream
    .format("kafka")
    .options(kafkaOptions)
    .load()
}

rawData: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]


In [ ]:
import org.apache.spark.sql.Encoders

/** One record of the stream, declared as a case class.
  *
  * Fields: `id` (String), `ts` (Long) and `value` (Double).
  */
case class SensorData(
  id: String,
  ts: Long,
  value: Double
)

// The same schema as a Spark SQL struct, derived from the case class's product encoder.
val schema: org.apache.spark.sql.types.StructType =
  Encoders.product[SensorData].schema

import org.apache.spark.sql.Encoders
defined class SensorData
schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,StringType,true), StructField(ts,LongType,false), StructField(value,DoubleType,false))


In [ ]:
// `from_json` is defined in Spark's SQL `functions` object. Importing it here keeps this cell
// self-contained instead of relying on an import that only appears in a later cell.
import org.apache.spark.sql.functions.from_json

// The Kafka source exposes `value` as binary, so cast it to a string first.
val rawValues = rawData.selectExpr("CAST(value AS STRING)").as[String]
// Parse each string as JSON according to `schema`; the result is one struct column named "record".
val jsonValues = rawValues.select(from_json($"value", schema) as "record")
// Flatten the struct into top-level columns and type the rows as SensorData.
val sensorData = jsonValues.select("record.*").as[SensorData]

rawValues: org.apache.spark.sql.Dataset[String] = [value: string]
jsonValues: org.apache.spark.sql.DataFrame = [record: struct<id: string, ts: bigint ... 1 more field>]
sensorData: org.apache.spark.sql.Dataset[SensorData] = [id: string, ts: bigint ... 1 more field]


In [ ]:
// Start a streaming query that appends the parsed sensor records to an in-memory table.
// The query name doubles as the SQL table name, so the data is reachable as `visualization`.
val visualizationQuery = {
  val memoryWriter = sensorData.writeStream
    .format("memory")
    .outputMode("append")
    .queryName("visualization")
  memoryWriter.start()
}

visualizationQuery: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@3752c6dd


In [ ]:
// DataFrame over the in-memory `visualization` table. Despite its name, no averaging is
// applied here: it selects every column of the table as-is.
val movingAvgDF: org.apache.spark.sql.DataFrame =
  sparkSession.sql("select * from visualization")

movingAvgDF: org.apache.spark.sql.DataFrame = [id: string, ts: bigint ... 1 more field]


In [ ]:
// Two placeholder points (current time in millis, 0.1) used to construct the chart.
val dummy = Seq.fill(2)((System.currentTimeMillis, 0.1))

// Line chart widget: x is bound to the first tuple element, y to the second.
val chart = CustomPlotlyChart(
  dummy,
  layout = "{title: 'moving average sensor data', xaxis: {title: 'time(seconds)'}, yaxis: {title: 'value'}}",
  dataOptions = "{type: 'line'}",
  dataSources = "{x: '_1', y: '_2'}"
)

dummy: Seq[(Long, Double)] = List((1544093751240,0.1), (1544093751240,0.1))
chart: notebook.front.widgets.charts.CustomPlotlyChart[Seq[(Long, Double)]] = <CustomPlotlyChart widget>


In [ ]:
// Stop flag for the chart-update loop. It is @volatile so that a write from one thread
// (e.g. a notebook cell setting it to false) is visible to the thread polling it.
@volatile var running: Boolean = true

running: Boolean = true


In [ ]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import scala.concurrent.duration._
import scala.annotation.tailrec

// Starts a background thread that, once per second, pulls the recent "office" readings from
// `movingAvgDF` and pushes them to `chart`. The loop ends after `running` becomes false.
// `updater` itself is Unit (the result of `start()`); the thread is controlled only through
// the `running` flag.
val updater = {
  val refreshInterval = 1.second
  val lookback = 5.minutes

  val chartUpdater = new Thread("chart-updater") {
    // Collects the (timestamp, value) pairs of sensor "office" that are newer than the
    // lookback window, ordered by timestamp.
    private def recentReadings(): Array[(Long, Double)] = {
      // NOTE(review): the threshold is in epoch seconds, which assumes `ts` is in seconds — confirm.
      val windowStart: Long = System.currentTimeMillis / 1000 - lookback.toSeconds
      movingAvgDF.select($"ts".cast(LongType) as "timestamp", $"value")
        .where($"timestamp" > windowStart and $"id" === "office")
        .orderBy($"timestamp")
        .as[(Long, Double)]
        .collect()
        .map { case (ts, v) => (ts % 3600, v) } // x value is the timestamp modulo 3600
    }

    // One refresh of the chart, then sleep and repeat while `running` is still true.
    @tailrec
    private def visualize(): Unit = {
      val data = recentReadings()
      // Skip the redraw when no rows matched, so the chart keeps its previous content.
      if (data.nonEmpty) chart.applyOn(data)
      if (running) {
        Thread.sleep(refreshInterval.toMillis)
        visualize()
      }
    }

    override def run(): Unit = visualize()
  }
  // A daemon thread cannot keep the JVM alive if `running` is never set to false.
  chartUpdater.setDaemon(true)
  chartUpdater.start()
}




import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import scala.concurrent.duration._
import scala.annotation.tailrec
updater: Unit = ()


In [ ]:
// Evaluate the chart widget as this cell's result so the notebook shows it.
chart

res10: notebook.front.widgets.charts.CustomPlotlyChart[Seq[(Long, Double)]] = <CustomPlotlyChart widget>


In [ ]:
// Uncomment and run these lines to end the chart-update loop and stop the streaming query:
// running = false
// visualizationQuery.stop()
