# Sensor Trend Tracker

In this notebook we quickly explore some specific aspects of multi-level state management in Spark Streaming

By combining local and distributed state, we implement a simple sensor trend tracker that can help us identify and report anomalies.

## Our Streaming dataset will consist of sensor information, containing the sensorId, a timestamp, and a value.
This component is a participant in a streaming pipeline.

It expects to receive moving averages of sensor data in the form of (id, timestamp, value) 

In [ ]:
import org.apache.spark.streaming.Seconds
val topic = "sensor-processed"
val kafkaBootstrapServer = "10.2.2.191:1025"
val threshold = 4.0
val interval = Seconds(10) // seconds

import org.apache.spark.streaming.Seconds
topic: String = sensor-processed
kafkaBootstrapServer: String = 10.2.2.191:1025
threshold: Double = 3.0
interval: org.apache.spark.streaming.Duration = 10000 ms


# Create a Streaming Standard Deviation Model
Based on https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance

In [ ]:
case class M2(n:Int, mean: Double, m2:Double) {
  def variance: Option[Double] = {
    if (n<2) None else Some(m2/(n-1))
  }
  def stdev: Option[Double] = variance.map(Math.sqrt)
  }
  object M2 extends Serializable {
    val Zero = M2(0,0.0,0.0)
  }

defined class M2
defined object M2


In [ ]:
// this needs to be outside of the class b/c of Spark Notebook serialization
var entries:Map[String, M2] = Map.empty

entries: Map[String,M2] = Map()


In [ ]:
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.streaming.dstream.DStream
  
class M2Model() extends Serializable {
  
  def trainOn(dstream: DStream[(String, Double)]): Unit = {
    dstream.foreachRDD{rdd => 
                       val newEntriesRDD = rdd.map{case (id, x) => 
                                                val current = entries.get(id)
                                                val updated = current.map{case M2(n, mean, m2) => {
                                                  val np = n + 1
                                                  val delta = x - mean
                                                  val meanp = mean + delta/np
                                                  val mp2 = m2 + delta*(x - meanp)
                                                  (id, M2(np, meanp, mp2))
                                                  }
                                                 }.getOrElse(id -> M2.Zero)
                                                 updated
                                               }
                       val newEntries: Array[(String, M2)] = newEntriesRDD.collect
                       entries = entries ++ newEntries
                      }
  }
  def predictOnValues(dstream: DStream[(String, Double)]): DStream[(String, Double, Double, Double)] = {
    for { 
      (id, value) <- dstream
      m2 <- entries.get(id)
      stdev <- m2.stdev
    } yield (id, value, m2.mean, stdev)
  }
}

             (id, value) <- dstream
                            ^
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext
import org.apache.spark.streaming.dstream.DStream
defined class M2Model


## We create our Streaming Context

In [ ]:
import org.apache.spark.streaming.StreamingContext
@transient val streamingContext = new StreamingContext(sparkContext, interval)

import org.apache.spark.streaming.StreamingContext
streamingContext: org.apache.spark.streaming.StreamingContext = org.apache.spark.streaming.StreamingContext@3d4fdcb4


## Our stream source will be a a Direct Kafka Stream


In [ ]:
import org.apache.kafka.clients.consumer.ConsumerRecord
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka._

val kafkaParams = Map[String, String](
  "metadata.broker.list" -> kafkaBootstrapServer,
  "group.id" -> "sensor-tracker-group",
  "auto.offset.reset" -> "largest",
  "enable.auto.commit" -> (false: java.lang.Boolean).toString
)

val topics = Set(topic)
@transient val stream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
     streamingContext, kafkaParams, topics)

// kafka_010 APIs don't work on the Spark Notebook

// @transient val stream = KafkaUtils.createDirectStream[String, String](
//   streamingContext,
//   PreferConsistent,
//   Subscribe[String, String](topics, kafkaParams)
// )



import org.apache.kafka.clients.consumer.ConsumerRecord
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka._
kafkaParams: scala.collection.immutable.Map[String,String] = Map(metadata.broker.list -> 10.2.2.191:1025, group.id -> sensor-tracker-group, auto.offset.reset -> largest, enable.auto.commit -> false)
topics: scala.collection.immutable.Set[String] = Set(sensor-processed)
stream: org.apache.spark.streaming.dstream.InputDStream[(String, String)] = org.apache.spark.streaming.kafka.DirectKafkaInputDStream@69e8a66d


# Providing Schema information for our streaming data
Now that we have a DStream of fresh data processed in a 2-second interval, we can start focusing on the gist of this example.
First, we want to define and apply a schema to the data we are receiving.
In Scala, we can define a schema with a `case class`

In [ ]:
case class SensorData(id: String, timestamp: Long, temp: Double)

defined class SensorData


# Create our Model
We will train an online standard deviation algorithm and use it to score the incoming data.

In [ ]:
val model = new M2Model()

model: M2Model = M2Model@6c24d68d


# Convert the incoming JSON to `SensorData`

In [ ]:
val spark = sparkSession
import spark.implicits._
@transient val sensorDataStream = stream.transform{rdd => 
                                        val jsonData = rdd.map{case (k,v)  => v}
                                        val ds = sparkSession.createDataset(jsonData)
                                        val jsonDF = spark.read.json(ds)
                                        val sensorDataDS = jsonDF.as[SensorData]
                                        sensorDataDS.rdd
                                       }

spark: org.apache.spark.sql.SparkSession = org.apache.spark.sql.SparkSession@50088e2d
import spark.implicits._
sensorDataStream: org.apache.spark.streaming.dstream.DStream[SensorData] = org.apache.spark.streaming.dstream.TransformedDStream@c86ef01


# Prepare the stream to train our model

In [ ]:
@transient val inputData = sensorDataStream.transform {sensorDataRDD =>  sensorDataRDD.map{case SensorData(id,ts,value) => (id, value)}}                                                            

inputData: org.apache.spark.streaming.dstream.DStream[(String, Double)] = org.apache.spark.streaming.dstream.TransformedDStream@58e6ae53


## Use the data to train the model

In [ ]:
model.trainOn(inputData)

## Score the streaming data using the trained model

In [ ]:
@transient val scored = model.predictOnValues(inputData)

scored: org.apache.spark.streaming.dstream.DStream[(String, Double, Double, Double)] = org.apache.spark.streaming.dstream.FlatMappedDStream@62bc6a9e


In [ ]:
val scatterChart = new ScatterChart(Seq((0.0,0.0)))
scatterChart


scatterChart: notebook.front.widgets.charts.ScatterChart[Seq[(Double, Double)]] = <ScatterChart widget>
res20: notebook.front.widgets.charts.ScatterChart[Seq[(Double, Double)]] = <ScatterChart widget>


In [ ]:
scored.foreachRDD{rdd =>
  val data = rdd.collect.map{case (id, value, mean, std) => (value, std)}
  scatterChart.applyOn(data)
}

In [ ]:
val outputBox = ul(20)
outputBox.append(".")
outputBox

outputBox: notebook.front.widgets.HtmlList = <HtmlList widget>
res26: notebook.front.widgets.HtmlList = <HtmlList widget>


In [ ]:
val debugBox = ul(20)
debugBox

debugBox: notebook.front.widgets.HtmlList = <HtmlList widget>
res28: notebook.front.widgets.HtmlList = <HtmlList widget>


In [ ]:
@transient val suspects = scored.filter{case (id, value, mean, std) => (value > mean + std * threshold) || (value < mean - std * threshold) }

suspects: org.apache.spark.streaming.dstream.DStream[(String, Double, Double, Double)] = org.apache.spark.streaming.dstream.FilteredDStream@250687d7


In [ ]:
suspects.foreachRDD{rdd => 
                    val sample = rdd.take(20).map(_.toString)
                    val total = s"total found: ${rdd.count}"
                    outputBox(total +: sample)
                    
                   }                  

In [ ]:
inputData.foreachRDD{rdd => 
                    val sample = rdd.take(20).map(_.toString)
                    debugBox.appendAll(sample)
                   } 

In [ ]:
streamingContext.start()

In [ ]:
// Be careful not to stop the context if you want the streaming process to continue
streamingContext.stop(false)

In [ ]:
entries("dth-001").stdev

res37: Option[Double] = Some(0.1356008026337026)


In [ ]:
entries("dth-001")

res39: M2 = M2(19,21.931393522267214,0.33132288867019494)
