#Real-Time Anomaly Detection Using Continuous Processing
This notebook uses the exported M2 Model by Spark Streaming and combines it with a Continuous Processing job in Structured Streaming to deliver real-time anomaly detection on the raw data stream.

## Common settings

In [ ]:
import org.apache.spark.streaming.Duration

import org.apache.spark.streaming.Duration


In [ ]:
val topic = "sensor-raw"
val modelTopic = "modelTopic"
val anomalyTopic = "anomalyTopic"
val kafkaBootstrapServer = "172.17.0.2:9092"
val threshold = 4.0
val targetDir = "/tmp/anomaly/model"
val modelRefreshInterval = Duration(30000)

topic: String = sensor-raw
modelTopic: String = modelTopic
anomalyTopic: String = anomalyTopic
kafkaBootstrapServer: String = 172.17.0.2:9092
threshold: Double = 4.0
targetDir: String = /tmp/anomaly/model
modelRefreshInterval: org.apache.spark.streaming.Duration = 30000 ms


## Case class and Schema definitions
(we have seen this already)

In [ ]:
case class M2(n:Int, mean: Double, m2:Double) {
  def variance: Option[Double] = {
    if (n<2) None else Some(m2/(n-1))
  }
  def stdev: Option[Double] = variance.map(Math.sqrt)
}
case class IdM2(id:String, m2: M2)
case class SensorData(id: String, ts: Long, temp: Double, hum: Double)

defined class M2
defined class IdM2
defined class SensorData


In [ ]:
import org.apache.spark.sql.Encoders
val idM2Schema = Encoders.product[IdM2].schema
val sensorSchema = Encoders.product[SensorData].schema

import org.apache.spark.sql.Encoders
idM2Schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,StringType,true), StructField(m2,StructType(StructField(n,IntegerType,false), StructField(mean,DoubleType,false), StructField(m2,DoubleType,false)),true))
sensorSchema: org.apache.spark.sql.types.StructType = StructType(StructField(id,StringType,true), StructField(ts,LongType,false), StructField(temp,DoubleType,false), StructField(hum,DoubleType,false))


## Read the model Stream using Spark Streaming

In [ ]:
import org.apache.spark.streaming.StreamingContext
@transient val streamingContext = new StreamingContext(sparkSession.sparkContext, modelRefreshInterval)

import org.apache.spark.streaming.StreamingContext
streamingContext: org.apache.spark.streaming.StreamingContext = org.apache.spark.streaming.StreamingContext@2b97d30a


In [ ]:
import org.apache.kafka.clients.consumer.ConsumerRecord
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka._

val kafkaParams = Map[String, String](
  "metadata.broker.list" -> kafkaBootstrapServer,
  "group.id" -> "model-serving-group",
  "auto.offset.reset" -> "largest"
)

val topics = Set(modelTopic)
@transient val modelStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
     streamingContext, kafkaParams, topics)

       @transient val modelStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
                                    ^
import org.apache.kafka.clients.consumer.ConsumerRecord
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka._
kafkaParams: scala.collection.immutable.Map[String,String] = Map(metadata.broker.list -> 172.17.0.2:9092, group.id -> model-serving-group, auto.offset.reset -> largest)
topics: scala.collection.immutable.Set[String] = Set(modelTopic)
modelStream: org.apache.spark.streaming.dstream.InputDStream[(String, String)] = org.apache.spark.streaming.kafka.DirectKafkaInputDStream@3541ab5c


In [ ]:
var query: org.apache.spark.sql.streaming.StreamingQuery = _

query: org.apache.spark.sql.streaming.StreamingQuery = null


## Start Structured Streaming Continuous Processing using Spark Streaming


In [ ]:
import org.apache.spark.sql.functions._
modelStream.foreachRDD{ rdd =>
  if (!rdd.isEmpty) {
    // Extract the new model parameters
    val models = rdd.map{case (k,v) => v}.toDF("value")
    val mostRecentM2JsonModel = models.select(from_json($"value", idM2Schema) as "record")
    val mostRecentM2Model = mostRecentM2JsonModel.select("record.*").as[IdM2]
    val m2Map = mostRecentM2Model.collect.map(idM2=> (idM2.id, idM2.m2)).toMap
    
    // Stop the continuous query, if running
    if (query != null) {
      query.stop()
    }
    
    // Configure the scoring query with the new model parameters
    val rawData = sparkSession.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", kafkaBootstrapServer)
      .option("subscribe", topic)
      //.option("checkpointDir", "/tmp/model/checkpoint")
      .option("startingOffsets", "earliest")
      .load()
    val rawValues = rawData.selectExpr("CAST(value AS STRING)").as[String]
    val jsonValues = rawValues.select(from_json($"value", sensorSchema) as "record")
    val sensorData = jsonValues.select("record.*").as[SensorData]
    val scoreStream = sensorData.flatMap{case SensorData(id, ts, temp, hum) => 
                                     val m2Opt = m2Map.get(id)
                                     m2Opt.map{m2 => (id, ts, temp, m2.mean, m2.stdev)}
                                    }.toDF("id", "ts","temp","mean","std")
    
    // Apply the stdev model
    val anomalies = scoreStream.where($"temp" > $"mean"+$"std"*threshold)
    .select($"id" as "key", to_json(struct($"id",$"ts", $"temp")) as "value" )
    import org.apache.spark.sql.streaming.Trigger
    
    // write the data back to Kafka
    query = anomalies.writeStream
      .format("kafka")
      .queryName("continuousStreamDetection")
      .trigger(Trigger.Continuous("10 second"))
      .outputMode("append") 
      .option("kafka.bootstrap.servers", kafkaBootstrapServer)
      .option("topic", anomalyTopic)
      .option("checkpointLocation", "/tmp/spark/checkpoint-a1")
      .option("failOnDataLoss", "false")
    .start()
  }
}

    

import org.apache.spark.sql.functions._


In [ ]:
streamingContext.start()

In [ ]:
//streamingContext.stop(false)

## Read the Anomaly Stream Back From Kafka

In [ ]:
val anomalyDataStream = sparkSession.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", kafkaBootstrapServer)
      .option("subscribe", anomalyTopic)
      .option("startingOffsets", "latest")
      .load()

anomalyDataStream: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]


In [ ]:
case class AnomalyReport(id: String, ts: Long, temp: Double)
import org.apache.spark.sql.Encoders
val schema = Encoders.product[AnomalyReport].schema

defined class AnomalyReport
import org.apache.spark.sql.Encoders
schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,StringType,true), StructField(ts,LongType,false), StructField(temp,DoubleType,false))


In [ ]:
val rawValues = anomalyDataStream.selectExpr("CAST(value AS STRING)").as[String]
val jsonValues = rawValues.select(from_json($"value", schema) as "record")
val anomalyData = jsonValues.select("record.*").as[AnomalyReport]

rawValues: org.apache.spark.sql.Dataset[String] = [value: string]
jsonValues: org.apache.spark.sql.DataFrame = [record: struct<id: string, ts: bigint ... 1 more field>]
anomalyData: org.apache.spark.sql.Dataset[AnomalyReport] = [id: string, ts: bigint ... 1 more field]


In [ ]:
import org.apache.spark.sql.types._
val toSeconds = udf((ts:Long) => ts/1000)
val anomalyReport = anomalyData.withColumn("timestamp", toSeconds($"ts").cast(TimestampType))
                                          .withWatermark("timestamp", "0 second")
                                          .groupBy($"id", window($"timestamp", "1 minute", "1 second"))
                                          .agg(count($"id") as "count", max($"temp") as "temp")
                                          

import org.apache.spark.sql.types._
toSeconds: org.apache.spark.sql.expressions.UserDefinedFunction = UserDefinedFunction(<function1>,LongType,Some(List(LongType)))
anomalyReport: org.apache.spark.sql.DataFrame = [id: string, window: struct<start: timestamp, end: timestamp> ... 2 more fields]


In [ ]:
val memQuery = anomalyReport.writeStream
           .format("memory")
           .queryName("anomalyMemReport")
           .outputMode("append")
           .start()

memQuery: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@68468075


## Anomalies Chart

In [ ]:
case class Bubble(id: String, count: Int, temp: Double, pos: Int = 0, color: String = "red")

defined class Bubble


In [ ]:
val bubbles = Seq(Bubble("zero",0, 1, 0, "black"), Bubble("zero",1, 1000, 100, "black")))
val bubbleChart = CustomPlotlyChart(bubbles, 
                  layout="{title: 'Anomaly Board', showlegend: false, height: 1000, width: 1000}",
                  dataOptions="{mode: 'markers'}",
                  dataSources="{x: 'pos', y: 'temp',text: 'id', marker: {size: 'count', color: 'color'}}")

bubbles: Seq[Bubble] = List(Bubble(zero,0,1.0,0,black))
bubbleChart: notebook.front.widgets.charts.CustomPlotlyChart[Seq[Bubble]] = <CustomPlotlyChart widget>


In [ ]:
@volatile var running = true

running: Boolean = true


In [ ]:
bubbleChart

res26: notebook.front.widgets.charts.CustomPlotlyChart[Seq[Bubble]] = <CustomPlotlyChart widget>


In [ ]:
import scala.concurrent.duration._
import scala.annotation.tailrec

val updater = new Thread() {
  @tailrec
  def visualize(): Unit = {
    val data = sparkSession.sql(s"select * from anomalyMemReport")
    val indexedData = data.withColumn("pos", lit(1)).withColumn("color", lit("red")).orderBy($"window.start".desc)
                          .withColumn("count", $"count".cast(IntegerType))
    val bubbleData = indexedData.as[Bubble].take(50).sortBy(_.id).zipWithIndex
        .map{case (bubble,idx) => bubble.copy(pos=idx, count = (bubble.temp * (1+bubble.count/10)).toInt)}
    val filteredBubbleData = bubbleData.groupBy(_.id).mapValues{bubbles => bubbles.sortBy(b => -b.count).head}.values.toList
    
    if (filteredBubbleData.nonEmpty) bubbleChart.applyOn(filteredBubbleData)
    if (running) {
      Thread.sleep(1.second.toMillis)
      visualize()
    } else ()
  } 
  
  override def run() {
    visualize()
  }
}.start()


In [ ]:
--

In [ ]:
// execute to stop the chart updating thread
running = false

running: Boolean = false


# -- o --

In [ ]:
memTable.where($"id" === "office")

res150: org.apache.spark.sql.Dataset[(String, Long, Double)] = [id: string, ts: bigint ... 1 more field]
