#Real-Time Anomaly Detection Using Continuous Processing
This notebook uses the exported M2 Model by Spark Streaming and combines it with a Continuous Processing job in Structured Streaming to deliver real-time anomaly detection on the raw data stream.

## Common settings

In [ ]:
import org.apache.spark.streaming.Duration

import org.apache.spark.streaming.Duration


In [ ]:
val topic = "sensor-raw"
val modelTopic = "modelTopic"
val anomalyTopic = "anomalyTopic"
val kafkaBootstrapServer = "172.17.0.2:9092"
val threshold = 4.0
val targetDir = "/tmp/anomaly/model"
val interval = Duration(30000)

topic: String = sensor-raw
modelTopic: String = modelTopic
kafkaBootstrapServer: String = 172.17.0.2:9092
threshold: Double = 4.0
targetDir: String = /tmp/anomaly/model
interval: org.apache.spark.streaming.Duration = 30000 ms


## Case class and Schema definitions

In [ ]:
case class M2(n:Int, mean: Double, m2:Double) {
  def variance: Option[Double] = {
    if (n<2) None else Some(m2/(n-1))
  }
  def stdev: Option[Double] = variance.map(Math.sqrt)
}
case class IdM2(id:String, m2: M2)
case class SensorData(id: String, ts: Long, temp: Double, hum: Double)

defined class M2
defined class IdM2
defined class SensorData


In [ ]:
import org.apache.spark.sql.Encoders
val idM2Schema = Encoders.product[IdM2].schema
val sensorSchema = Encoders.product[SensorData].schema

import org.apache.spark.sql.Encoders
idM2Schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,StringType,true), StructField(m2,StructType(StructField(n,IntegerType,false), StructField(mean,DoubleType,false), StructField(m2,DoubleType,false)),true))
sensorSchema: org.apache.spark.sql.types.StructType = StructType(StructField(id,StringType,true), StructField(ts,LongType,false), StructField(temp,DoubleType,false), StructField(hum,DoubleType,false))


## Read the model Stream from Kafka using Spark Streaming

In [ ]:
import org.apache.spark.streaming.StreamingContext
@transient val streamingContext = new StreamingContext(sparkSession.sparkContext, interval)

import org.apache.spark.streaming.StreamingContext
streamingContext: org.apache.spark.streaming.StreamingContext = org.apache.spark.streaming.StreamingContext@4416af39


In [ ]:
import org.apache.kafka.clients.consumer.ConsumerRecord
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka._

val kafkaParams = Map[String, String](
  "metadata.broker.list" -> kafkaBootstrapServer,
  "group.id" -> "model-serving-group",
  "auto.offset.reset" -> "largest"
)

val topics = Set(modelTopic)
@transient val modelStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
     streamingContext, kafkaParams, topics)

       @transient val modelStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
                                    ^
import org.apache.kafka.clients.consumer.ConsumerRecord
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka._
kafkaParams: scala.collection.immutable.Map[String,String] = Map(metadata.broker.list -> 172.17.0.2:9092, group.id -> model-serving-group, auto.offset.reset -> largest)
topics: scala.collection.immutable.Set[String] = Set(modelTopic)
modelStream: org.apache.spark.streaming.dstream.InputDStream[(String, String)] = org.apache.spark.streaming.kafka.DirectKafkaInputDStream@53f2ab73


In [ ]:
var query: org.apache.spark.sql.DataFrame = _

rawData: org.apache.spark.sql.DataFrame = null


In [ ]:
import org.apache.spark.sql.functions._
modelStream.foreachRDD{ rdd =>
  if (rdd.nonEmpty) {
    val models = rdd.map{case (k,v) => v}.toDF("value")
    val mostRecentM2JsonModel = models.select(from_json($"value", idM2Schema)).as[IdM2]
    val m2Map = mostRecentM2JsonModel.collect.map(idM2=> (idM2.id, idM2.m2)).toMap
    
    if (query != null) {
      query.stop()
    }
    val rawData = sparkSession.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", kafkaBootstrapServer)
      .option("subscribe", topic)
      .option("checkpointDir", "/tmp/checkpoint")
      .option("startingOffsets", "earliest")
      .load()
    val rawValues = rawData.selectExpr("CAST(value AS STRING)").as[String]
    val jsonValues = rawValues.select(from_json($"value", sensorSchema) as "record")
    val sensorData = jsonValues.select("record.*").as[SensorData]
    val scoreStream = sensorData.flatMap{case SensorData(id, ts, temp, hum) => 
                                     val m2Opt = m2Map.get(id)
                                     m2Opt.map{m2 => (id, ts, temp, m2.mean, m2.stdev)}
                                    }.toDF("id", "ts","temp","mean","std")
    
    val anomalies = scoreStream.where($"temp" > $"mean"+$"std"*threshold or $"temp" < $"mean"-$"std"*threshold)
    import org.apache.spark.sql.streaming.Trigger
    val query = anomalies.writeStream
      .format("kafka")
      .queryName("continuousStreamDetection")
      .trigger(Trigger.Continuous("1 second"))
      .outputMode("append") 
      .option("kafka.bootstrap.servers", kafkaBootstrapServer)
      .option("topic", anomalyTopic)
      .option("checkpointLocation", "/tmp/spark/checkpoint-a1")
      .option("failOnDataLoss", "false")
    .start()
  }
}

    

In [ ]:
streamingContext.start()

## Read the Anomaly Stream Back From Kafka

In [ ]:
val rawData = sparkSession.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", kafkaBootstrapServer)
      .option("subscribe", anomalyTopic)
      .option("startingOffsets", "latest")
      .load()

rawData: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]


In [ ]:
case class AnomalyReport(id: String, ts: Long, temp: Double,mean: Double, std:Double)

In [ ]:
val rawValues = rawData.selectExpr("CAST(value AS STRING)").as[String]
val jsonValues = rawValues.select(from_json($"value", sensorSchema) as "record")
val sensorData = jsonValues.select("record.*").as[SensorData]

rawValues: org.apache.spark.sql.Dataset[String] = [value: string]
jsonValues: org.apache.spark.sql.DataFrame = [record: struct<id: string, ts: bigint ... 2 more fields>]
sensorData: org.apache.spark.sql.Dataset[SensorData] = [id: string, ts: bigint ... 2 more fields]


## ----- We have already seen a similar process. Here comes the interesting part -----

## Augment the data with its estimated mean an stdev

In [ ]:
val scoreStream = sensorData.flatMap{case SensorData(id, ts, temp, hum) => 
                                     val m2Opt = m2Map.get(id)
                                     m2Opt.map{m2 => (id, ts, temp, m2.mean, m2.stdev)}
                                    }.toDF("id", "ts","temp","mean","std")
                                 

scoreStream: org.apache.spark.sql.DataFrame = [id: string, ts: bigint ... 3 more fields]


## Filter out outliers suspected to be anomalies

In [ ]:
import org.apache.spark.sql.functions._
val suspects = scoreStream.where($"temp" > $"mean"+$"std"*threshold or $"temp" < $"mean"-$"std"*threshold)

import org.apache.spark.sql.functions._
suspects: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, ts: bigint ... 3 more fields]


In [ ]:
import org.apache.spark.sql.streaming.Trigger
val query = suspects.writeStream
                    .format("memory")
                    .queryName("continuousStreamDetection")
                    .trigger(Trigger.Continuous("1 second"))
                    .start() 

import org.apache.spark.sql.streaming.Trigger
query: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@5be2d995


In [ ]:
//query.stop

## Let's do some Visualization

In [ ]:
val dummy = Seq(("id", 0.0))

val chart = CustomPlotlyChart(dummy,
                  layout=s"{title: 'sensor anomaly indicator'}",
                  dataOptions="""{type: 'bar'}""",
                  dataSources="{x: '_1', y: '_2' }")
chart

dummy: Seq[(String, Double)] = List((id,0.0))
chart: notebook.front.widgets.charts.CustomPlotlyChart[Seq[(String, Double)]] = <CustomPlotlyChart widget>
res15: notebook.front.widgets.charts.CustomPlotlyChart[Seq[(String, Double)]] = <CustomPlotlyChart widget>


In [ ]:
@volatile var running = true

running: Boolean = true


In [ ]:
import scala.concurrent.duration._
import scala.annotation.tailrec

val updater = new Thread() {
  @tailrec
  def visualize(): Unit = {
    val currentTimeThreshold = System.currentTimeMillis - 5*1000
    val data = sparkSession.sql(s"select id, temp from continuousStreamDetection where ts > $currentTimeThreshold")
                           .as[(String, Double)]
                           .collect
    if (data.nonEmpty) chart.applyOn(data)
    if (running) {
      Thread.sleep(1.second.toMillis)
      visualize()
    } else ()
  } 
  
  override def run() {
    visualize()
  }
}.start()


import scala.concurrent.duration._
import scala.annotation.tailrec
updater: Unit = ()


In [ ]:
--

In [ ]:
// execute to stop the chart updating thread
running = false

running: Boolean = false


# -- o --

In [ ]:
memTable.where($"id" === "office")

res150: org.apache.spark.sql.Dataset[(String, Long, Double)] = [id: string, ts: bigint ... 1 more field]
