#Anomaly Detection Using Controlled Streams
This notebook uses the exported M2 Model by Spark Streaming and combines it with Structured Streaming to deliver low-latency anomaly detection on the raw data stream.

## Common settings

In [ ]:
import org.apache.spark.streaming.Seconds

val topic = "sensor-raw"
val modelTopic = "modelTopic"
val anomalyTopic = "anomalyTopic"
val kafkaBootstrapServer = "172.17.0.2:9092"
val threshold = 4.0 // 5% failure rate
val modelRefreshInterval = Seconds(30)

import org.apache.spark.streaming.Seconds
topic: String = sensor-raw
modelTopic: String = modelTopic
anomalyTopic: String = anomalyTopic
kafkaBootstrapServer: String = 172.17.0.2:9092
threshold: Double = 4.0
modelRefreshInterval: org.apache.spark.streaming.Duration = 30000 ms


In [ ]:
:sh rm -rf /tmp/spark/detection/checkpoint


import sys.process._




## Case class and Schema definitions
(we have seen this schema definition already in [sensor-anomaly-detection-model](./sensor-anomaly-detection-model-serving.snb.ipynb))

In [ ]:
case class M2(n:Int, mean: Double, m2:Double) {
  def variance: Option[Double] = {
    if (n<2) None else Some(m2/(n-1))
  }
  def stdev: Option[Double] = variance.map(Math.sqrt)
}
case class IdM2(id:String, m2: M2)
case class SensorData(id: String, ts: Long, value: Double)

defined class M2
defined class IdM2
defined class SensorData


In [ ]:
import org.apache.spark.sql.Encoders
val idM2Schema = Encoders.product[IdM2].schema
val sensorSchema = Encoders.product[SensorData].schema

import org.apache.spark.sql.Encoders
idM2Schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,StringType,true), StructField(m2,StructType(StructField(n,IntegerType,false), StructField(mean,DoubleType,false), StructField(m2,DoubleType,false)),true))
sensorSchema: org.apache.spark.sql.types.StructType = StructType(StructField(id,StringType,true), StructField(ts,LongType,false), StructField(value,DoubleType,false))


## Read the model Stream using Spark Streaming

In [ ]:
import org.apache.spark.streaming.StreamingContext
@transient val streamingContext = new StreamingContext(sparkSession.sparkContext, modelRefreshInterval)

import org.apache.spark.streaming.StreamingContext
streamingContext: org.apache.spark.streaming.StreamingContext = org.apache.spark.streaming.StreamingContext@3095bc91


## Spark Streaming Kafka Source

In [ ]:
import org.apache.kafka.clients.consumer.ConsumerRecord
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka._

val kafkaParams = Map[String, String](
  "metadata.broker.list" -> kafkaBootstrapServer,
  "group.id" -> "model-serving-group",
  "auto.offset.reset" -> "largest"
)

val topics = Set(modelTopic)
// There's a deprecation warning here. The Spark Notebook doesn't support Kafka 0.10 integration yet.
@transient val modelStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
     streamingContext, kafkaParams, topics)


       @transient val modelStream = KafkaUtils.createDirectStream[String, String, StringDecoder, StringDecoder](
                                    ^
import org.apache.kafka.clients.consumer.ConsumerRecord
import kafka.serializer.StringDecoder
import org.apache.spark.streaming.kafka._
kafkaParams: scala.collection.immutable.Map[String,String] = Map(metadata.broker.list -> 172.17.0.2:9092, group.id -> model-serving-group, auto.offset.reset -> largest)
topics: scala.collection.immutable.Set[String] = Set(modelTopic)
modelStream: org.apache.spark.streaming.dstream.InputDStream[(String, String)] = org.apache.spark.streaming.kafka.DirectKafkaInputDStream@5fa0d032


## Structured Streaming Kafka Source

In [ ]:
val rawData = sparkSession.readStream
  .format("kafka")
  .option("kafka.bootstrap.servers", kafkaBootstrapServer)
  .option("subscribe", topic)
  .option("startingOffsets", "latest")
  .load()
val rawValues = rawData.selectExpr("CAST(value AS STRING)").as[String]
val jsonValues = rawValues.select(from_json($"value", sensorSchema) as "record")
val sensorData = jsonValues.select("record.*").as[SensorData]

rawData: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]
rawValues: org.apache.spark.sql.Dataset[String] = [value: string]
jsonValues: org.apache.spark.sql.DataFrame = [record: struct<id: string, ts: bigint ... 1 more field>]
sensorData: org.apache.spark.sql.Dataset[SensorData] = [id: string, ts: bigint ... 1 more field]


In [ ]:
var query: org.apache.spark.sql.streaming.StreamingQuery = _

query: org.apache.spark.sql.streaming.StreamingQuery = null


In [ ]:
@transient val modelBox = ul(10)


modelBox: notebook.front.widgets.HtmlList = <HtmlList widget>


## Start Structured Streaming  Processing using Spark Streaming as Stream Flow


In [ ]:
import org.apache.spark.sql.functions._

// ****************** Spark Streaming *********************
modelStream.foreachRDD{ rdd =>
  if (!rdd.isEmpty) {
    
    // ****************** Spark Streaming *********************
    
    // Extract the new model parameters received through Kafka
    val models = rdd.map{case (k,v) => v}.toDF("value")
    val mostRecentM2JsonModel = models.select(from_json($"value", idM2Schema) as "record")
    val mostRecentM2Model = mostRecentM2JsonModel.select("record.*").as[IdM2]
    val m2Map = mostRecentM2Model.collect.map(idM2=> (idM2.id, idM2.m2)).toMap
    modelBox.appendAll(mostRecentM2Model.take(10).map(_.toString))
    
    
    // ****************** Structured Streaming *****************
    
    // create a stream of scored data
    val scoreStream = sensorData.flatMap{case SensorData(id, ts, value) => 
                                     val m2Opt = m2Map.get(id)
                                     m2Opt.map{m2 => (id, ts, value, m2.mean, m2.stdev)}
                                    }.toDF("id", "ts","value","mean","stdev")
    
    // filter suspected anomalies
    val anomalies = scoreStream.where($"value" > ($"mean"+$"stdev"*threshold))
                               .select($"id" as "key", to_json(struct($"id",$"ts", $"value", $"stdev")) as "value" )
    
    if (query != null) {
      query.stop()
    }
    
    // write the data back to Kafka
    query = anomalies.writeStream
      .format("kafka")
      .queryName("anomalyStreamProducer")
      //.trigger(Trigger.Continuous("10 second"))
      .outputMode("append") 
      .option("kafka.bootstrap.servers", kafkaBootstrapServer)
      .option("topic", anomalyTopic)
      .option("checkpointLocation", "/tmp/spark/detection/checkpoint")
      .option("failOnDataLoss", "false")
    .start()
    
  }
}

    

import org.apache.spark.sql.functions._


In [ ]:
streamingContext.start()

In [ ]:
// Data outout widget
modelBox

res15: notebook.front.widgets.HtmlList = <HtmlList widget>


In [ ]:
// streamingContext.stop(false)

In [ ]:
// uncomment to debug progress
query.lastProgress

java.lang.NullPointerException
  ... 69 elided
