#Real-Time Anomaly Detection Using Continuous Processing
This notebook uses the exported M2 Model by Spark Streaming and combines it with a Continuous Processing job in Structured Streaming to deliver real-time anomaly detection on the raw data stream.

## Common settings

In [ ]:
val topic = "sensor-raw"
val kafkaBootstrapServer = "172.17.0.2:9092"
val threshold = 4.0
val targetDir = "/tmp/anomaly/model"

topic: String = sensor-raw
kafkaBootstrapServer: String = 172.17.0.2:9092
threshold: Double = 4.0
targetDir: String = /tmp/anomaly/model


## We locate the most recent model in the target directory

In [ ]:
import java.io.File
import scala.collection.JavaConverters._
val availableFiles = new File(targetDir).listFiles.map(_.getName).filter(_.endsWith(".json"))
val mostRecent = availableFiles.sortBy{f => 
                   val timestamp = f.split("-").last.dropRight(".json".length)
                   -timestamp.toLong
                  }.head


import java.io.File
import scala.collection.JavaConverters._
availableFiles: Array[String] = Array(sensors-m2-1536725530000.json, sensors-m2-1536724670000.json, sensors-m2-1536723670000.json, sensors-m2-1536725890000.json, sensors-m2-1536722670000.json, sensors-m2-1536722560000.json, sensors-m2-1536723280000.json, sensors-m2-1536722920000.json, sensors-m2-1536725720000.json, sensors-m2-1536723470000.json, sensors-m2-1536725970000.json, sensors-m2-1536724500000.json, sensors-m2-1536723720000.json, sensors-m2-1536725860000.json, sensors-m2-1536723020000.json, sensors-m2-1536724860000.json, sensors-m2-1536724510000.json, sensors-m2-1536724080000.json, sensors-m2-1536725040000.json, sensors-m2-1536723340000.json, sensors-m2-1536722820000.json, sensors-m2-1536723110000.json, sensors-m2-15367...

## Case class and Schema definitions

In [ ]:
case class M2(n:Int, mean: Double, m2:Double) {
  def variance: Option[Double] = {
    if (n<2) None else Some(m2/(n-1))
  }
  def stdev: Option[Double] = variance.map(Math.sqrt)
}
case class IdM2(id:String, m2: M2)
case class SensorData(id: String, ts: Long, temp: Double, hum: Double)

defined class M2
defined class IdM2
defined class SensorData


In [ ]:
import org.apache.spark.sql.Encoders
val idM2Schema = Encoders.product[IdM2].schema
val sensorSchema = Encoders.product[SensorData].schema

import org.apache.spark.sql.Encoders
idM2Schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,StringType,true), StructField(m2,StructType(StructField(n,IntegerType,false), StructField(mean,DoubleType,false), StructField(m2,DoubleType,false)),true))
sensorSchema: org.apache.spark.sql.types.StructType = StructType(StructField(id,StringType,true), StructField(ts,LongType,false), StructField(temp,DoubleType,false), StructField(hum,DoubleType,false))


## Parse the JSON Data

In [ ]:
val mostRecentM2JsonModel = sparkSession.read.schema(idM2Schema).json(s"$targetDir/$mostRecent").as[IdM2]


mostRecentM2JsonModel: org.apache.spark.sql.Dataset[IdM2] = [id: string, m2: struct<n: int, mean: double ... 1 more field>]


In [ ]:
val m2Map = mostRecentM2JsonModel.collect.map(idM2=> (idM2.id, idM2.m2)).toMap

m2Map: scala.collection.immutable.Map[String,M2] = Map(sim-614 -> M2(373,22.347265415549593,123.51141072386125), sim-183 -> M2(373,22.398713136729228,104.37838230562983), sim-129 -> M2(373,22.61804289544236,156.14367131367314), sim-486 -> M2(373,26.232922252010734,505.6961147453083), sim-189 -> M2(373,10.730911528150138,174.89929008042895), sim-352 -> M2(373,13.18957104557641,86.40493136729171), sim-674 -> M2(373,28.456005361930302,460.75154798927633), sim-358 -> M2(373,22.31747989276138,106.62643109919601), sim-392 -> M2(373,22.57742627345844,111.48352922252042), sim-849 -> M2(373,22.750723860589794,107.1187045576406), sim-178 -> M2(373,23.37080428954424,184.38375871313767), sim-875 -> M2(373,13.945040214477206,95.99332439678311), sim-516 -> M2(373,22.43932975871313,107.5289324396787),...

## Read the Raw Stream from Kafka

In [ ]:
val rawData = sparkSession.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", kafkaBootstrapServer)
      .option("subscribe", topic)
      .option("startingOffsets", "latest")
      .load()

rawData: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]


In [ ]:
val rawValues = rawData.selectExpr("CAST(value AS STRING)").as[String]
val jsonValues = rawValues.select(from_json($"value", sensorSchema) as "record")
val sensorData = jsonValues.select("record.*").as[SensorData]

rawValues: org.apache.spark.sql.Dataset[String] = [value: string]
jsonValues: org.apache.spark.sql.DataFrame = [record: struct<id: string, ts: bigint ... 2 more fields>]
sensorData: org.apache.spark.sql.Dataset[SensorData] = [id: string, ts: bigint ... 2 more fields]


## Apply the scoring process

In [ ]:
val scoreStream = sensorData.flatMap{case SensorData(id, ts, temp, hum) => 
                                     val m2Opt = m2Map.get(id)
                                     m2Opt.map{m2 => (id, ts, temp, m2.mean, m2.stdev)}
                                    }.toDF("id", "ts","temp","mean","std")
                                 

scoreStream: org.apache.spark.sql.DataFrame = [id: string, ts: bigint ... 3 more fields]


In [ ]:
import org.apache.spark.sql.functions._
val suspects = scoreStream.where($"temp" > $"mean"+$"std"*threshold or $"temp" < $"mean"-$"std"*threshold)

import org.apache.spark.sql.functions._
suspects: org.apache.spark.sql.Dataset[org.apache.spark.sql.Row] = [id: string, ts: bigint ... 3 more fields]


In [ ]:
import org.apache.spark.sql.streaming.Trigger
val query = suspects.writeStream
                    .format("memory")
                    .queryName("continuousStreamDetection")
                    .trigger(Trigger.Continuous("1 second"))
                    .start() 

import org.apache.spark.sql.streaming.Trigger
query: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@6b8ed1e6


In [ ]:
query.stop

## Let's do some Visualization

In [ ]:
val dummy = Seq(("id", 0.0))

val chart = CustomPlotlyChart(dummy,
                  layout=s"{title: 'sensor anomaly indicator'}",
                  dataOptions="""{type: 'bar'}""",
                  dataSources="{x: '_1', y: '_2' }")
chart

dummy: Seq[(String, Double)] = List((id,0.0))
chart: notebook.front.widgets.charts.CustomPlotlyChart[Seq[(String, Double)]] = <CustomPlotlyChart widget>
res12: notebook.front.widgets.charts.CustomPlotlyChart[Seq[(String, Double)]] = <CustomPlotlyChart widget>


In [ ]:
@volatile var running = true

running: Boolean = true


In [ ]:
import scala.concurrent.duration._
import scala.annotation.tailrec

val updater = new Thread() {
  @tailrec
  def visualize(): Unit = {
    val currentTimeThreshold = System.currentTimeMillis - 5*1000
    val data = sparkSession.sql(s"select id, temp from continuousStreamDetection where ts > $currentTimeThreshold")
                           .as[(String, Double)]
                           .collect
    if (data.nonEmpty) chart.applyOn(data)
    if (running) {
      Thread.sleep(1.second.toMillis)
      visualize()
    } else ()
  } 
  
  override def run() {
    visualize()
  }
}.start()


import scala.concurrent.duration._
import scala.annotation.tailrec
updater: Unit = ()


In [ ]:
// execute to stop the chart updating thread
running = false

running: Boolean = false


# -- o --

In [ ]:
memTable.where($"id" === "office")

res150: org.apache.spark.sql.Dataset[(String, Long, Double)] = [id: string, ts: bigint ... 1 more field]
