
#Anomaly Detection Visualization
In this notebook, we consume the anomaly detection topic to display the potentially anomalous sensors.

## Common settings

In [ ]:
import org.apache.spark.streaming.Seconds

val topic = "sensor-raw"
val modelTopic = "modelTopic"
val anomalyTopic = "anomalyTopic"
val kafkaBootstrapServer = "172.17.0.2:9092"
val threshold = 4.0 // 5% failure rate
val modelRefreshInterval = Seconds(30)

import org.apache.spark.streaming.Seconds
topic: String = sensor-raw
modelTopic: String = modelTopic
anomalyTopic: String = anomalyTopic
kafkaBootstrapServer: String = 172.17.0.2:9092
threshold: Double = 4.0
modelRefreshInterval: org.apache.spark.streaming.Duration = 30000 ms


In [ ]:
val anomalyDataStream = sparkSession.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", kafkaBootstrapServer)
      .option("subscribe", anomalyTopic)
      .option("startingOffsets", "latest")
      .load()

anomalyDataStream: org.apache.spark.sql.DataFrame = [key: binary, value: binary ... 5 more fields]


In [ ]:
case class AnomalyReport(id: String, ts: Long, value: Double, stdev: Double)
import org.apache.spark.sql.Encoders
val schema = Encoders.product[AnomalyReport].schema

defined class AnomalyReport
import org.apache.spark.sql.Encoders
schema: org.apache.spark.sql.types.StructType = StructType(StructField(id,StringType,true), StructField(ts,LongType,false), StructField(value,DoubleType,false), StructField(stdev,DoubleType,false))


In [ ]:
val rawValues = anomalyDataStream.selectExpr("CAST(value AS STRING)").as[String]
val jsonValues = rawValues.select(from_json($"value", schema) as "record")
val anomalyData = jsonValues.select("record.*").as[AnomalyReport]

rawValues: org.apache.spark.sql.Dataset[String] = [value: string]
jsonValues: org.apache.spark.sql.DataFrame = [record: struct<id: string, ts: bigint ... 2 more fields>]
anomalyData: org.apache.spark.sql.Dataset[AnomalyReport] = [id: string, ts: bigint ... 2 more fields]


In [ ]:
val memQuery = anomalyData.writeStream
           .format("memory")
           .queryName("anomalyMemReport")
           .outputMode("append")
           .start()

memQuery: org.apache.spark.sql.streaming.StreamingQuery = org.apache.spark.sql.execution.streaming.StreamingQueryWrapper@ff26c82


In [ ]:
val anomalyMemData = sparkSession.sql("select * from anomalyMemReport")

anomalyMemData: org.apache.spark.sql.DataFrame = [id: string, ts: bigint ... 2 more fields]


In [ ]:
anomalyMemData

res20: org.apache.spark.sql.DataFrame = [id: string, ts: bigint ... 2 more fields]


## Anomalies Chart
Let's create a chart of the suspected anomalies

In [ ]:
case class Bubble(id: String, size: Int, value: Double, pos: Int = 0, color: String = "red")

defined class Bubble


In [ ]:
val bubbles = Seq(Bubble("zero",0, 1, 0, "black"), Bubble("zero",1, 1000, 100, "black"))
val bubbleChart = CustomPlotlyChart(bubbles, 
                  layout="{title: 'Anomaly Board', showlegend: false, height: 800, width: 1000}",
                  dataOptions="{mode: 'markers'}",
                  dataSources="{x: 'pos', y: 'value',text: 'id', marker: {size: 'size', color: 'color'}}")

bubbles: Seq[Bubble] = List(Bubble(zero,0,1.0,0,black), Bubble(zero,1,1000.0,100,black))
bubbleChart: notebook.front.widgets.charts.CustomPlotlyChart[Seq[Bubble]] = <CustomPlotlyChart widget>


In [ ]:
@transient var running = true

running: Boolean = true


In [ ]:
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import scala.concurrent.duration._
import scala.annotation.tailrec

val updater = new Thread() {
  @tailrec
  def visualize(): Unit = {
    val lastMinute: Long = System.currentTimeMillis - 1.minute.toMillis
    
    val data = anomalyMemData.select($"id",$"ts".cast(LongType) as "timestamp", $"value", $"stdev")
                   .where($"timestamp" > lastMinute)
                   .orderBy($"timestamp")
    val indexedData = data.withColumn("pos", lit(1)).withColumn("color", lit("red"))
                          .withColumn("size", ($"stdev"*20+50).cast(IntegerType))
    val bubbleData = indexedData.as[Bubble].collect()
                            .groupBy(_.id)
                            .mapValues(bubbles => bubbles.sortBy(b => -b.size).head)
                            .values
                            .toList
                            .sortBy(_.id)
                            .zipWithIndex
                            .map{case (bubble,idx) => bubble.copy(pos=idx)}
    
    if (bubbleData.nonEmpty) bubbleChart.applyOn(bubbleData)
    if (running) {
      Thread.sleep(1.second.toMillis)
      visualize()
    } else ()
  } 
  
  override def run() {
    visualize()
  }
}.start()


import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import scala.concurrent.duration._
import scala.annotation.tailrec
updater: Unit = ()


In [ ]:
bubbleChart

res12: notebook.front.widgets.charts.CustomPlotlyChart[Seq[Bubble]] = <CustomPlotlyChart widget>
