In [0]:
#dbutils.library.installPyPI("mlflow")
#dbutils.library.restartPython()
from pyspark.sql.functions import from_json, col, approx_count_distinct, last, window, to_timestamp, to_date, count
from pyspark.sql.types import StructType, ArrayType, StringType, FloatType
from datetime import datetime, timedelta, timezone
import mlflow

In [0]:
# Load the registered MLflow model as a Spark UDF so it can score DataFrame columns.
#
# The model is addressed by name plus the 'Production' label rather than by an
# explicit version number. An explicit version could instead be looked up with
# mlflow.tracking.MlflowClient().search_model_versions("name='<model name>'").
mlflow_model_name = 'iForest'
version = 'Production'

model_uri = 'models:/{name}/{target}'.format(name=mlflow_model_name, target=version)
udf = mlflow.pyfunc.spark_udf(spark, model_uri)

In [0]:
# Streaming source: read the Delta table "raw_log_data_delta_PN_" as a stream.
# `spark` is not defined in this notebook; presumably it is the session injected
# by the notebook runtime.
raw_data = spark.readStream.format("delta").table("raw_log_data_delta_PN_")

# Schema handed to from_json() when parsing the JSON `event` column.
#
# All fields added to `fullschema` below are TOP-LEVEL fields. `event` and
# `object` are declared as empty structs, and `actor` / `audit` sit next to
# them (not inside `event`), which is why they are addressed downstream as
# `data.actor.*` and `data.audit.*`.
#
# NOTE(review): `logger_event_id`, `account_id` and `timestamp` are each added
# twice at the top level. The order and duplicates are kept exactly as they
# were; confirm whether the repeats are intentional.

# Who performed the event.
actor_schema = (
    StructType()
    .add("user_id", "string")
    .add("ip_addresses", ArrayType(StringType()))
    .add("session_id", "string")
    .add("impersonated_user_id", "string")
    .add("id", "string")
    .add("type", "string")
    .add("user_agent", "string")
)

# Audit details of the event.
audit_schema = (
    StructType()
    .add("severity", "string")
    .add("resource_id", "string")
    .add("operation", "string")
    .add("timestamp", "timestamp")
    .add("status", "string")
)

fullschema = (
    StructType()
    .add("logger_event_id", "string")
    .add("logger_timestamp", "timestamp")
    .add("account_id", "string")
    .add("agent_id", "string")
    .add("event", StructType())    # empty struct: no sub-fields declared
    .add("actor", actor_schema)
    .add("account_id", "string")
    .add("event_type", "string")
    .add("audit", audit_schema)
    .add("logger_event_id", "string")
    .add("object", StructType())   # empty struct: no sub-fields declared
    .add("timestamp", "timestamp")
    .add("timestamp", "timestamp")
)

In [0]:
# Build the per-day, per-actor feature stream and score it with the model UDF.
#   1. Parse the JSON `event` column with `fullschema`.
#   2. Reduce `timestamp` to its date (midnight) so all events of one day share a key.
#   3. Keep only `contact_read` operations that carry an actor id.
#   4. Aggregate per (day, actor id) and append the model prediction.
filtered_data = (
    raw_data
    .select("timestamp", from_json("event", fullschema).alias("data"))
    # Day-of-month is "dd"; the previous pattern used "DD", which means
    # day-of-year in Spark's datetime patterns.
    .withColumn("timestamp", to_timestamp(to_date("timestamp", "yyyy-MM-dd"), "yyyy-MM-dd"))
    .select(
        "timestamp",
        "data.actor.id",
        "data.actor.ip_addresses",
        "data.actor.user_agent",
        "data.audit.operation",
    )
    .where(col("id").isNotNull())
    # Filter on the projected `operation` column. `data` is no longer part of
    # the projection at this point, so referring to `data.audit.operation`
    # only worked through the analyzer's missing-reference recovery.
    .where(col("operation") == "contact_read")
    # The event-time column must be part of the grouping key for the
    # aggregation to be written in append mode.
    .withWatermark("timestamp", "1 minute")
    .groupBy(col("timestamp"), "id")
    .agg(
        # NOTE(review): `ip_addresses` is an array column, so this counts
        # distinct arrays rather than distinct individual addresses. Confirm
        # this matches the features the model was trained on.
        approx_count_distinct("ip_addresses").alias("nr_ip_addresses"),
        approx_count_distinct("user_agent").alias("nr_user_agents"),
        count("operation").alias("nr_contact_reads"),
    )
    # Feature order passed to the model: ip addresses, user agents, contact reads.
    .withColumn("prediction", udf("nr_ip_addresses", "nr_user_agents", "nr_contact_reads"))
)

In [0]:
# Show the aggregated, scored rows for inspection. The table that follows is
# the captured output: one row per (day, actor id) with its feature counts and
# the model prediction.
display(filtered_data)

timestamp,id,nr_ip_addresses,nr_user_agents,nr_contact_reads,prediction
2020-08-17T00:00:00.000+0000,5e31c5158d8675000c0bdd45,1,1,1,1.0
2020-08-21T00:00:00.000+0000,5ed1400323b4053aabf20e8c,3,1,36,1.0
2020-08-21T00:00:00.000+0000,5ed043929a90b842e44813aa,3,1,6,1.0
2020-08-18T00:00:00.000+0000,5f0c1df2bbd1357705b055df,3,3,12,1.0
2020-08-17T00:00:00.000+0000,5eb463da4bb3c5000c0fb1bd,3,1,20,1.0
2020-08-19T00:00:00.000+0000,5e1408043c5da30009fac934,3,2,25,1.0
2020-08-24T00:00:00.000+0000,5df117fa8b56d9000e8f1fc8,2,1,2,1.0
2020-08-31T00:00:00.000+0000,5efa582d1098707f3ae8f714,3,2,29,1.0
2020-08-27T00:00:00.000+0000,5da4c126d11afb00090b2f5b,2,1,2,1.0
2020-08-20T00:00:00.000+0000,5c7eda9ee08fa40008be7ef7,4,1,14,1.0


In [0]:
#%sql
#DROP TABLE anomalies_data_delta_NC_

In [0]:
%sql
-- Sink table for the scored per-day, per-actor aggregates; created only if absent.
CREATE TABLE IF NOT EXISTS anomalies_data_delta_NC_ (
  id               STRING,
  nr_ip_addresses  LONG,
  nr_user_agents   LONG,
  nr_contact_reads LONG,
  timestamp        TIMESTAMP,
  prediction       DOUBLE
)
USING DELTA;

In [0]:
# Writes the scored aggregates to the anomalies Delta table (not the raw table).
# Append the scored stream to the Delta table "anomalies_data_delta_NC_".
# The checkpoint directory holds the streaming query's progress; its path is
# named here so that it is easy to spot and change.
anomalies_checkpoint = "/mnt/delta/events/_checkpoints/anomalies_v.16"

anomalies_writer = (
    filtered_data.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", anomalies_checkpoint)
)

# Starting the write is the last expression of the cell, so the cell's output
# is whatever `.table(...)` returns.
anomalies_writer.table("anomalies_data_delta_NC_")