In [0]:
# Install the mlflow package from PyPI through the Databricks library utility.
# NOTE(review): no version is pinned, so the installed release may differ between runs.
dbutils.library.installPyPI("mlflow")

In [0]:
import mlflow
from pyspark.sql.types import StructType, ArrayType, StringType
from pyspark.sql.functions import col, approx_count_distinct, to_timestamp, to_date, to_json, struct, from_json

In [0]:
# Open a streaming source on the audit-log Kafka topic.
kafka_source = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "td-guardian.tdx.sandcitadel.com:9093")
    .option("subscribe", "event-splitter.audit_logs")
    .load()
)

# Keep only the message payload, cast to a string and exposed as `event`.
eventsDF = kafka_source.select(col("value").cast("string").alias("event"))

In [0]:
# Render the `event` column of the stream inline for a quick visual check.
# NOTE(review): eventsDF comes from spark.readStream, so this presumably keeps a
# live query running until the cell is cancelled -- confirm before scheduling.
display(eventsDF)

In [0]:
# Schema used to parse the JSON audit event, built bottom-up from its nested parts.
audit_schema = StructType().add("operation", "string")

actor_schema = (
    StructType()
    .add("user_id", "string")
    .add("ip_address", ArrayType(StringType()))
)

event_schema = (
    StructType()
    .add("audit", audit_schema)
    .add("actor", actor_schema)
)

fullschema = (
    StructType()
    .add("timestamp", "timestamp")
    .add("account_id", "string")
    .add("event_type", "string")
    .add("event", event_schema)
)

In [0]:
# Wrap the model logged under this MLflow run as a Spark UDF for scoring.
run_id = "09840597c6e04f279aaa27be313c6e73"
model_uri = "runs:/{}/sklearn-model".format(run_id)
model = mlflow.pyfunc.spark_udf(spark, model_uri)

# Parse the JSON payload and pull out the three fields used downstream.
parsed_events = eventsDF.select(from_json("event", fullschema).alias("data"))
actor_events = parsed_events.select(
    col("data.timestamp").alias("timestamp"),
    col("data.event.actor.user_id").alias("user_id"),
    col("data.event.actor.ip_address").alias("ip_address"),
)

# Round-trip the timestamp through a date, then declare the watermark on it.
# Disabled alternative filter kept from the original:
#   .where(col("timestamp") >= datetime.now().astimezone(timezone.utc).strftime("%Y-%m-%dT00:00:00.000+0000") )
day_start = to_timestamp(to_date("timestamp", "yyyy-MM-dd"), "yyyy-MM-dd")
daily_events = (
    actor_events
    .withColumn("timestamp", day_start)
    .withWatermark("timestamp", "5 seconds")
)

# Approximate distinct count of ip_address values per (timestamp, user_id).
# Disabled alternative grouping kept from the original:
#   .groupBy(window("timestamp", "5 minute").alias("timestamp"),"user_id").agg(approx_count_distinct('ip_address').alias('Ips'))
ip_counts = (
    daily_events
    .groupBy("timestamp", "user_id")
    .agg(approx_count_distinct("ip_address").alias("Ips"))
)

# Score each aggregate row by feeding the `Ips` column to the model UDF.
filtered_data = (
    ip_counts
    .select("user_id", "timestamp", "Ips")
    .withColumn("prediction", model("Ips"))
)

In [0]:
#display(filtered_data)

In [0]:
# Serialise each scored row into a JSON string held in the `value` column.
anomaly_payload = filtered_data.select(
    to_json(struct("user_id", "Ips", "timestamp", "prediction")).alias("value")
)

# Start the streaming query that publishes the payload to the Kafka topic.
ds = (
    anomaly_payload.writeStream
    .queryName("write_anomalies_to_kafka")
    .format("kafka")
    .option("kafka.bootstrap.servers", "td-guardian.tdx.sandcitadel.com:9093")
    .option("topic", "anomalies-databricks_3")
    .option("checkpointLocation", "/mnt/delta/events/_checkpoints/kafka-NC-BC-1.0")
    .start()
)

In [0]:

# Reads the published anomalies back from Kafka (disabled).
#anomalies = (spark.readStream
#  .format("kafka")
#  .option("kafka.bootstrap.servers", "td-guardian.tdx.sandcitadel.com:9093")
#  .option("subscribe", "anomalies-databricks_3")
#  .load())

In [0]:
# Keeps only the message payload, cast to a string column named `event` (disabled).
#anomalies_df = (
#            anomalies
#            .selectExpr("CAST(value AS STRING)")
#            .select( col("value").alias("event"))
#           )
#display(anomalies_df)
