In [0]:
# Install the MLflow package from PyPI through the Databricks library utility,
# then restart the Python interpreter of this notebook.
pypi_package = "mlflow"
dbutils.library.installPyPI(pypi_package)
dbutils.library.restartPython()

In [0]:
from pyspark.sql.types import StructType
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.functions import *      # for window() function
from typing import List
from pyspark.sql.types import *
import pandas as pd
import time
from datetime import datetime, timedelta, timezone
import mlflow
from pyspark.sql.types import StructType

In [0]:
%sql
-- Reset both pipeline tables so the CREATE TABLE cell that follows can
-- rebuild them from scratch.
-- IF EXISTS keeps this cell from failing on a first run (or after a manual
-- cleanup), when one or both tables have not been created yet.
DROP TABLE IF EXISTS raw_log_data_delta_PN;
DROP TABLE IF EXISTS anomalies_data_delta_PN_;

In [0]:
%sql
-- Landing table for the raw log records; `event` is stored as an unparsed string.
CREATE TABLE raw_log_data_delta_PN (
  account_id STRING,
  agent_id   STRING,
  event      STRING,
  timestamp  TIMESTAMP
) USING DELTA;

-- Output table that receives the scored rows appended by the streaming job.
CREATE TABLE anomalies_data_delta_PN_ (
  user_id    STRING,
  Ips        LONG,
  prediction DOUBLE
) USING DELTA;


In [0]:
# Directory of JSON log files that is consumed as a stream.
inputPath = "dbfs:/mnt/kafka_raw/57c7413abca837e974000009/"

# Explicit schema for the incoming JSON records; `event` is read as a plain
# string at this stage (all fields nullable, as with StructType.add).
schema_raw_logs = StructType([
    StructField("account_id", StringType()),
    StructField("agent_id", StringType()),
    StructField("event", StringType()),
    StructField("timestamp", TimestampType()),
])

# Streaming source over the JSON files under inputPath.
json_stream_reader = spark.readStream.schema(schema_raw_logs)
# One file per trigger, so a sequence of files is treated as a stream.
json_stream_reader = json_stream_reader.option("maxFilesPerTrigger", 1)
eventsDF = json_stream_reader.json(inputPath)

# Streaming sink: append every incoming record to the raw table.
raw_table_writer = eventsDF.writeStream.outputMode("append")
raw_table_writer = raw_table_writer.option(
    "checkpointLocation", "/mnt/delta/events/_checkpoints/etl-from-json_PN_v0.16"
)
raw_table_writer.table("raw_log_data_delta_PN")




In [0]:
# Stream the raw Delta table back in; this frame feeds the scoring query.
# TODO: we may need to use OPTIMIZE, which deals with small files by merging
# and compacting them into larger files.
raw_data = (
    spark.readStream
    .format("delta")
    .table("raw_log_data_delta_PN")
)




In [0]:
# Schema of a complete log record, with the `event` payload described as a
# nested struct.
#
# The previous version wrote `.add("event", StructType())`, which closes the
# struct immediately: every field meant for `event` was added to the top-level
# schema instead, leaving `event` empty and producing duplicate top-level
# `account_id`, `logger_event_id` and `timestamp` fields. The fields are now
# chained inside the `event` struct, as the indentation always suggested.
#
# To parse only the JSON held in an `event` string column, pass
# `fullschema["event"].dataType` to `from_json`.
fullschema = (
    StructType()
    .add("logger_event_id", "string")
    .add("logger_timestamp", "timestamp")
    .add("account_id", "string")
    .add("agent_id", "string")
    .add("event", StructType()
         .add("actor", StructType()
              .add("user_id", "string")
              .add("ip_addresses", ArrayType(StringType()))
              .add("session_id", "string")
              .add("impersonated_user_id", "string")
              .add("id", "string")
              .add("type", "string")
              .add("user_agent", "string")
         )
         .add("account_id", "string")
         .add("event_type", "string")
         .add("audit", StructType()
              .add("severity", "string")
              .add("resource_id", "string")
              .add("operation", "string")
              .add("timestamp", "timestamp")
              .add("status", "string")
         )
         .add("logger_event_id", "string")
         # NOTE(review): `object` is declared as an empty struct; its fields
         # are not specified anywhere in this notebook.
         .add("object", StructType())
         .add("timestamp", "timestamp")
    )
    .add("timestamp", "timestamp")
)


In [0]:
# MLflow run that logged the model, and the artifact URI derived from it.
run_id = "09840597c6e04f279aaa27be313c6e73"
model_uri = f"runs:/{run_id}/sklearn-model"

# Load the logged artifact with MLflow's Spark flavor.
# NOTE(review): the artifact path is named "sklearn-model" but is loaded with
# mlflow.spark; an earlier draft used mlflow.pyfunc.spark_udf(spark, model_uri)
# instead. Confirm which flavor the run actually logged.
model = mlflow.spark.load_model(model_uri)


In [0]:



def foreach_batch_function(df, epoch_id):
    """Score one micro-batch of raw log rows and append the result to the
    ``anomalies_data_delta_PN_`` table.

    The previous version could not run: ``agg_df`` was never assigned (all the
    steps building it were commented out), the micro-batch ``df`` was ignored,
    ``udf('Ips')`` is not a column expression, and ``DataFrameWriter`` has no
    ``table`` method. This version builds the aggregate from the micro-batch,
    takes ``prediction`` from ``model.transform`` and writes with
    ``saveAsTable``.

    Args:
        df: Micro-batch DataFrame handed over by ``foreachBatch``. It must
            contain the ``event`` column holding the event JSON as a string.
        epoch_id: Identifier of the micro-batch; not used.

    Returns:
        None. The scored rows are appended to ``anomalies_data_delta_PN_``.
    """
    # Only the actor fields are needed, so the `event` JSON is parsed with a
    # minimal schema that covers just those two attributes.
    actor_schema = StructType().add(
        "actor",
        StructType()
        .add("user_id", "string")
        .add("ip_addresses", ArrayType(StringType())),
    )

    actors_df = (
        df.select(from_json("event", actor_schema).alias("data"))
        .select("data.actor.user_id", "data.actor.ip_addresses")
        # Comparing with `!= None` evaluates to NULL for every row in Spark,
        # so rows without a user id are dropped with isNotNull() instead.
        .filter(col("user_id").isNotNull())
    )

    # Number of events per (user_id, ip_addresses) pair, exposed as `Ips` to
    # match the column of the target table.
    # NOTE(review): grouping keys are kept from the original query; confirm
    # that `Ips` is meant to be this count and not a distinct-IP count per user.
    agg_df = (
        actors_df
        .groupBy("user_id", "ip_addresses")
        .count()
        .withColumnRenamed("count", "Ips")
    )

    # NOTE(review): assumes the loaded model accepts these columns and adds a
    # `prediction` column -- the model's input contract is not visible here.
    result = (
        model.transform(agg_df)
        # Keep exactly the columns declared for anomalies_data_delta_PN_.
        .select("user_id", "Ips", "prediction")
        .sort("user_id", "prediction")
    )

    result.write.format("delta").mode("append").saveAsTable("anomalies_data_delta_PN_")
    

  


In [0]:
# Start the scoring stream: every micro-batch read from the raw table is
# handed to foreach_batch_function, which writes the results to the table.
anomaly_stream_writer = raw_data.writeStream.foreachBatch(foreach_batch_function)
anomaly_stream_writer = anomaly_stream_writer.option(
    "checkpointLocation", "/mnt/delta/events/_checkpoints/anomalies_v.7"
)
query = anomaly_stream_writer.start()
   
  