In [0]:
#dbutils.library.installPyPI("mlflow")
#dbutils.library.restartPython()
from pyspark.sql.types import StructType
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType
from pyspark.sql.functions import *      # for window() function
from typing import List

In [0]:
### method 1 to create tables:
### - create schema
### - create empty spark dataframe with schema
### - create tables from spark dataframe

#creating schemas
# Schema of one raw log record: top-level account/agent identifiers, a nested
# "event" struct (with its own "actor" and "audit" sub-structs) and a
# record-level timestamp. Every field is nullable (the StructField default).
schema_raw_logs = StructType([
    StructField("account_id", StringType()),
    StructField("agent_id", StringType()),
    StructField("event", StructType([
        StructField("actor", StructType([
            StructField("user_id", StringType()),
            # Array of structs with no declared fields, exactly as originally declared.
            StructField("ip_addresses", ArrayType(StructType([]))),
            StructField("session_id", StringType()),
            StructField("impersonated_user_id", StringType()),
            StructField("id", StringType()),
            StructField("type", StringType()),
            StructField("user_agent", StringType()),
        ])),
        StructField("account_id", StringType()),
        StructField("event_type", StringType()),
        StructField("audit", StructType([
            StructField("severity", StringType()),
            StructField("resource_id", StringType()),
            StructField("operation", StringType()),
            StructField("timestamp", TimestampType()),
            StructField("status", StringType()),
        ])),
        StructField("logger_event_id", StringType()),
        # Struct with no declared fields.
        StructField("object", StructType([])),
        StructField("timestamp", TimestampType()),
    ])),
    StructField("timestamp", TimestampType()),
])


# Schema of one anomaly record: the anomaly time, the bucket it belongs to
# (start, finish, interval), the account/user it concerns, an integer anomaly
# value with its probability, and three string "top contributer" columns.
# Every field is nullable (the StructField default).
schema_raw_anomalies = StructType([
    StructField("anomaly_timestamp", TimestampType()),
    StructField("bucket_start_timestamp", TimestampType()),
    StructField("bucket_finish_timestamp", TimestampType()),
    StructField("bucket_interval", StringType()),
    StructField("account_id", StringType()),
    StructField("user_id", StringType()),
    StructField("anomaly", IntegerType()),
    StructField("probability", DoubleType()),
    StructField("top_contributers_1", StringType()),
    StructField("top_contributers_2", StringType()),
    StructField("top_contributers_3", StringType()),
])

#create spark dataframes with schemas
#df_logs = spark.createDataFrame(spark.sparkContext.emptyRDD(),schema_raw_logs)
#df_anomalies = spark.createDataFrame(spark.sparkContext.emptyRDD(),schema_raw_anomalies)

#create tables using pre-existent dataframes
# not working yet
#df_logs.write.format("delta").save("/delta/raw_log_data_delta")
#df_anomalies.write.format("delta").save("/delta/anomalies_data_delta")

In [0]:
# Method 2: create the tables with SQL (below)

In [0]:
%sql
-- Flat Delta table for raw log records.
-- IF NOT EXISTS keeps this cell idempotent: a plain CREATE TABLE fails with
-- "table already exists" whenever the notebook is re-run.
-- NOTE(review): `timestamp` is DATE here, while schema_raw_logs declares its
-- timestamp fields as "timestamp"; DATE drops the time of day -- confirm this is intended.
-- NOTE(review): ip_addresses and object are STRING here but an array / struct in
-- schema_raw_logs -- presumably they are serialized before writing; verify against the writer.
CREATE TABLE IF NOT EXISTS br_raw_log_data_delta_ (
  account_id STRING,
  agent_id STRING,
  user_id STRING,
  ip_addresses STRING,
  session_id STRING,
  impersonated_user_id STRING,
  id STRING,
  type STRING,
  user_agent STRING,
  event_type STRING,
  severity STRING,
  resource_id STRING,
  operation STRING,
  status STRING,
  logger_event_id STRING,
  object STRING,
  timestamp DATE
 )
USING DELTA;


-- Delta table for anomaly records; the columns match the field names of
-- schema_raw_anomalies.
-- IF NOT EXISTS keeps this statement idempotent: a plain CREATE TABLE fails with
-- "table already exists" whenever the notebook is re-run.
-- NOTE(review): the three *_timestamp columns are DATE here but "timestamp" in
-- schema_raw_anomalies; DATE cannot represent sub-day bucket boundaries -- confirm
-- this is intended.
CREATE TABLE IF NOT EXISTS br_anomalies_data_delta_ (
  anomaly_timestamp DATE,
  bucket_start_timestamp DATE,
  bucket_finish_timestamp DATE,
  bucket_interval STRING,
  account_id STRING,
  user_id STRING,
  anomaly INTEGER,
  probability DOUBLE,
  top_contributers_1 STRING,
  top_contributers_2 STRING,
  top_contributers_3 STRING
)
USING DELTA;

In [0]:
%sql
-- List every table registered in the default database.
SHOW TABLES IN default;
-- Show storage details (format, location, size, ...) for one of them.
DESCRIBE DETAIL default.better_ip_results;

format,id,name,description,location,createdAt,lastModified,partitionColumns,numFiles,sizeInBytes,properties,minReaderVersion,minWriterVersion
parquet,,default.better_ip_results,,dbfs:/user/hive/warehouse/better_ip_results,2020-09-22T14:28:55.000+0000,,List(),,,Map(),,


In [0]:
# Bucket and mount-point names.
AWS_BUCKET_NAME = "td-infra-prd-us-east-1-s3-guardian"
MOUNT_NAME = "kafka_raw"

# AWS credentials are read from the "aws-s3-prd" Databricks secret scope
# rather than being written into the notebook.
ACCESS_KEY = dbutils.secrets.get(scope="aws-s3-prd", key="ACCESS_KEY_ID")
SECRET_KEY = dbutils.secrets.get(scope="aws-s3-prd", key="SECRET_ACCESS_KEY")
# Percent-encode "/" in the secret key.
# NOTE(review): presumably so it can be embedded in a mount URI -- no mount call is visible here.
ENCODED_SECRET_KEY = SECRET_KEY.replace("/", "%2F")


#display(dbutils.fs.ls("/mnt/%s" % MOUNT_NAME))

# Infer the JSON schema from one sample file with a batch read; the streaming
# readers further down are given this schema explicitly.
schema = (
    spark.read
    .json("dbfs:/mnt/kafka_raw/57c7413abca837e974000009/audit_logs_1590969600000-1598918400000.json")
    .schema
)






In [0]:
%fs head "dbfs:/mnt/kafka_raw/57c7413abca837e974000009/audit_logs_1590969600000-1598918400000.json"

In [0]:
# Last expression of the cell: shows the JSON representation of the inferred `schema`.
schema.json()

In [0]:
# Streaming reader over the mounted JSON directory, using the schema inferred
# from the sample file.
data = (
    spark.readStream
    .format("json")
    .schema(schema)
    .load("/mnt/kafka_raw/57c7413abca837e974000009/")
)

In [0]:
from pyspark.sql.functions import *

# Read the files already in the directory as a stream; maxFilesPerTrigger=1
# feeds them to the query one file per micro-batch.
streamingInputDF = (
    spark.readStream
    .format("json")
    .schema(schema)
    .option("maxFilesPerTrigger", 1)
    .load("/mnt/kafka_raw/57c7413abca837e974000009/")
)

# Count records per account within 1-second windows of the record timestamp.
streamingCountsDF = streamingInputDF.groupBy(
    streamingInputDF["account_id"],
    window(streamingInputDF["timestamp"], "1 second"),
).count()

# Is this DF actually a streaming DF?
# A bare expression in the middle of a cell is evaluated and then discarded
# (only a cell's last expression is displayed), so print the flag to make the
# answer visible.
print(streamingCountsDF.isStreaming)

# Start the streaming query. The sink settings are passed straight to start():
#   format="memory"       -> results are kept in an in-memory table
#   queryName="counts"    -> name of that in-memory table
#   outputMode="complete" -> the table holds all counts after each trigger
query = streamingCountsDF.writeStream.start(
    format="memory",
    queryName="counts",
    outputMode="complete",
)
