# **Assignment 4: PySpark Structured Streaming Using Kafka Source**

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the SparkSession that every query in this notebook runs on.
spark = (
    SparkSession.builder
    .appName("pyspark-kafka-streaming")
    # Cluster master URL.
    .master("spark://spark-master:7077")
    # Kafka connector package for Spark SQL / Structured Streaming.
    # NOTE(review): the 2.12 / 3.0.0 coordinates presumably have to match the
    # cluster's Scala and Spark versions -- confirm.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.0.0")
    .config("spark.executor.memory", "512m")
    .getOrCreate()
)

## ==== Q2 ====

#### **Q2.1:** All your code for 2.1 should be in the following cell

In [None]:
# Answer to 2.1
# Open a streaming DataFrame that reads from the Kafka topic "topic_test1".
df_streamed_raw = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9093")  # broker to connect to
    .option("subscribe", "topic_test1")  # topic to consume
    .load()
)


In [None]:
from pyspark.sql.types import StringType
from pyspark.sql.functions import col

# Cast the raw "key" and "value" columns to strings so they can be displayed
# and parsed; all other columns are kept unchanged.
df_streamed_kv = (
    df_streamed_raw
    .withColumn("key", col("key").cast(StringType()))
    .withColumn("value", col("value").cast(StringType()))
)

# Debug stream: write the decoded records to an in-memory table so they can be
# inspected with spark.sql.
test_query = (
    df_streamed_kv.writeStream
    .queryName("test_query_table")  # name of the in-memory table
    .outputMode("update")  # only write updated rows to the sink
    .format("memory")  # output to memory
    .start()
)

#### If all goes well, the following cell should display a table populated with values being streamed from your Kafka producer. NOTE: If you recently ran the producer, it may take a while before the table is populated. Keep rerunning the cell to check for updates:

In [None]:
# Display the current contents of "test_query_table", the in-memory table
# populated by the memory-sink query named "test_query_table".
spark.sql("select * from test_query_table").show()

In [None]:
# Stop the memory-sink debug stream once the table has been inspected.
test_query.stop()

#### The following cells contain code that takes the streamed dataframe and formats it properly into a table. If any of the given cells fails, there might be a formatting issue with one of your previous solutions.

In [None]:
from pyspark.sql.functions import from_json
from pyspark.sql.types import StructType, StructField, BooleanType, LongType, IntegerType

# Schema used to parse the JSON payload in each message's "value".
# Every field is declared as a string; no numeric types are used here.
# NOTE(review): "ice_acceretion_1hr" looks like a misspelling of "accretion";
# confirm it matches the key the producer actually emits, since a mismatched
# name would presumably leave this field null after parsing.
event_schema = StructType([
    StructField(field_name, StringType())
    for field_name in (
        "station",
        "valid",
        "tmpf",
        "dwpf",
        "relh",
        "feel",
        "drct",
        "sped",
        "alti",
        "mslp",
        "p01m",
        "vsby",
        "skyc1",
        "skyl1",
        "wxcodes",
        "ice_acceretion_1hr",
    )
])

# Parse the events from JSON format: the string "value" column is replaced by
# a struct column laid out according to event_schema.
df_parsed = df_streamed_kv.withColumn(
    "value",
    from_json(col("value"), event_schema),
)

In [None]:
from pyspark.sql.functions import to_timestamp, unix_timestamp

# Flatten the parsed records into one top-level column per field:
#   * key / topic / timestamp are renamed with an "event_" prefix;
#   * every field of the "value" struct is promoted under its own name.
# The fields are passed through as-is; no type conversion or timestamp
# parsing is applied in this step.
df_formatted = df_parsed.select(
    col("key").alias("event_key"),
    col("topic").alias("event_topic"),
    col("timestamp").alias("event_timestamp"),
    *[
        col(f"value.{field_name}").alias(field_name)
        for field_name in (
            "station",
            "valid",
            "tmpf",
            "dwpf",
            "relh",
            "feel",
            "drct",
            "sped",
            "alti",
            "mslp",
            "p01m",
            "vsby",
            "skyc1",
            "skyl1",
            "wxcodes",
            "ice_acceretion_1hr",
        )
    ]
)

#### **Q2.2:** All your code for 2.2 should be in the following cell


In [None]:
# Answer to 2.2
# Print newly arrived rows to the console, triggering every 5 seconds.
query = (
    df_formatted.writeStream
    .outputMode("append")
    .format("console")
    .option("truncate", "false")  # do not cut off long column values
    .trigger(processingTime="5 seconds")
    .start()
)

In [None]:
# List the id and name of every currently active streaming query
# (this may be useful during debugging).
for active_stream in spark.streams.active:
    print(f"ID:{active_stream.id} | NAME:{active_stream.name}")

In [None]:
# Stop the console stream started for Q2.2.
query.stop()

# ==== Project - Start your feature extraction queries from here ====

#### **Q3.1:** All your code for 3.1 should be in the following cell

In [None]:
from pyspark.sql.functions import *

# Number of records received per Kafka topic, sorted by topic name.
df_cumulative_count = (
    df_formatted
    .groupBy(col("event_topic"))
    .count()
    .orderBy(col("event_topic"))
)

In [None]:
# Print the full per-topic count table to the console every 5 seconds.
final_count = (
    df_cumulative_count.writeStream
    .format("console")
    .option("truncate", "false")  # do not cut off long column values
    .outputMode("complete")  # re-emit the whole result table on each trigger
    .trigger(processingTime="5 seconds")
    .start()
)

In [None]:
# Stop the per-topic count stream.
final_count.stop()

#### **Q3.2:** All your code for 3.2 should be in the following cell

In [None]:
from pyspark.sql.functions import *

# Count records per 10-second window (slide equal to the window length) and
# per request type.
# NOTE(review): df_formatted, as built in this notebook, only exposes
# event_key, event_topic, event_timestamp and the fields of event_schema;
# there is no "request_type" column, so `df_formatted.request_type` looks like
# a leftover from a different dataset and is expected to fail. Confirm which
# column this query should group by.
df_request_type = (df_formatted
                    .groupBy(window(df_formatted.event_timestamp, "10 seconds", "10 seconds"),df_formatted.request_type)
                    .count()
                 )

In [None]:
# Print the full windowed count table to the console every 5 seconds.
final_request_type = (
    df_request_type.writeStream
    .format("console")
    .outputMode("complete")  # re-emit the whole result table on each trigger
    .trigger(processingTime="5 seconds")
    .option("truncate", "false")  # do not cut off long column values
    .start()
)

In [None]:
final_request_type.stop()

#### **Q3.3:** All your code for 3.3 should be in the following cell


In [None]:
from pyspark.sql.functions import *

# Average of "response_size" per 10-second window, printed to the console
# every 10 seconds. The slide equals the window length, so consecutive
# windows do not overlap.
# NOTE(review): "response_size" is not among the columns selected into
# df_formatted in this notebook -- confirm the intended column.
smAvg = (
    df_formatted
    .groupBy(window("event_timestamp", "10 seconds", "10 seconds"))
    .agg(avg("response_size").alias("moving_average"))
    .writeStream
    .format("console")
    .outputMode("complete")  # re-emit the whole result table on each trigger
    .trigger(processingTime="10 seconds")
    .option("truncate", "false")  # do not cut off long column values
    .start()
)

In [None]:
# Stop the windowed-average stream.
smAvg.stop()

#### **Q3.4:** All your code for 3.4 should be in the following cell

#### 3.4.1

In [None]:
from pyspark.sql.functions import *
# Per 10-second window: mean, sample standard deviation (both rounded to 4
# decimals), row count, and the list of all "response_size" values seen.
# NOTE(review): "response_size" is not among the columns selected into
# df_formatted in this notebook -- confirm the intended column.
df_select = (
    df_formatted
    .groupBy(window(col("event_timestamp"), "10 seconds").alias("Time_Window"))
    .agg(
        round(avg(col("response_size")), 4).alias("Avg"),
        round(stddev_samp(col("response_size")), 4).alias("Standard_Dev"),
        count("*").alias("Count"),
        collect_list(col("response_size")).alias("List"),
    )
    .select("Time_Window", "Avg", "Standard_Dev", "Count", "List")
)

In [None]:
# Print the full per-window statistics table to the console every 10 seconds.
df_select_final = (
    df_select.writeStream
    .format("console")
    .option("truncate", "false")  # do not cut off long column values
    .outputMode("complete")  # re-emit the whole result table on each trigger
    .trigger(processingTime="10 seconds")
    .start()
)

In [None]:
# Stop the per-window statistics stream.
df_select_final.stop()

#### 3.4.2

In [None]:
from pyspark.sql.functions import explode

# One output row per collected value: expand each window's "List" array into
# a "ResponseSize" column while keeping the window and its statistics.
df_explode_query = df_select.select(
    "Time_Window",
    "Avg",
    "Standard_Dev",
    explode(col("List")).alias("ResponseSize"),
)

In [None]:
# Print the exploded per-value table to the console every 10 seconds.
df_explode_final = (
    df_explode_query.writeStream
    .format("console")
    .option("truncate", "false")  # do not cut off long column values
    .outputMode("complete")  # re-emit the whole result table on each trigger
    .trigger(processingTime="10 seconds")
    .start()
)

In [None]:
# Stop the exploded per-value stream.
df_explode_final.stop()

#### 3.4.3

In [None]:
# Keep only the values whose z-score within their window is above 1 or
# below -1. The z_score column is used for filtering only and is not part of
# the result.
df_score = (
    df_explode_query
    .withColumn("z_score", (col("ResponseSize") - col("Avg")) / col("Standard_Dev"))
    .where((col("z_score") > 1) | (col("z_score") < -1))
    # Drop rows whose z-score is NaN.
    .where(~isnan(col("z_score")))
    .select("Time_Window", "Avg", "Standard_Dev", "ResponseSize")
)

In [None]:
# Print the filtered values to the console every 10 seconds.
final_data = (
    df_score.writeStream
    .format("console")
    .outputMode("complete")  # re-emit the whole result table on each trigger
    .trigger(processingTime="10 seconds")
    .option("truncate", "false")  # do not cut off long column values
    .start()
)

In [None]:
# Stop the z-score filter stream.
final_data.stop()