## Part 3 - Consume/Transform data with Spark Streaming

In [6]:
from pyspark.sql import SparkSession
from IPython.display import display, clear_output
import time
from pyspark.sql import functions as F
from pyspark.sql.types import StructType,StringType, StructField, IntegerType, FloatType, BinaryType
from pyspark.sql.functions import *

In [7]:
# Obtain the notebook's Spark session (an already-running one is reused),
# registered under the application name 'kafka'.
spark = (
    SparkSession.builder
    .appName('kafka')
    .getOrCreate()
)

In [8]:
# Display the version string reported by the active Spark session.
spark.version

'3.1.1'

In [9]:
# Display the Hadoop version by calling Hadoop's VersionInfo.getVersion()
# through the session's `_jvm` handle (underscore-prefixed, i.e. not public API).
spark._jvm.org.apache.hadoop.util.VersionInfo.getVersion()

'3.2.0'

## Raw Data Streams

In [10]:
def generate_stocktrades_stream(
    bootstrap_servers="broker:29092",
    topic="STOCKTRADES_JSON",
    query_name="stocktrades_view",
):
    """Start a streaming query that loads stock trades from Kafka into a memory sink.

    The Kafka ``key`` and ``value`` columns are cast to strings, ``value`` is
    parsed as JSON against the stock-trades schema, and the parsed struct is
    flattened into one column per trade attribute next to the event metadata.

    Side effect: the parsed but not yet flattened stream is published as the
    module-level ``json_stream_df`` so that later cells can attach their own
    sinks to it.

    Args:
        bootstrap_servers: Value for the ``kafka.bootstrap.servers`` option.
        topic: Kafka topic to subscribe to.
        query_name: Name given to the memory-sink query. Choose a different
            name if a query with this name is already running.

    Returns:
        The object returned by ``writeStream...start()`` for the memory sink.
    """
    # Shared with later notebook cells; see the docstring.
    global json_stream_df

    # Define the Spark stream, starting from the "latest" offsets.
    kafka_df = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", bootstrap_servers)
        .option("startingOffsets", "latest")
        .option("subscribe", topic)
        .load()
    )

    # Cast key and value to strings so the payload can be parsed as JSON.
    string_df = (
        kafka_df
        .withColumn("key", kafka_df["key"].cast(StringType()))
        .withColumn("value", kafka_df["value"].cast(StringType()))
    )

    # Schema of the JSON payload carried in `value`.
    schema_stocktrades = StructType([
        StructField("SIDE", StringType(), True),
        StructField("QUANTITY", IntegerType(), True),
        StructField("PRICE", IntegerType(), True),
        StructField("SYMBOL", StringType(), True),
        StructField("ACCOUNT", StringType(), True),
        StructField("USERID", StringType(), True),
    ])

    # Replace the JSON string with a struct parsed according to the schema.
    json_stream_df = string_df.withColumn(
        "value", F.from_json("value", schema_stocktrades)
    )

    # Flatten: event metadata first, then one column per trade attribute.
    stocktrades_stream_df = json_stream_df.select(
        F.col("key").alias("event_key"),
        F.col("topic").alias("event_topic"),
        F.col("timestamp").alias("event_timestamp"),
        "value.side",
        "value.quantity",
        "value.price",
        "value.symbol",
        "value.account",
        "value.userid",
    )

    # Start writing the flattened stream to the memory sink.
    return (
        stocktrades_stream_df.writeStream
        .format("memory")
        .queryName(query_name)
        .start()
    )

### **json_stream_df to stocktrade_stream_df to Parquet sink**

In [11]:
# Solution:
# Start the memory-sink query; the call also sets the module-level
# `json_stream_df` that the projection below reads from.
generate_stocktrades_stream()

# Event metadata columns, renamed with an `event_` prefix.
event_columns = [
    F.col("key").alias("event_key"),
    F.col("topic").alias("event_topic"),
    F.col("timestamp").alias("event_timestamp"),
]

# Trade attributes taken from the parsed `value` struct.
trade_columns = [
    "value.side",
    "value.quantity",
    "value.price",
    "value.symbol",
    "value.account",
    "value.userid",
]

stocktrade_stream_df = json_stream_df.select(*event_columns, *trade_columns)

In [13]:
# Write the flattened trade stream to Parquet files in append mode.
# The "header" option was dropped: it is a CSV option and has no meaning for
# the Parquet format.
# The query handle is kept in a variable so the stream can be inspected or
# stopped later (e.g. `parquet_query.stop()`).
parquet_query = (
    stocktrade_stream_df.writeStream
    .format("parquet")
    .option("path", "big_data_eng/bde_lab_8/")
    .option("checkpointLocation", "checkpoint/data")
    .outputMode("append")
    .start()
)
parquet_query

<pyspark.sql.streaming.StreamingQuery at 0x7f816696a8b0>