**Incremental Data Loads using Kafka**

In [None]:
import os

import pyspark
from IPython.core.interactiveshell import InteractiveShell
from pyspark.sql import SparkSession

# Display the value of every top-level expression in a cell, not only the last one.
InteractiveShell.ast_node_interactivity = "all"

# Set before the SparkContext is created so that the Kafka connector package
# is included in the submit arguments.
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-sql-kafka-0-10_2.12:3.2.1 pyspark-shell'

# Reuse a running SparkContext if there is one, then wrap it in a SparkSession.
sc = pyspark.SparkContext.getOrCreate()
spark = SparkSession(sc)

In [None]:
# Subscribe to the Kafka topic `retail-events` as a streaming source.
# Kafka records follow a key-value pattern: the actual event is a JSON object
# encoded in the `value` column and still has to be extracted from it.
kafka_source_options = {
    "kafka.bootstrap.servers": "localhost:9092",
    "subscribe": "retail-events",
    "startingOffsets": "latest",
    "failOnDataLoss": "false",
}
kafka_df = spark.readStream.format("kafka").options(**kafka_source_options).load()


In [None]:
# Preview the raw Kafka records on the console, with key and value cast to strings.
raw_preview = kafka_df.selectExpr("CAST(key AS STRING)", "CAST(value AS STRING)")
query = (
    raw_preview.writeStream
    .format("console")
    .option("checkpointLocation", "/tmp/data-lake/")
    .start()
)

In [None]:
# Stop the streaming query currently bound to `query` before moving on.
query.stop()

In [None]:
# The event itself is a JSON object stored in the Kafka `value` column.
# eventSchema describes the fields of that JSON object.
from pyspark.sql.types import StructType, StructField, StringType, IntegerType, TimestampType, DoubleType
eventSchema = StructType([
    StructField('InvoiceNo', StringType()),
    StructField('StockCode', StringType()),
    StructField('Description', StringType()),
    StructField('Quantity', IntegerType()),
    StructField('InvoiceDate', StringType()),
    StructField('UnitPrice', DoubleType()),
    StructField('CustomerID', IntegerType()),
    StructField('Country', StringType()),
])
# Parse `value` with from_json() using the schema, then flatten the parsed struct so that
# retail_df exposes one column per event field.
# The Kafka record `timestamp` column is carried along as EventTime.
# kafka_df comes from readStream, so retail_df is a streaming DataFrame as well and the
# Structured Streaming APIs are available on it.
from pyspark.sql.functions import col, from_json, to_date
message = from_json(col("value").cast(StringType()), eventSchema).alias("message")
event_time = col("timestamp").alias("EventTime")
retail_df = kafka_df.select(message, event_time).select("message.*", "EventTime")

In [None]:
# Preview the parsed events (retail_df) on the console.
# A checkpoint records the progress of one specific streaming query, so this query
# gets a checkpoint directory of its own instead of sharing one with another query,
# and the checkpoint metadata is kept out of the data-lake root folder.
query = (
    retail_df.writeStream
    .format("console")
    .option("checkpointLocation", "/tmp/data-lake/_checkpoints/retail_df_console")
    .start()
)
 

In [33]:
# Persist the events extracted from the Kafka stream to the data lake.
#
# writeStream saves the streaming DataFrame as Parquet files; once written, these files
# are no different from Parquet files produced by a batch job.
# outputMode("append") treats the data as an unbounded dataset: every run only adds new files.
# checkpointLocation stores the Structured Streaming write-ahead log and offset information,
# which makes this an incremental load: each run picks up only new, unprocessed events.
# trigger(once=True) processes the data that is available and then stops the query.
#
base_path = "/tmp/data-lake/retail_events.parquet"
retail_load_query = (retail_df
    .withColumn("EventDate", to_date(retail_df.EventTime))
    .writeStream
    .format('parquet')
    .outputMode("append")
    .trigger(once=True)
    .option('checkpointLocation', base_path + '/_checkpoint')
    .start(base_path)
)
# start() returns as soon as the query is launched. Block until the one-off batch has
# finished so that the Parquet files are complete before any later cell reads them.
retail_load_query.awaitTermination()

22/05/23 18:18:59 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.StreamingQuery at 0x7ff37c04c730>

                                                                                

In [34]:
# Read back the Parquet files written by the streaming load and preview them.
# show() only prints and returns None, so the DataFrame is bound to df2 first
# and displayed in a separate statement.
df2 = spark.read.parquet("/tmp/data-lake/retail_events.parquet")
df2.show()

+---------+---------+--------------------+--------+--------------+---------+----------+--------------+--------------------+----------+
|InvoiceNo|StockCode|         Description|Quantity|   InvoiceDate|UnitPrice|CustomerID|       Country|           EventTime| EventDate|
+---------+---------+--------------------+--------+--------------+---------+----------+--------------+--------------------+----------+
|   536365|   85123A|WHITE HANGING HEA...|       6|01/12/10 08:26|     2.55|     17850|United Kingdom|2022-05-23 15:32:...|2022-05-23|
|   536365|    71053| WHITE METAL LANTERN|       6|01/12/10 08:26|     3.39|     17850|United Kingdom|2022-05-23 15:32:...|2022-05-23|
|   536365|   84406B|CREAM CUPID HEART...|       8|01/12/10 08:26|     2.75|     17850|United Kingdom|2022-05-23 15:32:...|2022-05-23|
|   536365|   84029G|KNITTED UNION FLA...|       6|01/12/10 08:26|     3.39|     17850|United Kingdom|2022-05-23 15:32:...|2022-05-23|
|   536365|   84029E|RED WOOLLY HOTTIE...|       6|01/1