In [18]:
# Libraries
from kafka import KafkaProducer
import json, time, random, uuid
from datetime import datetime, timedelta
from pyspark.sql.functions import expr, col, from_json
from pyspark.sql.types import StringType, StructField, StructType, LongType

In [5]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import to_json, struct

# 1. Build (or reuse) the local Spark session that the following cells read from.
spark = (
    SparkSession.builder
    .master('local[*]')
    .appName("OrdersToKafka")
    # Connector package for the Kafka source used by the streaming reader.
    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.12:3.4.1")
    .config('spark.sql.shuffle.partitions', 4)
    .config('spark.streaming.stopGracefullyOnShutdown', True)
    .getOrCreate()
)
# Streaming source: subscribe to the `order_data` topic on broker kafka:9092,
# starting from the earliest available offsets.
order_df = (
    spark.readStream
    .format('kafka')
    .options(**{
        'kafka.bootstrap.servers': 'kafka:9092',
        'subscribe': 'order_data',
        'startingOffsets': 'earliest',
    })
    .load()
)

In [9]:
# Transform data: replace the Kafka source's `value` column with its string cast
# so it can be parsed as JSON text downstream.
order_trans = order_df.withColumn('value', col('value').cast('string'))

In [15]:
# Schema of the JSON order payload: every field is a nullable string.
# NOTE(review): quantity / price / total_value are parsed as strings here while
# LongType is imported but unused in the visible cells — confirm whether
# numeric types were intended.
order_schema = StructType([
    StructField(field_name, StringType(), True)
    for field_name in (
        "order_id",
        "customer_id",
        "product_id",
        "product_name",
        "quantity",
        "price",
        "total_value",
        "order_date",
    )
])

# Parse the JSON text in `value` with order_schema and flatten the resulting
# struct so each order field becomes its own top-level column.
order_stream_df = (
    order_trans
    .select(from_json(col('value'), order_schema).alias('value_json'))
    .select('value_json.*')
)

# Write the output to console sink to check the output --to remove later
# The StreamingQuery handle is kept so the debug query can be inspected and
# stopped, and the wait is bounded: a bare awaitTermination() blocks this cell
# (and therefore every cell after it) until the kernel is interrupted.
console_query = (
    order_stream_df
    .writeStream
    .format('console')
    .outputMode('append')
    .option('checkpointLocation', 'checkpoint_dir_kafka_order_data')
    .start()
)
try:
    # Timeout is in seconds; returns once the query ends or the timeout elapses.
    console_query.awaitTermination(60)
finally:
    # Always stop the query so it does not keep running in the background
    # after the cell finishes.
    console_query.stop()

In [26]:


# DataFrame.show() is a batch action; Spark rejects it on a streaming DataFrame
# ("Queries with streaming sources must be executed with writeStream.start()").
# When order_stream_df is streaming, preview a bounded batch read of the same
# topic instead, parsed with the same schema; a batch frame is shown as-is.
if order_stream_df.isStreaming:
    order_preview_df = (
        spark.read
        .format('kafka')
        .option('kafka.bootstrap.servers', 'kafka:9092')
        .option('subscribe', 'order_data')
        .option('startingOffsets', 'earliest')
        .load()
        .select(from_json(col('value').cast('string'), order_schema).alias('value_json'))
        .select('value_json.*')
    )
else:
    order_preview_df = order_stream_df
order_preview_df.show()

+--------------------+-----------+----------+------------+--------+-----+-----------+----------+
|            order_id|customer_id|product_id|product_name|quantity|price|total_value|order_date|
+--------------------+-----------+----------+------------+--------+-----+-----------+----------+
|cd7a87b1-07f6-424...|   CUST0048|      P004|     Monitor|       2|  300|        600|2024-04-13|
|c89397b5-8397-4e4...|   CUST0056|      P003|  Headphones|       1|  150|        150|2024-06-11|
|02eb5720-0a54-422...|   CUST0176|      P004|     Monitor|       1|  300|        300|2024-01-20|
|e53b7ba7-03ed-4ea...|   CUST0135|      P002|       Phone|       1|  800|        800|2024-06-09|
|25a600b3-575d-4a2...|   CUST0048|      P003|  Headphones|       5|  150|        750|2024-06-19|
|7516e7cb-f0a6-4df...|   CUST0093|      P002|       Phone|       2|  800|       1600|2024-03-25|
|fc9535c1-7ae2-409...|   CUST0189|      P005|    Keyboard|       4|   60|        240|2024-02-14|
|120301c6-1f55-4be...|   CUST0