In [1]:
# Create the Spark Session
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import from_json, col, to_timestamp, window, expr, sum
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

import pyspark

# Maven coordinate of the Structured Streaming Kafka source. The connector
# version is taken from the installed PySpark so it always matches the Spark
# runtime instead of drifting from a hard-coded release number.
# NOTE(review): the `_2.12` suffix assumes a Scala 2.12 build of Spark --
# confirm against the image in use.
KAFKA_CONNECTOR_PACKAGE = (
    f"org.apache.spark:spark-sql-kafka-0-10_2.12:{pyspark.__version__}"
)

spark = (
    SparkSession.builder
    .appName("Streaming from Kafka with tumbling window")
    .config("spark.streaming.stopGracefullyOnShutdown", True)
    # Only the spark-sql Kafka connector is requested: this notebook reads
    # Kafka exclusively through `format("kafka")`, so the DStreams
    # `spark-streaming-kafka-0-10-assembly` artifact is not needed.
    .config("spark.jars.packages", KAFKA_CONNECTOR_PACKAGE)
    .master("local[*]")
    .getOrCreate()
)
# Display the session summary as the cell output.
spark

In [2]:
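# To publish test messages to the `trades` topic, run this console producer in a terminal (not in the notebook):
# docker exec -it kafka kafka-console-producer --bootstrap-server localhost:9092 --topic trades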

In [3]:
# Streaming pipeline: read trade events from the Kafka topic `trades`, total the
# BUY and SELL amounts per 15-minute tumbling window on CreatedTime, and print
# the updated windows to the console once a minute.

kafka_df = (
    spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "trades")
    .option("startingOffsets", "earliest")
    .load()
)

# Fields extracted from the JSON document carried in each Kafka message value.
stock_schema = StructType([
    StructField("CreatedTime", StringType()),
    StructField("Type", StringType()),
    StructField("Amount", IntegerType()),
    StructField("BrokerCode", StringType()),
])

# The Kafka `value` column is cast to string and parsed with the schema above.
value_df = kafka_df.select(
    from_json(col("value").cast("string"), stock_schema).alias("value")
)

# Flatten the parsed struct, convert CreatedTime to a timestamp and split
# Amount into separate Buy / Sell columns so each side can be summed on its own.
trade_df = (
    value_df
    .select("value.*")
    .withColumn("CreatedTime", to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Buy", expr("case when Type == 'BUY' then Amount else 0 end"))
    .withColumn("Sell", expr("case when Type == 'SELL' then Amount else 0 end"))
)

# `sum` here is pyspark.sql.functions.sum (imported at the top of the notebook),
# not the Python builtin.
# NOTE(review): no watermark is defined on CreatedTime; consider adding
# `withWatermark` if state for old windows must be dropped -- confirm the
# acceptable lateness first.
window_agg_df = (
    trade_df
    # Add col("BrokerCode") to the grouping to get per-broker totals.
    .groupBy(window(col("CreatedTime"), "15 minute"))
    .agg(
        sum("Buy").alias("TotalBuy"),
        sum("Sell").alias("TotalSell"),
    )
)

output_df = window_agg_df.select("window.start", "window.end", "TotalBuy", "TotalSell")

window_query = (
    output_df
    .writeStream
    .format("console")
    .outputMode("update")
    .option("checkpointLocation", "chk-point-dir")
    .trigger(processingTime="1 minute")
    .start()
)

# awaitTermination() blocks until the query ends, so in a notebook the cell is
# normally ended by interrupting the kernel. Always stop the query afterwards;
# otherwise it keeps running in the JVM after the cell has returned.
try:
    window_query.awaitTermination()
except KeyboardInterrupt:
    # Interrupting the kernel is the expected way to end this cell.
    pass
finally:
    window_query.stop()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
               ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/usr/local/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
                          ^^^^^^^^^^^^^^^^^^^^^^
  File "/opt/conda/lib/python3.11/socket.py", line 706, in readinto
    return self._sock.recv_into(b)
           ^^^^^^^^^^^^^^^^^^^^^^^
KeyboardInterrupt


KeyboardInterrupt: 

In [4]:
# Batch variant of the pipeline: read the `trades` topic once with `spark.read`,
# aggregate per 15-minute window, then add running totals across the windows.
kafka_df = (
    spark.read
    .format("kafka")
    .option("kafka.bootstrap.servers", "kafka:9092")
    .option("subscribe", "trades")
    .option("startingOffsets", "earliest")
    .load()
)

# Schema of the JSON document stored in each Kafka message value.
stock_schema = StructType([
    StructField(field_name, field_type)
    for field_name, field_type in (
        ("CreatedTime", StringType()),
        ("Type", StringType()),
        ("Amount", IntegerType()),
        ("BrokerCode", StringType()),
    )
])

# Parse the message value (cast to string) into a struct column named `value`.
value_df = kafka_df.select(
    from_json(col("value").cast("string"), stock_schema).alias("value")
)

# Expand the struct, convert CreatedTime to a timestamp, and derive the
# Buy / Sell amount columns from Type.
trade_df = (
    value_df
    .select("value.*")
    .withColumn("CreatedTime", to_timestamp(col("CreatedTime"), "yyyy-MM-dd HH:mm:ss"))
    .withColumn("Buy", expr("case when Type == 'BUY' then Amount else 0 end"))
    .withColumn("Sell", expr("case when Type == 'SELL' then Amount else 0 end"))
)

# Totals per 15-minute window on CreatedTime
# (col("BrokerCode") could be added to the grouping for per-broker totals).
window_agg_df = (
    trade_df
    .groupBy(window(col("CreatedTime"), "15 minute"))
    .agg(
        sum("Buy").alias("TotalBuy"),
        sum("Sell").alias("TotalSell"),
    )
)

output_df = window_agg_df.select("window.start", "window.end", "TotalBuy", "TotalSell")

# Frame for the running totals: rows ordered by window end, from the first row
# up to and including the current one. There is no partitionBy, so the frame
# spans all rows of output_df.
running_total_window = (
    Window
    .orderBy("end")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

# Cumulative Buy / Sell totals and their difference per window.
final_output_df = (
    output_df
    .withColumn("RTotalBuy", sum("TotalBuy").over(running_total_window))
    .withColumn("RTotalSell", sum("TotalSell").over(running_total_window))
    .withColumn("NetValue", expr("RTotalBuy - RTotalSell"))
)

final_output_df.show(truncate=False)

+-------------------+-------------------+--------+---------+---------+----------+--------+
|start              |end                |TotalBuy|TotalSell|RTotalBuy|RTotalSell|NetValue|
+-------------------+-------------------+--------+---------+---------+----------+--------+
|2019-02-05 10:00:00|2019-02-05 10:15:00|800     |0        |800      |0         |800     |
|2019-02-05 10:15:00|2019-02-05 10:30:00|800     |400      |1600     |400       |1200    |
|2019-02-05 10:30:00|2019-02-05 10:45:00|900     |0        |2500     |400       |2100    |
|2019-02-05 10:45:00|2019-02-05 11:00:00|0       |600      |2500     |1000      |1500    |
+-------------------+-------------------+--------+---------+---------+----------+--------+

