In [1]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, window, count, avg, sum, to_date, to_timestamp

# Build (or reuse) the Spark session for this notebook: YARN in client deploy
# mode, 4 shuffle partitions, and a Hadoop-type Iceberg catalog registered
# under the name "iceberg" with its warehouse in GCS.
spark = (
    SparkSession.builder
    .appName("Streaming Aggregation")
    .master("yarn")
    .config("spark.submit.deployMode", "client")
    .config("spark.sql.shuffle.partitions", "4")
    .config("spark.sql.catalog.iceberg", "org.apache.iceberg.spark.SparkCatalog")
    .config("spark.sql.catalog.iceberg.type", "hadoop")
    .config("spark.sql.catalog.iceberg.warehouse", "gs://gks-tpch/iceberg_warehouse")
    .getOrCreate()
)

# GCS directory that the streaming reader watches for incoming order files.
streaming_path = "gs://tpch-source/orders-partittions2/"
# Not referenced by any active code in the visible cells (only by a
# commented-out .start(output_path) call).
output_path = "gs://tpch-source/orders-aggregates/"



Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/02/06 12:15:11 INFO SparkEnv: Registering MapOutputTracker
25/02/06 12:15:11 INFO SparkEnv: Registering BlockManagerMaster
25/02/06 12:15:11 INFO SparkEnv: Registering BlockManagerMasterHeartbeat
25/02/06 12:15:11 INFO SparkEnv: Registering OutputCommitCoordinator


In [2]:

# Explicit DDL schema for the order CSV files. o_orderdate is declared as a
# plain string here; it is parsed into a timestamp in a later cell.
schema = """
    o_orderkey INT,
    o_custkey INT,
    o_orderstatus STRING,
    o_totalprice DOUBLE,
    o_orderdate STRING,
    o_orderpriority STRING,
    o_clerk STRING,
    o_shippriority INT,
    o_comment STRING
"""

# Streaming CSV source: the first line of each file is treated as a header and
# the schema above is applied instead of being inferred.
orders_reader = spark.readStream.option("header", "true").schema(schema)
df_stream = orders_reader.csv(streaming_path)

df_stream.printSchema()

root
 |-- o_orderkey: integer (nullable = true)
 |-- o_custkey: integer (nullable = true)
 |-- o_orderstatus: string (nullable = true)
 |-- o_totalprice: double (nullable = true)
 |-- o_orderdate: string (nullable = true)
 |-- o_orderpriority: string (nullable = true)
 |-- o_clerk: string (nullable = true)
 |-- o_shippriority: integer (nullable = true)
 |-- o_comment: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)



In [3]:

# Parse the o_orderdate strings (pattern "yyyy-MM-dd") into a timestamp column,
# replacing the original string column of the same name.
# NOTE: df_stream is rebound to the converted frame, so re-running this cell on
# its own applies the parse to the already-converted column.
order_ts = to_timestamp(col("o_orderdate"), "yyyy-MM-dd")
df_stream = df_stream.withColumn("o_orderdate", order_ts)

df_stream.printSchema()

root
 |-- o_orderkey: integer (nullable = true)
 |-- o_custkey: integer (nullable = true)
 |-- o_orderstatus: string (nullable = true)
 |-- o_totalprice: double (nullable = true)
 |-- o_orderdate: timestamp (nullable = true)
 |-- o_orderpriority: string (nullable = true)
 |-- o_clerk: string (nullable = true)
 |-- o_shippriority: integer (nullable = true)
 |-- o_comment: string (nullable = true)
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)



In [4]:

# Aggregate orders per customer over 7-day windows of o_orderdate.
# The output columns are called month_start / month_end, but they hold the
# start and end of each 7-day window, not calendar-month boundaries.
# No watermark is applied to the stream in this cell.
weekly_window = window(col("o_orderdate"), "7 days")

order_metrics = [
    count("o_orderkey").alias("total_orders"),
    avg("o_totalprice").alias("avg_totalprice"),
    # `sum` here is pyspark.sql.functions.sum (imported in the first cell),
    # not Python's builtin.
    sum("o_totalprice").alias("sum_totalprice"),
]

aggregated_df = (
    df_stream
    .groupBy(weekly_window, col("o_custkey"))
    .agg(*order_metrics)
    .select(
        col("window.start").alias("month_start"),
        col("window.end").alias("month_end"),
        col("o_custkey"),
        col("total_orders"),
        col("avg_totalprice"),
        col("sum_totalprice"),
    )
)

aggregated_df.printSchema()

root
 |-- month_start: timestamp (nullable = true)
 |-- month_end: timestamp (nullable = true)
 |-- o_custkey: integer (nullable = true)
 |-- total_orders: long (nullable = false)
 |-- avg_totalprice: double (nullable = true)
 |-- sum_totalprice: double (nullable = true)



In [8]:

# Start the streaming write of the aggregates into the Iceberg table
# iceberg.tpch.aggregated_orders.
# - "complete" output mode: the full aggregate result is emitted on every trigger.
# - A micro-batch is triggered every 5 minutes.
# - Progress is tracked in the GCS checkpoint location below.
# The StreamingQuery handle returned by toTable() is kept in a variable so that
# later cells can inspect it (aggregation_query.status, .lastProgress) or shut
# it down (aggregation_query.stop()).
aggregation_query = (
    aggregated_df.writeStream
    .format("iceberg")
    .outputMode("complete")
    .option("checkpointLocation", "gs://tpch-source/checkpoints/gks-aggregated_orders/")
    .trigger(processingTime="5 minutes")
    .toTable("iceberg.tpch.aggregated_orders")
)

# Display the handle as the cell output.
aggregation_query

25/02/06 12:21:39 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


<pyspark.sql.streaming.query.StreamingQuery at 0x7fd78b12b150>

25/02/06 12:21:48 WARN FileStreamSource: Listed 185 file(s) in 5496 ms          
25/02/06 12:25:06 WARN FileStreamSource: Listed 224 file(s) in 6205 ms          
25/02/06 12:30:08 WARN FileStreamSource: Listed 284 file(s) in 8234 ms          
25/02/06 12:35:09 WARN FileStreamSource: Listed 343 file(s) in 9538 ms          
25/02/06 12:40:10 WARN FileStreamSource: Listed 403 file(s) in 10886 ms         
                                                                                