## Load config

In [1]:
import sys
import os
from pathlib import Path

project_root = Path().absolute().parent
project_root

PosixPath('/home/jupyter/src-streaming/spark')

In [2]:
# Add src directory to Python path
src_path = project_root / "src"
sys.path.insert(0, str(src_path))

print(f"Project root: {project_root}")
print(f"Src path: {src_path}")
print(f"Src path exists: {src_path.exists()}")

Project root: /home/jupyter/src-streaming/spark
Src path: /home/jupyter/src-streaming/spark/src
Src path exists: True


In [3]:
from config.app_config import AppConfig

app_config = AppConfig()
app_config.spark

SparkConfig(app_name='Ecommerce CDC Processing', master='local[*]', shuffle_partitions=8, packages='org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,com.clickhouse:clickhouse-jdbc:0.6.0')

In [4]:
spark_configs = app_config.spark.get_spark_configs()
spark_configs

{'spark.streaming.stopGracefullyOnShutdown': True,
 'spark.jars.packages': 'org.apache.spark:spark-sql-kafka-0-10_2.12:3.5.0,com.clickhouse:clickhouse-jdbc:0.6.0',
 'spark.sql.shuffle.partitions': 8}

In [5]:
# Create the Spark Session
from pyspark.sql import SparkSession

builder = SparkSession.builder.appName(f'{app_config.spark.app_name} (products)')

for key, value in app_config.spark.get_spark_configs().items():
    builder = builder.config(key, value)

builder = builder.master(app_config.spark.master)
spark = builder.getOrCreate()

spark

## Read

In [6]:
print(app_config.kafka)

app_config.kafka.topics.get('orders')

KafkaConfig(bootstrap_servers='kafka1:9092', topics={'customers': 'pg.public.customers', 'orders': 'pg.public.orders', 'products': 'pg.public.products'})


'pg.public.orders'

In [7]:
kafka_stream = (
    spark
    .read
    .format('kafka')
    .option("kafka.bootstrap.servers", app_config.kafka.bootstrap_servers)
    .option("subscribe", app_config.kafka.topics.get('orders'))
    # .option("startingOffsets", "latest")
    .load()
)

kafka_stream.printSchema()

root
 |-- key: binary (nullable = true)
 |-- value: binary (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- timestamp: timestamp (nullable = true)
 |-- timestampType: integer (nullable = true)



### Apply schema

In [8]:
from pyspark.sql.functions import expr, col
from utils.helpers import decode_udf

In [9]:
# --- Parse binary raw message ---
kafka_json_df = (
    kafka_stream
    .withColumn('key_str', expr('cast(key as string)'))
    .withColumn('value_str', expr('cast(value as string)'))
)

In [10]:
kafka_json_df.show(3)

+--------------------+--------------------+----------------+---------+------+--------------------+-------------+--------+--------------------+
|                 key|               value|           topic|partition|offset|           timestamp|timestampType| key_str|           value_str|
+--------------------+--------------------+----------------+---------+------+--------------------+-------------+--------+--------------------+
|[7B 22 69 64 22 3...|[7B 22 62 65 66 6...|pg.public.orders|        0|     0|2025-08-19 08:57:...|            0|{"id":1}|{"before":null,"a...|
|[7B 22 69 64 22 3...|[7B 22 62 65 66 6...|pg.public.orders|        0|     1|2025-08-19 08:57:...|            0|{"id":2}|{"before":null,"a...|
|[7B 22 69 64 22 3...|[7B 22 62 65 66 6...|pg.public.orders|        0|     2|2025-08-19 08:57:...|            0|{"id":3}|{"before":null,"a...|
+--------------------+--------------------+----------------+---------+------+--------------------+-------------+--------+--------------------+

In [11]:
from pyspark.sql.types import StructType, IntegerType, StringType, LongType, StructField

# Schema for CDC value JSON (Debezium format)
key_schema = StructType([
    StructField("id", IntegerType(), True)
])


order_record_schema = StructType([
            StructField("id", IntegerType(), True),
            StructField("customer_id", IntegerType(), True),
            StructField("product_id", IntegerType(), True),
            StructField("quantity", IntegerType(), True),
            StructField("order_time", LongType(), True)
        ])
        

value_schema =  StructType([
            StructField("before", order_record_schema, True),
            StructField("after", order_record_schema, True),
            StructField("source", StructType([
                StructField("ts_ms", LongType(), True),
                StructField("schema", StringType(), True),
                StructField("table", StringType(), True)
            ]), True),
            StructField("op", StringType(), True),
            StructField("ts_ms", LongType(), True)
        ])
    

In [12]:
from pyspark.sql.functions import from_json

transformed_df = (
    kafka_json_df
    .withColumn("key_json", from_json(col("key_str"), key_schema))
    .withColumn('value_json', from_json(col('value_str'), value_schema))
    .drop('value', 'key')
)

In [13]:
transformed_df.show(2)

+----------------+---------+------+--------------------+-------------+--------+--------------------+--------+--------------------+
|           topic|partition|offset|           timestamp|timestampType| key_str|           value_str|key_json|          value_json|
+----------------+---------+------+--------------------+-------------+--------+--------------------+--------+--------------------+
|pg.public.orders|        0|     0|2025-08-19 08:57:...|            0|{"id":1}|{"before":null,"a...|     {1}|{null, {1, 5, 77,...|
|pg.public.orders|        0|     1|2025-08-19 08:57:...|            0|{"id":2}|{"before":null,"a...|     {2}|{null, {2, 5, 31,...|
+----------------+---------+------+--------------------+-------------+--------+--------------------+--------+--------------------+
only showing top 2 rows



### Handle CDC ops

In [15]:
from pyspark.sql.functions import when, lit

# --- Extract fields & handle CDC ops
# For create/update/read: Use 'after' + _version = ts_ms
# For delete: Insert with null fields or skip (for ReplacingMergeTree, insert with higher _version to replace)
cdc_df = transformed_df.select(
            # ID: after/before/key
            when(col("value_json.op").isin("c", "u", "r"), col("value_json.after.id"))
              .when(col("value_json.op") == "d", col("value_json.before.id"))
              .otherwise(col("key_json.id")).alias("id"),

            # Fields
            when(col("value_json.op").isin("c", "u", "r"), col("value_json.after.customer_id"))
              .otherwise(lit(None)).alias("customer_id"),

            when(col("value_json.op").isin("c", "u", "r"), col("value_json.after.product_id"))
              .otherwise(lit(None)).alias("product_id"),

            when(col("value_json.op").isin("c", "u", "r"), col("value_json.after.quantity"))
              .otherwise(lit(None)).alias("quantity"),

            when(col("value_json.op").isin("c", "u", "r"), col("value_json.after.order_time"))
              .otherwise(lit(None)).alias("order_time"),

            # Version & delete flag
            col("value_json.ts_ms").alias("_version"),
            when(col("value_json.op") == "d", lit(1)).otherwise(lit(0)).alias("_deleted")
        )

In [16]:
cdc_df.show()

+---+-----------+----------+--------+----------------+-------------+--------+
| id|customer_id|product_id|quantity|      order_time|     _version|_deleted|
+---+-----------+----------+--------+----------------+-------------+--------+
|  1|          5|        77|       2|1755331695560122|1755593845773|       0|
|  2|          5|        31|       4|1755331741489021|1755593845773|       0|
|  3|          6|        75|       1|1755331741493392|1755593845774|       0|
|  4|          2|        73|       2|1755331741497550|1755593845774|       0|
|  5|          4|        48|       2|1755331741501933|1755593845774|       0|
|  6|          8|       100|       4|1755331741506137|1755593845775|       0|
|  7|          1|         7|       4|1755331741510234|1755593845775|       0|
|  8|         10|        17|       2|1755331741514410|1755593845775|       0|
|  9|          7|        90|       3|1755331741518874|1755593845776|       0|
| 10|          9|        64|       5|1755331741522757|1755593845

## Write to Clickhouse

In [17]:
# ClickHouse connection properties
clickhouse_properties = app_config.clickhouse.connection_properties
print(clickhouse_properties)

{'user': 'default', 'password': 'clickhouse123', 'driver': 'com.clickhouse.jdbc.ClickHouseDriver'}


In [18]:
# --- Write to Console (for debugging) ---
print("\n📊 Writing to console for debugging:")
cdc_df.show(20, truncate=False)

# --- Write to ClickHouse ---
try:
    print("\n💾 Writing to ClickHouse...")
    
    # Write to ClickHouse table (ReplacingMergeTree with _version)
    (cdc_df
     .write
     .format("jdbc")
     .option("url", app_config.clickhouse.jdbc_url)
     .option("dbtable", "orders_cdc")  # Table name in ClickHouse
     .option("user", clickhouse_properties['user'])
     .option("password", clickhouse_properties['password'])
     .option("driver", clickhouse_properties['driver'])
     .mode("append")  # Always append for CDC
     .save()
    )
    
    print("✅ Data written to ClickHouse successfully!")
    
except Exception as e:
    print(f"❌ Error writing to ClickHouse: {e}")
    print("📝 Writing to console only...")
    cdc_df.show()


📊 Writing to console for debugging:
+---+-----------+----------+--------+----------------+-------------+--------+
|id |customer_id|product_id|quantity|order_time      |_version     |_deleted|
+---+-----------+----------+--------+----------------+-------------+--------+
|1  |5          |77        |2       |1755331695560122|1755593845773|0       |
|2  |5          |31        |4       |1755331741489021|1755593845773|0       |
|3  |6          |75        |1       |1755331741493392|1755593845774|0       |
|4  |2          |73        |2       |1755331741497550|1755593845774|0       |
|5  |4          |48        |2       |1755331741501933|1755593845774|0       |
|6  |8          |100       |4       |1755331741506137|1755593845775|0       |
|7  |1          |7         |4       |1755331741510234|1755593845775|0       |
|8  |10         |17        |2       |1755331741514410|1755593845775|0       |
|9  |7          |90        |3       |1755331741518874|1755593845776|0       |
|10 |9          |64        