# CDC Data Processing with Spark

This notebook demonstrates how to process Change Data Capture (CDC) events using Apache Spark.

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Build (or reuse, via getOrCreate) the SparkSession shared by every cell below.
# The streaming checkpoint location is set once here as a session-level config.
spark = (
    SparkSession.builder
    .appName("CDC_Processing_Demo")
    .config("spark.sql.streaming.checkpointLocation", "/tmp/checkpoints")
    .getOrCreate()
)

# Report which Spark build and master this kernel is attached to.
print(f"Spark version: {spark.version}")
print(f"Spark master: {spark.sparkContext.master}")

In [None]:
# Smoke test: build a tiny in-memory DataFrame and display it to confirm
# that the session can create and show data.
columns = ["id", "name", "age"]
data = [
    (1, "Alice", 25),
    (2, "Bob", 30),
    (3, "Charlie", 35),
]

df = spark.createDataFrame(data, schema=columns)
df.show()

In [None]:
# Read CDC data from Kafka (if available)
#
# Defines a streaming DataFrame over the "ecommerce.public.customers" topic on
# broker kafka1:9092, starting from the latest offsets. No streaming query is
# started in this cell; it only builds the source definition and prints its schema.
#
# This is a best-effort step: any failure is reported and the notebook carries on.
# NOTE(review): a failure here may be caused by a missing Kafka connector package
# rather than an unreachable broker -- check the printed exception text.

# Bind the name up front so that a failed attempt leaves `kafka_df` as None
# instead of undefined (fresh kernel) or pointing at a stale stream from an
# earlier run of this cell. Later cells can test `kafka_df is not None`.
kafka_df = None
try:
    kafka_df = (
        spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "kafka1:9092")
        .option("subscribe", "ecommerce.public.customers")
        .option("startingOffsets", "latest")
        .load()
    )

    print("Kafka stream created successfully!")
    print(kafka_df.schema)
except Exception as e:
    # Deliberately broad: Kafka is optional for this demo, so report and move on.
    print(f"Could not connect to Kafka: {e}")

In [None]:
# Sample CDC data transformation
#
# Three hand-written change events as (value, key) pairs: a create ("c"),
# an update ("u") and a delete ("d"). The create/update events carry an
# "after" image; the delete event carries only a "before" image.
sample_cdc_data = [
    (
        '{"op":"c","after":{"id":1,"name":"John","email":"john@email.com"}}',
        'customer.1',
    ),
    (
        '{"op":"u","after":{"id":1,"name":"John Doe","email":"john.doe@email.com"}}',
        'customer.1',
    ),
    (
        '{"op":"d","before":{"id":2,"name":"Jane","email":"jane@email.com"}}',
        'customer.2',
    ),
]

cdc_df = spark.createDataFrame(sample_cdc_data, schema=["value", "key"])

# Keep the key and pull three pieces out of the JSON payload by JSONPath:
# the operation code plus the raw "after" / "before" images.
parsed_df = cdc_df.select(
    "key",
    *[
        get_json_object(col("value"), json_path).alias(column_name)
        for column_name, json_path in (
            ("operation", "$.op"),
            ("after_data", "$.after"),
            ("before_data", "$.before"),
        )
    ]
)

parsed_df.show(truncate=False)

In [None]:
# Stop Spark session
# The call below is disabled -- presumably so the session stays usable while
# cells are re-run. Uncomment and run it once all work in the notebook is done.
# spark.stop()