# Change Data Capture (CDC) with Apache Hudi and PySpark
This notebook demonstrates a simple Change Data Capture (CDC) workflow using Apache Hudi with PySpark. We will create a Hudi table, insert initial data, upsert a batch of changes, and then read the changed records back using Hudi's incremental query type.

In [None]:
from pyspark.sql import SparkSession

# Build (or reuse) the Spark session. The configuration switches the serializer
# to Kryo and registers Hudi's SQL session extension and catalog implementation.
spark = (
    SparkSession.builder
    .appName("SimpleHudiCreate")
    .config("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    .config("spark.sql.extensions", "org.apache.spark.sql.hudi.HoodieSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.hudi.catalog.HoodieCatalog")
    .getOrCreate()
)

In [None]:
import os

# Root directory under which the Hudi table is stored. Defaults to the original
# notebook location, but can be overridden by setting the HUDI_BASE_PATH
# environment variable so the notebook is not tied to one machine's layout.
base_path = os.environ.get("HUDI_BASE_PATH", "/home/jovyan/hudi")

# Name of the Hudi table; also used as the directory name under base_path.
table_name = "hudi_cdc_table"

# Full filesystem path that every read/write in this notebook targets.
table_path = os.path.join(base_path, table_name)

In [None]:
from pyspark.sql import Row

# Seed records: three rows that all start with the same `ts` value of 1000.
data = [
    Row(id=record_id, name=person, ts=1000)
    for record_id, person in [(1, "Alice"), (2, "Bob"), (3, "Charlie")]
]

df = spark.createDataFrame(data)

# Hudi writer settings for the first load: `id` is the record key, `ts` is the
# precombine field, and the rows are written with the "insert" operation.
initial_write_options = {
    "hoodie.table.name": table_name,
    "hoodie.datasource.write.recordkey.field": "id",
    "hoodie.datasource.write.precombine.field": "ts",
    "hoodie.datasource.write.operation": "insert",
}

# "overwrite" mode replaces whatever already exists at table_path.
(
    df.write.format("hudi")
    .options(**initial_write_options)
    .mode("overwrite")
    .save(table_path)
)

In [None]:
# Change batch: id=2 already exists in the table (its name changes), while
# id=4 is a new record. Both carry a newer `ts` value of 2000.
update_data = [
    Row(id=2, name="Bob Updated", ts=2000),
    Row(id=4, name="Daisy", ts=2000),
]

update_df = spark.createDataFrame(update_data)

# Same key / precombine settings as the initial load, but written with the
# "upsert" operation.
upsert_write_options = {
    "hoodie.table.name": table_name,
    "hoodie.datasource.write.recordkey.field": "id",
    "hoodie.datasource.write.precombine.field": "ts",
    "hoodie.datasource.write.operation": "upsert",
}

# "append" mode adds this write to the existing table at table_path.
(
    update_df.write.format("hudi")
    .options(**upsert_write_options)
    .mode("append")
    .save(table_path)
)

In [6]:
# List the distinct commit times carried by the records currently in the table,
# newest first. Only commits that still own at least one record show up here.
commits = (
    spark.read.format("hudi")
    .load(table_path)
    .select("_hoodie_commit_time")
    .distinct()
    .orderBy("_hoodie_commit_time", ascending=False)
)

commit_times = [row["_hoodie_commit_time"] for row in commits.collect()]

# Start the incremental read at the second-latest commit. When fewer than two
# commit times are visible (e.g. the upsert cell has not been run, or the latest
# commit rewrote every record) fall back to "000", which sorts before any real
# instant, instead of failing with an IndexError.
# NOTE(review): whether the begin instant is treated as inclusive or exclusive
# appears to depend on the Hudi version -- the captured output of this cell
# includes rows from the earlier commit. Confirm against the installed version.
begin_time = commit_times[1] if len(commit_times) > 1 else "000"

# Read incrementally, starting from begin_time.
incremental_df = (
    spark.read.format("hudi")
    .option("hoodie.datasource.query.type", "incremental")
    .option("hoodie.datasource.read.begin.instanttime", begin_time)
    .load(table_path)
)

incremental_df.show()

+-------------------+--------------------+------------------+----------------------+--------------------+---+-----------+----+
|_hoodie_commit_time|_hoodie_commit_seqno|_hoodie_record_key|_hoodie_partition_path|   _hoodie_file_name| id|       name|  ts|
+-------------------+--------------------+------------------+----------------------+--------------------+---+-----------+----+
|  20250714112815243|20250714112815243...|                 1|                      |a25a07da-58a5-419...|  1|      Alice|1000|
|  20250714112900617|20250714112900617...|                 2|                      |a25a07da-58a5-419...|  2|Bob Updated|2000|
|  20250714112815243|20250714112815243...|                 3|                      |a25a07da-58a5-419...|  3|    Charlie|1000|
|  20250714112900617|20250714112900617...|                 4|                      |a25a07da-58a5-419...|  4|      Daisy|2000|
+-------------------+--------------------+------------------+----------------------+--------------------+---+--