In [1]:
%load_ext autoreload
%autoreload 2

import sys

sys.path.append("..")

In [4]:
from cdc_pipelines.common.config_loader import ConfigLoader
from cdc_pipelines.common.spark_session import get_spark_session
from cdc_pipelines.common.logger import setup_logging, PipelineLogger
from cdc_pipelines.pipelines.bronze.kafka_reader import KafkaReader
from cdc_pipelines.pipelines.bronze.bronze_writer import BronzeWriter

In [None]:
base_dir = "/home/longnguyen/cdc-data-pipeline/config/pipeline_config"

config_loader = ConfigLoader(base_dir, environment="dev")
config = config_loader.load_config()

In [None]:
# Setup logging
logging_config = config.get("logging", {})
setup_logging(
    log_level=logging_config.get("level", "INFO"),
    log_path=f"{logging_config.get('path', 'logs')}/bronze/bronze_pipeline.log",
)

logger = PipelineLogger(__name__, "bronze", "bronze")
logger.info("Starting bronze layer pipeline")

# Create Spark session
spark = get_spark_session(config)

# Initialize components
kafka_reader = KafkaReader(spark, config)

# Get bronze configuration
bronze_config = config.get("bronze", {})
trigger_interval = bronze_config.get("trigger_interval", "10 seconds")


2025-12-24 15:59:47,669 - __main__ - INFO - [BRONZE] Starting bronze layer pipeline
2025-12-24 15:59:47,671 - cdc_pipelines.common.spark_session - INFO - Creating new Spark session


:: loading settings :: url = jar:file:/home/longnguyen/miniconda3/envs/spark_env/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/longnguyen/.ivy2.5.2/cache
The jars for the packages stored in: /home/longnguyen/.ivy2.5.2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
io.delta#delta-spark_2.13 added as a dependency
com.google.cloud.bigdataoss#gcs-connector added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-c99c2df9-3fcb-4b0b-9323-9ae8f093e2b0;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;4.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;4.0.0 in central
	found org.apache.kafka#kafka-clients;3.9.0 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.7 in central
	found org.slf4j#slf4j-api;2.0.16 in central
	found org.apache.hadoop#hadoop-client-runti

2025-12-24 16:00:03,290 - cdc_pipelines.common.spark_session - INFO - Spark session created: cdc-streaming-pipeline-dev
2025-12-24 16:00:03,294 - cdc_pipelines.common.spark_session - INFO - Spark version: 4.0.0


In [None]:
table_name = "products"  # for default

# Determine which tables to process
tables_to_process = {}
if table_name:
    # Process single table
    table_config = config_loader.get_table_config(table_name)
    tables_to_process[table_name] = table_config
else:
    # Process all tables
    tables_to_process = config.get("tables", {})

In [9]:
# Test process one table
tbl_name, tbl_config = list(tables_to_process.items())[0]

logger.info(f"Starting pipeline for table: {tbl_name}")

# Get topic for this table
topic = tbl_config.get("source_topic")
if not topic:
    logger.error(f"No source topic configured for table: {tbl_name}")

# Read from Kafka
logger.info(f"Reading from Kafka topic: {topic}")
bronze_df = kafka_reader.read_stream([topic])

# Write to Delta
writer = BronzeWriter(config, tbl_config)
logger.info(f"Starting stream write for table: {tbl_name}")
writer.write_stream(bronze_df, tbl_name, trigger_interval)

2025-12-24 16:00:07,407 - __main__ - INFO - [BRONZE] Starting pipeline for table: products
2025-12-24 16:00:07,409 - __main__ - INFO - [BRONZE] Reading from Kafka topic: cdc-pipeline.public.products
2025-12-24 16:00:07,411 - cdc_pipelines.pipelines.bronze.kafka_reader - INFO - Reading from Kafka topics: ['cdc-pipeline.public.products']
2025-12-24 16:00:07,411 - cdc_pipelines.pipelines.bronze.kafka_reader - INFO - Bootstrap servers: 10.0.0.2:9092


2025-12-24 16:00:08,884 - cdc_pipelines.pipelines.bronze.kafka_reader - INFO - Successfully connected to Kafka stream
2025-12-24 16:00:08,886 - __main__ - INFO - [BRONZE] Starting stream write for table: products
2025-12-24 16:00:08,887 - cdc_pipelines.pipelines.bronze.bronze_writer - INFO - Writing stream to bronze table: products
2025-12-24 16:00:08,888 - cdc_pipelines.pipelines.bronze.bronze_writer - INFO - Table path: gs://cdc-pipeline-data/uat/bronze/products
2025-12-24 16:00:08,888 - cdc_pipelines.pipelines.bronze.bronze_writer - INFO - Checkpoint location: gs://cdc-pipeline-data/uat/bronze/checkpoints/products


25/12/24 16:00:11 WARN ResolveWriteToStream: spark.sql.adaptive.enabled is not supported in streaming DataFrames/Datasets and will be disabled.


2025-12-24 16:00:13,380 - cdc_pipelines.pipelines.bronze.bronze_writer - INFO - Stream started for table: products
2025-12-24 16:00:13,385 - cdc_pipelines.pipelines.bronze.bronze_writer - INFO - Query ID: 8854b99a-c42c-4755-938b-fe85c89edf71


25/12/24 16:00:31 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000} milliseconds, but spent 17704 milliseconds


2025-12-24 16:00:48,453 - root - ERROR - KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/home/longnguyen/miniconda3/envs/spark_env/lib/python3.10/site-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/home/longnguyen/miniconda3/envs/spark_env/lib/python3.10/site-packages/py4j/clientserver.py", line 535, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/home/longnguyen/miniconda3/envs/spark_env/lib/python3.10/socket.py", line 717, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt
2025-12-24 16:00:48,456 - py4j.java_gateway - INFO - Search for sockets that match local addr ('127.0.0.1', 45030) and remote addr ('127.0.0.1', 36557)
2025-12-24 16:00:48,457 - py4j.java_gateway - INFO - Shutting down matched socket <socket.socket fd=84, family=AddressFamily.AF_INET, type=SocketKind.SOCK_STREAM, proto=0, laddr=('127.0.0.1', 45042), raddr=('127.0.0.

KeyboardInterrupt: 

In [10]:

bronze_path = config['delta']['bronze_path']
product_path = bronze_path + "/products"

# Read the product table from bronze layer
print("Reading products from bronze layer...")
df_products = spark.read.format("delta").load(product_path)

print(f"Total records: {df_products.count()}")
print("\nSchema:")
df_products.printSchema()


Reading products from bronze layer...
2025-12-24 16:00:56,919 - py4j.clientserver - INFO - Error while sending or receiving.
Traceback (most recent call last):
  File "/home/longnguyen/miniconda3/envs/spark_env/lib/python3.10/site-packages/py4j/clientserver.py", line 527, in send_command
    self.socket.sendall(command.encode("utf-8"))
BrokenPipeError: [Errno 32] Broken pipe
2025-12-24 16:00:56,920 - py4j.clientserver - INFO - Closing down clientserver connection
2025-12-24 16:00:56,922 - root - INFO - Exception while sending command.
Traceback (most recent call last):
  File "/home/longnguyen/miniconda3/envs/spark_env/lib/python3.10/site-packages/py4j/clientserver.py", line 527, in send_command
    self.socket.sendall(command.encode("utf-8"))
BrokenPipeError: [Errno 32] Broken pipe

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/home/longnguyen/miniconda3/envs/spark_env/lib/python3.10/site-packages/py4j/java_gateway.py"

25/12/24 16:00:58 WARN SparkStringUtils: Truncated the string representation of a plan since it was too large. This behavior can be adjusted by setting 'spark.sql.debug.maxToStringFields'.

Total records: 5

Schema:
root
 |-- json_payload: string (nullable = true)
 |-- topic: string (nullable = true)
 |-- partition: integer (nullable = true)
 |-- offset: long (nullable = true)
 |-- kafka_timestamp: timestamp (nullable = true)
 |-- ingestion_timestamp: timestamp (nullable = true)



                                                                                