In [1]:
# Enable IPython's autoreload extension (mode 2) so that edits to imported
# project modules are picked up live, without restarting the kernel.
%load_ext autoreload
%autoreload 2

import sys
from pathlib import Path

# Make the parent of the current working directory importable.
# The entry is resolved to an absolute path so imports keep working even if the
# working directory changes later, and it is only added once so re-running this
# cell does not pile up duplicate sys.path entries.
_parent_dir = str(Path("..").resolve())
if _parent_dir not in sys.path:
    sys.path.append(_parent_dir)

In [2]:
from cdc_pipelines.common.config_loader import ConfigLoader
from cdc_pipelines.common.spark_session import get_spark_session
from cdc_pipelines.common.logger import get_logger
from cdc_pipelines.pipelines.silver.bronze_reader import BronzeReader
from cdc_pipelines.pipelines.silver.silver_writer import SCD2Writer, AppendOnlyWriter


from cdc_pipelines.pipelines.silver.silver_pipeline import SilverPipeline

In [3]:
# Send INFO-and-above log records to a stream handler so that pipeline logs
# show up in the notebook output.
import logging

_notebook_handler = logging.StreamHandler()

logging.basicConfig(
    # force=True discards handlers already attached to the root logger, so
    # re-running this cell does not duplicate every log line.
    force=True,
    handlers=[_notebook_handler],
    format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
    level=logging.INFO,
)

In [4]:
import os

# Target environment and pipeline config location.
# Both can be overridden without editing the notebook by exporting
# CDC_PIPELINE_ENV / CDC_PIPELINE_CONFIG_PATH before starting the kernel; the
# defaults reproduce the values this notebook was originally run with.
# NOTE: with env == 'prod' the cells below read from (and, per the logged
# output, load into) the production tables - switch the environment for
# experiments.
env = os.environ.get("CDC_PIPELINE_ENV", "prod")
config_path = os.environ.get(
    "CDC_PIPELINE_CONFIG_PATH",
    "/home/longnguyen/cdc-data-pipeline/config/pipeline_config",
)

# Constructing the pipeline also creates the Spark session (see the log output
# of this cell), so this cell can take a while on a fresh kernel.
pipeline = SilverPipeline(config_path, environment=env)

2025-12-27 08:00:24,290 - cdc_pipelines.pipelines.silver.silver_pipeline - INFO - [SILVER] Starting silver layer pipeline
2025-12-27 08:00:24,294 - cdc_pipelines.common.spark_session - INFO - Creating new Spark session


:: loading settings :: url = jar:file:/home/longnguyen/miniconda3/envs/spark_env/lib/python3.10/site-packages/pyspark/jars/ivy-2.5.3.jar!/org/apache/ivy/core/settings/ivysettings.xml
Ivy Default Cache set to: /home/longnguyen/.ivy2.5.2/cache
The jars for the packages stored in: /home/longnguyen/.ivy2.5.2/jars
org.apache.spark#spark-sql-kafka-0-10_2.13 added as a dependency
io.delta#delta-spark_2.13 added as a dependency
com.google.cloud.bigdataoss#gcs-connector added as a dependency
:: resolving dependencies :: org.apache.spark#spark-submit-parent-bbb7bef1-6737-4c03-8eb2-6dd413f8e22b;1.0
	confs: [default]
	found org.apache.spark#spark-sql-kafka-0-10_2.13;4.0.0 in central
	found org.apache.spark#spark-token-provider-kafka-0-10_2.13;4.0.0 in central
	found org.apache.kafka#kafka-clients;3.9.0 in central
	found org.lz4#lz4-java;1.8.0 in central
	found org.xerial.snappy#snappy-java;1.1.10.7 in central
	found org.slf4j#slf4j-api;2.0.16 in central
	found org.apache.hadoop#hadoop-client-runti

In [5]:
# Print the configuration of every table declared in the pipeline config.
# Nothing is processed here; this is only an overview of what is configured.
configured_tables = pipeline.config["tables"]
for table_name, table_config in configured_tables.items():
    summary_line = f"Processing table: {table_name} with config: {table_config}"
    print(summary_line)

Processing table: products with config: {'source_topic': 'cdc-pipeline.instacart.products', 'bronze_table': 'bronze.products', 'silver_table': 'silver.products', 'table_type': 'dimension', 'business_keys': ['product_id'], 'surrogate_key': 'product_sk', 'attribute_columns': ['product_name', 'aisle_id', 'department_id']}
Processing table: aisles with config: {'source_topic': 'cdc-pipeline.instacart.aisles', 'bronze_table': 'bronze.aisles', 'table_type': 'dimension', 'business_keys': ['aisle_id'], 'surrogate_key': 'aisle_sk', 'attribute_columns': ['aisle']}
Processing table: departments with config: {'source_topic': 'cdc-pipeline.instacart.departments', 'bronze_table': 'bronze.departments', 'table_type': 'dimension', 'business_keys': ['department_id'], 'surrogate_key': 'department_sk', 'attribute_columns': ['department']}
Processing table: orders with config: {'source_topic': 'cdc-pipeline.instacart.orders', 'bronze_table': 'bronze.orders', 'silver_table': 'silver.orders', 'table_type': '

In [None]:
# Run the silver dimension-table step for a single table: products.
table_name = "products"
configured_tables = pipeline.config["tables"]
table_config = configured_tables[table_name]

# Show the exact configuration the run below is going to use.
display(table_config)

# Hand the table over to the pipeline; progress is reported through the log
# records emitted underneath this cell.
pipeline.process_dimension_table(table_name, table_config)

{'source_topic': 'cdc-pipeline.instacart.products',
 'bronze_table': 'bronze.products',
 'silver_table': 'silver.products',
 'table_type': 'dimension',
 'business_keys': ['product_id'],
 'surrogate_key': 'product_sk',
 'attribute_columns': ['product_name', 'aisle_id', 'department_id']}

2025-12-27 08:05:44,547 - cdc_pipelines.pipelines.silver.silver_pipeline - INFO - [SILVER] Processing dimension table: products
2025-12-27 08:05:44,549 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - Reading and preparing bronze data for table: products (batch_mode=False)
2025-12-27 08:05:44,550 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - Reading bronze stream from: gs://cdc-pipeline-data/prod/bronze/products
2025-12-27 08:05:44,633 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - Bronze stream loaded for table: products
2025-12-27 08:05:44,634 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - Parsing JSON payload for table: products
2025-12-27 08:05:44,790 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - JSON payload parsed for table: products
2025-12-27 08:05:44,792 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - Applying watermark on cdc_timestamp with threshold: 1 hour
2025-12-27 08:05:44,800 - cdc_pipelines.pipelines.silver.bronze_re

2025-12-27 08:05:50,073 - py4j.clientserver - INFO - Python Server ready to receive messages
2025-12-27 08:05:50,076 - py4j.clientserver - INFO - Received command c on object id p1
2025-12-27 08:05:50,080 - cdc_pipelines.pipelines.silver.silver_writer - INFO - Processing batch 0 for table: products
2025-12-27 08:05:50,362 - cdc_pipelines.pipelines.silver.silver_writer - INFO - Performing initial load for table: products
2025-12-27 08:05:59,414 - cdc_pipelines.pipelines.silver.silver_writer - INFO - Initial load completed for table: products
2025-12-27 08:05:59,416 - cdc_pipelines.pipelines.silver.silver_writer - INFO - Batch 0 processed successfully for products
25/12/27 08:05:59 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000} milliseconds, but spent 13698 milliseconds


In [9]:
# Run the silver fact-table step for a single table: orders.
table_name = "orders"
configured_tables = pipeline.config["tables"]
table_config = configured_tables[table_name]

# Show the exact configuration the run below is going to use.
display(table_config)

# Hand the table over to the pipeline; progress is reported through the log
# records emitted underneath this cell.
pipeline.process_fact_table(table_name, table_config)

{'source_topic': 'cdc-pipeline.instacart.orders',
 'bronze_table': 'bronze.orders',
 'silver_table': 'silver.orders',
 'table_type': 'fact',
 'primary_keys': ['order_id'],
 'attribute_columns': ['user_id',
  'order_number',
  'order_dow',
  'order_hour_of_day',
  'days_since_prior_order']}

2025-12-27 07:16:23,748 - cdc_pipelines.pipelines.silver.silver_pipeline - INFO - [SILVER] Processing fact table: orders
2025-12-27 07:16:23,749 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - Reading and preparing bronze data for table: orders (batch_mode=False)
2025-12-27 07:16:23,751 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - Reading bronze stream from: gs://cdc-pipeline-data/prod/bronze/orders
2025-12-27 07:16:23,966 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - Bronze stream loaded for table: orders
2025-12-27 07:16:23,968 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - Parsing JSON payload for table: orders
2025-12-27 07:16:24,134 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - JSON payload parsed for table: orders
2025-12-27 07:16:24,140 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - Applying watermark on cdc_timestamp with threshold: 1 hour
2025-12-27 07:16:24,146 - cdc_pipelines.pipelines.silver.bronze_reader - INFO - Wat

2025-12-27 07:16:36,257 - py4j.clientserver - INFO - Python Server ready to receive messages
2025-12-27 07:16:36,260 - py4j.clientserver - INFO - Received command c on object id p2
2025-12-27 07:16:36,263 - cdc_pipelines.pipelines.silver.silver_writer - INFO - Processing batch 0 for table: orders
2025-12-27 07:16:36,401 - cdc_pipelines.pipelines.silver.silver_writer - INFO - Performing initial load for table: orders
2025-12-27 07:16:42,099 - cdc_pipelines.pipelines.silver.silver_writer - INFO - Initial load completed for table: orders
2025-12-27 07:16:42,100 - cdc_pipelines.pipelines.silver.silver_writer - INFO - Batch 0 processed successfully for orders
25/12/27 07:16:42 WARN ProcessingTimeExecutor: Current batch is falling behind. The trigger interval is 10000} milliseconds, but spent 17081 milliseconds
