In [6]:
# pip install -r requirements.txt

import os
from pyspark.sql import SparkSession

# Directory that holds the Hadoop/Azure connector jars. It can be overridden
# with the SPARK_JAR_DIR environment variable so the notebook is not tied to
# one developer's home directory; the default keeps the original location.
jar_dir = os.environ.get(
    "SPARK_JAR_DIR", "/home/bnguyen/Desktop/DE_project/scripts/jars"
)

# Jars handed to Spark through --jars, in the order they are listed here.
_jar_files = [
    "hadoop-common-3.3.6.jar",
    "hadoop-azure-3.3.6.jar",
    "azure-storage-8.6.6.jar",
    "jetty-client-9.4.43.v20210629.jar",
    "jetty-http-9.4.43.v20210629.jar",
    "jetty-io-9.4.43.v20210629.jar",
    "jetty-util-9.4.43.v20210629.jar",
    "jetty-util-ajax-9.4.43.v20210629.jar",
]

# Must be set before the SparkSession is created: the value is only read when
# the JVM is launched. The trailing "pyspark-shell" token is required.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars "
    + ",".join(f"{jar_dir}/{jar_name}" for jar_name in _jar_files)
    + " pyspark-shell"
)


In [7]:
# Create the Spark session used by every cell below (or reuse a running one).
spark = (
    SparkSession.builder
    .appName("Bronze to Silver: Table filter")
    .getOrCreate()
)

In [8]:
# Storage account access keys.
# The keys are read from the environment so that no secret is stored in the
# notebook. NOTE(review): the keys that were previously embedded in this cell
# should be treated as leaked and rotated in the Azure portal.
_account_key_env_vars = {
    "mybronze": "AZURE_BRONZE_STORAGE_KEY",  # Bronze access key
    "mysilver": "AZURE_SILVER_STORAGE_KEY",  # Silver access key
}

for _account_name, _env_var in _account_key_env_vars.items():
    _account_key = os.environ.get(_env_var)
    if not _account_key:
        # Fail here with a clear message instead of a late authentication
        # error on the first read/write.
        raise RuntimeError(
            f"Environment variable {_env_var} must hold the access key "
            f"for storage account '{_account_name}'"
        )
    spark.conf.set(
        f"fs.azure.account.key.{_account_name}.dfs.core.windows.net",
        _account_key,
    )

In [9]:
# Storage accounts and containers for the bronze and silver layers
storage_account_bronze, bronze_container = "mybronze", "bronze"
storage_account_silver, silver_container = "mysilver", "silver"

# Table folders handled by the copy loops in this notebook
tables = [
    "Customers",
    "Products",
    "Sellers",
    "Orders",
    "OrderItems",
    "ProductCategories",
    "OrderStatus",
    "Reasons",
    "Payments",
]


In [7]:
# Load the Customers table from the bronze container for a quick inspection.
bronze_path = (
    f"abfss://{bronze_container}@{storage_account_bronze}"
    ".dfs.core.windows.net/Customers"
)
df = spark.read.parquet(bronze_path)

                                                                                

25/07/17 12:05:16 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [8]:
# https://mybronze.blob.core.windows.net/bronze/Customers/ingestion_timestamp=2025-07-16%2009%253A35%253A10.005/
# Preview the first rows; n=20 and truncate=True are show()'s defaults, spelled out.
df.show(n=20, truncate=True)

                                                                                

+----------+----------------+--------------------+--------------------+-------------------+-------------------+
|CustomerID|            Name|               Email|         PhoneNumber|          CreatedAt|          UpdatedAt|
+----------+----------------+--------------------+--------------------+-------------------+-------------------+
|         1|    Charles Park| tmiller@example.com|  793-701-5921x92794|2024-02-26 17:56:18|2024-11-20 20:17:14|
|         2| Michael Estrada|  rbrown@example.org|          6953312991|2023-12-09 16:51:08|2025-06-28 11:45:07|
|         3|  Gail Wilkerson|graybrittany@exam...|        550.265.9882|2024-02-09 17:10:59|2025-05-26 07:26:49|
|         4| Alexandra Moyer|  mark96@example.org|     +1-978-296-4775|2024-11-26 14:08:20|2024-09-27 03:46:33|
|         5|   Natasha Perry| katie74@example.com|        457.834.5540|2024-11-17 05:40:49|2024-07-17 05:36:49|
|         6|Rachel Wilkerson|  rsmith@example.com|        846.591.5494|2025-05-08 17:12:32|2025-06-21 09

In [8]:
# Copy every bronze table, unchanged, into the silver container.
# Both sides use the abfss:// (ADLS Gen2, *.dfs.core.windows.net) endpoint:
# the account keys in this notebook are registered for that endpoint only, so
# a wasbs:// (*.blob.core.windows.net) read has no matching credential on a
# fresh kernel. This also matches the paths used by the other cells.
for table in tables:
    bronze_path = f"abfss://{bronze_container}@{storage_account_bronze}.dfs.core.windows.net/{table}"
    silver_path = f"abfss://{silver_container}@{storage_account_silver}.dfs.core.windows.net/{table}"

    print(f"Processing {table}...")

    df = spark.read.parquet(bronze_path)
    # "overwrite" replaces any existing silver data for this table.
    df.write.mode("overwrite").parquet(silver_path)

    print(f"Written to {silver_path}")

    
    

Processing Customers...


25/07/09 12:39:32 WARN MetricsConfig: Cannot locate configuration: tried hadoop-metrics2-azure-file-system.properties,hadoop-metrics2.properties
                                                                                

Written to abfss://silver@mysilver.dfs.core.windows.net/Customers
Processing Products...


                                                                                

Written to abfss://silver@mysilver.dfs.core.windows.net/Products
Processing Sellers...


                                                                                

Written to abfss://silver@mysilver.dfs.core.windows.net/Sellers
Processing Orders...


                                                                                

Written to abfss://silver@mysilver.dfs.core.windows.net/Orders
Processing OrderItems...


                                                                                

Written to abfss://silver@mysilver.dfs.core.windows.net/OrderItems
Processing ProductCategories...


                                                                                

Written to abfss://silver@mysilver.dfs.core.windows.net/ProductCategories
Processing OrderStatus...


                                                                                

Written to abfss://silver@mysilver.dfs.core.windows.net/OrderStatus
Processing Reasons...


                                                                                

Written to abfss://silver@mysilver.dfs.core.windows.net/Reasons
Processing Payments...


                                                                                

Written to abfss://silver@mysilver.dfs.core.windows.net/Payments


In [10]:
# Copy content for just the Customers table
# This will create the initial baseline for the Customers table in bronze-final

# Configure paths
table = "Customers"
bronze_path = f"abfss://{bronze_container}@{storage_account_bronze}.dfs.core.windows.net/{table}"
bronze_final_path = f"abfss://bronze-final@{storage_account_bronze}.dfs.core.windows.net/{table}"

print(f"Processing {table}...")

try:
    # Read from bronze
    df = spark.read.parquet(bronze_path)
    record_count = df.count()

    print(f"  📊 Found {record_count} records in bronze")

    # Show sample data
    print("  📋 Sample data:")
    df.show(5)

    # Write to bronze-final with overwrite mode (replaces any existing baseline)
    df.write.mode("overwrite").parquet(bronze_final_path)

    print(f"  ✓ Successfully copied {record_count} records to bronze-final")
    print(f"  📁 Destination: {bronze_final_path}")

except Exception as e:
    # Best-effort cell: report the failure instead of aborting the notebook.
    print(f"  ✗ Error processing {table}: {e}")

else:
    # Only announce readiness when the copy actually succeeded; previously
    # this line was printed even after an error had been reported above.
    print(f"\n{table} table is ready for continuous CDC merging!")

Processing Customers...
  📊 Found 7000 records in bronze
  📋 Sample data:


                                                                                

+----------+---------------+--------------------+------------------+-------------------+-------------------+
|CustomerID|           Name|               Email|       PhoneNumber|          CreatedAt|          UpdatedAt|
+----------+---------------+--------------------+------------------+-------------------+-------------------+
|         1|   Charles Park| tmiller@example.com|793-701-5921x92794|2024-02-26 17:56:18|2024-11-20 20:17:14|
|         2|Michael Estrada|  rbrown@example.org|        6953312991|2023-12-09 16:51:08|2025-06-28 11:45:07|
|         3| Gail Wilkerson|graybrittany@exam...|      550.265.9882|2024-02-09 17:10:59|2025-05-26 07:26:49|
|         4|Alexandra Moyer|  mark96@example.org|   +1-978-296-4775|2024-11-26 14:08:20|2024-09-27 03:46:33|
|         5|  Natasha Perry| katie74@example.com|      457.834.5540|2024-11-17 05:40:49|2024-07-17 05:36:49|
+----------+---------------+--------------------+------------------+-------------------+-------------------+
only showing top 5 

                                                                                

  ✓ Successfully copied 7000 records to bronze-final
  📁 Destination: abfss://bronze-final@mybronze.dfs.core.windows.net/Customers

Customers table is ready for continuous CDC merging!


In [None]:
# Initial merge: Copy bronze data to bronze-final and silver-final
# This creates the baseline for the continuous merge process

# Configure bronze-final / silver-final storage (same accounts as bronze/silver)
storage_account_bronze_final = "mybronze"
storage_account_silver_final = "mysilver"
bronze_final_container = "bronze-final"
silver_final_container = "silver-final"

# Register the access keys for both accounts. The keys are read from the
# environment so that no secret is stored in the notebook.
# NOTE(review): the keys previously embedded in this cell should be rotated.
for _account_name, _env_var in (
    (storage_account_bronze_final, "AZURE_BRONZE_STORAGE_KEY"),
    (storage_account_silver_final, "AZURE_SILVER_STORAGE_KEY"),
):
    _account_key = os.environ.get(_env_var)
    if not _account_key:
        raise RuntimeError(
            f"Environment variable {_env_var} must hold the access key "
            f"for storage account '{_account_name}'"
        )
    spark.conf.set(
        f"fs.azure.account.key.{_account_name}.dfs.core.windows.net",
        _account_key,
    )

# Tables whose copy raised, so the final message reflects what really happened.
failed_tables = []

for table in tables:
    bronze_path = f"abfss://{bronze_container}@{storage_account_bronze}.dfs.core.windows.net/{table}"
    bronze_final_path = f"abfss://{bronze_final_container}@{storage_account_bronze_final}.dfs.core.windows.net/{table}"
    silver_final_path = f"abfss://{silver_final_container}@{storage_account_silver_final}.dfs.core.windows.net/{table}"

    print(f"Processing {table}...")

    try:
        # Read from bronze
        df = spark.read.parquet(bronze_path)
        record_count = df.count()

        # Write with overwrite mode (replaces any existing baseline)
        df.write.mode("overwrite").parquet(bronze_final_path)
        df.write.mode("overwrite").parquet(silver_final_path)

        print(f"  ✓ Copied {record_count} records")

    except Exception as e:
        # Keep going with the remaining tables, but remember the failure.
        failed_tables.append(table)
        print(f"  ✗ Error processing {table}: {e}")

if failed_tables:
    # Do not claim success when one or more tables were not copied.
    print(f"\nInitial merge finished with errors in: {', '.join(failed_tables)}")
else:
    print("\nInitial merge completed!")

Processing Customers...


                                                                                

  ✓ Copied 7000 records to bronze-final
Processing Products...


                                                                                

  ✓ Copied 6000 records to bronze-final
Processing Sellers...


                                                                                

  ✓ Copied 4000 records to bronze-final
Processing Orders...


                                                                                

  ✓ Copied 100000 records to bronze-final
Processing OrderItems...


                                                                                

  ✓ Copied 300756 records to bronze-final
Processing ProductCategories...


                                                                                

  ✓ Copied 10 records to bronze-final
Processing OrderStatus...


                                                                                

  ✓ Copied 5 records to bronze-final
Processing Reasons...


                                                                                

  ✓ Copied 4931 records to bronze-final
Processing Payments...


                                                                                

  ✓ Copied 80200 records to bronze-final

Initial merge completed!
Bronze-final containers are ready for continuous CDC merging.
