In [1]:
# pip install -r requirements.txt
# python -m ipykernel install --user --name=python3

import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, dayofmonth, month, quarter, year, dayofweek, date_format, sum as spark_sum

# Directory holding the extra JARs passed to Spark (Azure storage connectors,
# their Jetty dependencies and the MySQL JDBC driver). Set SPARK_JAR_DIR to
# point at another location; the default keeps the original local path.
jar_dir = os.environ.get("SPARK_JAR_DIR", "/home/bnguyen/Desktop/DE_project/scripts/jars")

# JAR file names, in the order they are handed to --jars.
jar_names = [
    "hadoop-azure-3.3.6.jar",
    "azure-storage-8.6.6.jar",
    "hadoop-common-3.3.6.jar",
    "jetty-client-9.4.43.v20210629.jar",
    "jetty-http-9.4.43.v20210629.jar",
    "jetty-io-9.4.43.v20210629.jar",
    "mysql-connector-j-9.3.0.jar",
    "jetty-util-9.4.43.v20210629.jar",
    "jetty-util-ajax-9.4.43.v20210629.jar",
]

# Must be set before the SparkSession is created, because PySpark reads
# PYSPARK_SUBMIT_ARGS when it launches the JVM.
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars "
    + ",".join(f"{jar_dir}/{jar_name}" for jar_name in jar_names)
    + " pyspark-shell"
)


In [2]:
# Create the Spark session for this notebook, or reuse one that already exists.
spark = (
    SparkSession.builder
    .appName("Test CDC")
    .getOrCreate()
)

25/07/21 09:08:02 WARN Utils: Your hostname, lenovo-slim resolves to a loopback address: 127.0.1.1; using 192.168.199.13 instead (on interface wlp2s0)
25/07/21 09:08:02 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
25/07/21 09:08:12 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
25/07/21 09:08:13 WARN Utils: Service 'SparkUI' could not bind on port 4040. Attempting port 4041.


In [3]:
# Bronze access key
# The account key is read from the AZURE_BRONZE_ACCOUNT_KEY environment
# variable so that the secret is not stored in the notebook itself.
# NOTE(review): any key that was ever saved in this notebook should be
# treated as exposed and rotated in the storage account.
bronze_account_key = os.environ.get("AZURE_BRONZE_ACCOUNT_KEY")
if not bronze_account_key:
    # Fail here with a clear message instead of a late authentication error
    # on the first read from storage.
    raise RuntimeError(
        "Set the AZURE_BRONZE_ACCOUNT_KEY environment variable to the access "
        "key of the 'mybronze' storage account before running this cell."
    )

spark.conf.set(
    "fs.azure.account.key.mybronze.dfs.core.windows.net",
    bronze_account_key,
)

In [4]:
# Storage account and container names used to build the abfss:// paths below
storage_account_bronze = "mybronze"
bronze_container = "bronze"
bronze_stream = "bronze-stream"
bronze_final = "bronze-final"

# Tables handled by this notebook; each name is also the folder name
# appended to the container path.
tables = [
    "Customers",
    "Products",
    "Sellers",
    "Orders",
    "OrderItems",
    "ProductCategories",
    "OrderStatus",
    "Reasons",
    "Payments",
]


25/07/21 09:08:29 WARN GarbageCollectionMetrics: To enable non-built-in garbage collector(s) List(G1 Concurrent GC), users should configure it(them) to spark.eventLog.gcMetrics.youngGenerationGarbageCollectors or spark.eventLog.gcMetrics.oldGenerationGarbageCollectors


In [20]:
# Read the delete marker written for CustomerID 10000004 in bronze-final
# and display its contents.
bronze_path = (
    "abfss://{container}@{account}.dfs.core.windows.net"
    "/Customers/deleted_10000004.marker"
).format(container=bronze_final, account=storage_account_bronze)

df = spark.read.parquet(bronze_path)
df.show()

+----------+----------+-------+
|deleted_id|table_name| status|
+----------+----------+-------+
|  10000004| Customers|deleted|
+----------+----------+-------+



                                                                                

In [9]:
# SQL statements used to produce the test record (presumably run against the
# source database; confirm where before re-running):
# INSERT INTO Customers (CustomerID, Name, Email, PhoneNumber) VALUES (10000004, 'Alice Ng', 'alice@example.com', '0123456789');
# DELETE FROM Customers WHERE CustomerID = 10000004;

# Look up CustomerID 10000004 in the bronze-final copy of Customers
bronze_path = "abfss://{container}@{account}.dfs.core.windows.net/Customers/".format(
    container=bronze_final,
    account=storage_account_bronze,
)
df = spark.read.parquet(bronze_path)
df.where(col("CustomerID") == 10000004).show()

[Stage 61:>                                                         (0 + 1) / 1]

+----------+----+-----------------+-----------+--------------------+--------------------+
|CustomerID|Name|            Email|PhoneNumber|           CreatedAt|           UpdatedAt|
+----------+----+-----------------+-----------+--------------------+--------------------+
|  10000004| ABC|alice@example.com| 0123456789|2025-07-21T02:22:32Z|2025-07-21T02:25:03Z|
+----------+----+-----------------+-----------+--------------------+--------------------+



                                                                                

In [5]:
# Initial merge: copy every bronze table to bronze-final and silver-final.
# This creates the baseline for the continuous merge process.

# Destination storage accounts and containers
storage_account_bronze_final = "mybronze"
storage_account_silver_final = "mysilver"
bronze_final_container = "bronze-final"
silver_final_container = "silver-final"

# Account keys are read from environment variables so that no secret is
# stored in the notebook. Maps storage account name -> variable holding its key.
# NOTE(review): any key that was ever saved in this notebook should be
# treated as exposed and rotated in the storage account.
account_key_env_vars = {
    storage_account_bronze_final: "AZURE_BRONZE_ACCOUNT_KEY",
    storage_account_silver_final: "AZURE_SILVER_ACCOUNT_KEY",
}
for account_name, env_var in account_key_env_vars.items():
    account_key = os.environ.get(env_var)
    if not account_key:
        # Fail before any data is touched rather than midway through the copy.
        raise RuntimeError(
            f"Set the {env_var} environment variable to the access key of the "
            f"'{account_name}' storage account before running this cell."
        )
    spark.conf.set(
        f"fs.azure.account.key.{account_name}.dfs.core.windows.net",
        account_key,
    )

# Tables whose copy raised, so the final message reflects what really happened.
failed_tables = []

for table in tables:
    bronze_path = f"abfss://{bronze_container}@{storage_account_bronze}.dfs.core.windows.net/{table}"
    bronze_final_path = f"abfss://{bronze_final_container}@{storage_account_bronze_final}.dfs.core.windows.net/{table}"
    silver_final_path = f"abfss://{silver_final_container}@{storage_account_silver_final}.dfs.core.windows.net/{table}"

    print(f"Processing {table}...")

    try:
        # Read from bronze
        df = spark.read.parquet(bronze_path)
        record_count = df.count()

        # Overwrite both destinations with the bronze contents
        df.write.mode("overwrite").parquet(bronze_final_path)
        df.write.mode("overwrite").parquet(silver_final_path)

        print(f"  ✓ Copied {record_count} records")

    except Exception as e:
        # Best effort: record the failure and carry on with the next table.
        failed_tables.append(table)
        print(f"  ✗ Error processing {table}: {e}")

if failed_tables:
    print(f"\nInitial merge finished with errors in: {', '.join(failed_tables)}")
else:
    print("\nInitial merge completed!")

Processing Customers...


                                                                                

  ✓ Copied 7000 records
Processing Products...


                                                                                

  ✓ Copied 6000 records
Processing Sellers...


                                                                                

  ✓ Copied 4000 records
Processing Orders...


                                                                                

  ✓ Copied 100000 records
Processing OrderItems...


                                                                                

  ✓ Copied 300756 records
Processing ProductCategories...


                                                                                

  ✓ Copied 10 records
Processing OrderStatus...


                                                                                

  ✓ Copied 5 records
Processing Reasons...


                                                                                

  ✓ Copied 4931 records
Processing Payments...


                                                                                

  ✓ Copied 80200 records

Initial merge completed!


In [None]:
# Copy only the Customers table from bronze to bronze-final and silver-final.
# This creates the initial baseline for the Customers table.
# Relies on names set by earlier cells: bronze_container, bronze_final,
# storage_account_bronze, silver_final_container, storage_account_silver_final
# and the storage account keys registered on the Spark session.

# Configure paths
table = "Customers"
bronze_path = f"abfss://{bronze_container}@{storage_account_bronze}.dfs.core.windows.net/{table}"
bronze_final_path = f"abfss://{bronze_final}@{storage_account_bronze}.dfs.core.windows.net/{table}"
silver_final_path = f"abfss://{silver_final_container}@{storage_account_silver_final}.dfs.core.windows.net/{table}"

print(f"Processing {table}...")

try:
    # Read from bronze
    df = spark.read.parquet(bronze_path)
    record_count = df.count()

    print(f"  📊 Found {record_count} records in bronze")

    # Show sample data
    print("  📋 Sample data:")
    df.show(5)

    # Overwrite both destinations with the bronze contents
    df.write.mode("overwrite").parquet(bronze_final_path)
    df.write.mode("overwrite").parquet(silver_final_path)

    print(f"  ✓ Successfully copied {record_count} records")

except Exception as e:
    print(f"  ✗ Error processing {table}: {e}")

else:
    # Only announce readiness when the read and both writes succeeded.
    print(f"\n{table} table is ready for continuous CDC merging!")

Processing Customers...


                                                                                

  📊 Found 7000 records in bronze
  📋 Sample data:


                                                                                

+----------+---------------+--------------------+------------------+-------------------+-------------------+
|CustomerID|           Name|               Email|       PhoneNumber|          CreatedAt|          UpdatedAt|
+----------+---------------+--------------------+------------------+-------------------+-------------------+
|         1|   Charles Park| tmiller@example.com|793-701-5921x92794|2024-02-26 17:56:18|2024-11-20 20:17:14|
|         2|Michael Estrada|  rbrown@example.org|        6953312991|2023-12-09 16:51:08|2025-06-28 11:45:07|
|         3| Gail Wilkerson|graybrittany@exam...|      550.265.9882|2024-02-09 17:10:59|2025-05-26 07:26:49|
|         4|Alexandra Moyer|  mark96@example.org|   +1-978-296-4775|2024-11-26 14:08:20|2024-09-27 03:46:33|
|         5|  Natasha Perry| katie74@example.com|      457.834.5540|2024-11-17 05:40:49|2024-07-17 05:36:49|
+----------+---------------+--------------------+------------------+-------------------+-------------------+
only showing top 5 

                                                                                

  ✓ Successfully copied 7000 records to bronze-final
  📁 Destination: abfss://bronze-final@mybronze.dfs.core.windows.net/Customers

Customers table is ready for continuous CDC merging!
