# CSV to Delta Lake Converter

This notebook converts CSV files from the landing zone to **Delta Lake format** in the bronze layer.

**Important**: The conversion writes each table as a Delta Lake table (overwriting any existing data at the target path) so that it is compatible with your streaming CDC pipeline.

In [None]:
# pip install -r requirements.txt

import os
from pyspark.sql import SparkSession

# Directory that holds the Hadoop/Azure/Jetty connector jars.
# Override it with the SPARK_JAR_DIR environment variable so the notebook is not
# tied to one developer's machine; the default keeps the original location.
jar_dir = os.environ.get("SPARK_JAR_DIR", "/home/bnguyen/Desktop/DE_project/scripts/jars")

# Connector jars handed to spark-submit through --jars (order preserved).
jar_names = [
    "hadoop-azure-3.3.6.jar",
    "hadoop-common-3.3.6.jar",
    "azure-storage-8.6.6.jar",
    "jetty-client-9.4.43.v20210629.jar",
    "jetty-http-9.4.43.v20210629.jar",
    "jetty-io-9.4.43.v20210629.jar",
    "jetty-util-9.4.43.v20210629.jar",
    "jetty-util-ajax-9.4.43.v20210629.jar",
]

# PySpark reads PYSPARK_SUBMIT_ARGS when it launches the JVM, so this must be
# set before the Spark session is created. The value has the form
# "--jars <comma-separated jar paths> pyspark-shell".
os.environ["PYSPARK_SUBMIT_ARGS"] = (
    "--jars "
    + ",".join(os.path.join(jar_dir, jar_name) for jar_name in jar_names)
    + " pyspark-shell"
)

In [None]:
# Build (or reuse) a Spark session configured for Delta Lake.
session_builder = (
    SparkSession.builder
    .appName("CSV to Delta Lake Converter")
    # Adaptive query execution, including coalescing of shuffle partitions.
    .config("spark.sql.adaptive.enabled", "true")
    .config("spark.sql.adaptive.coalescePartitions.enabled", "true")
    # Delta Lake package plus the SQL extension and catalog it registers.
    .config("spark.jars.packages", "io.delta:delta-spark_2.12:3.0.0")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog", "org.apache.spark.sql.delta.catalog.DeltaCatalog")
)
spark = session_builder.getOrCreate()

print("✅ Spark session with Delta Lake support created!")

In [None]:
# Storage account keys are secrets, so they are read from environment variables
# instead of being embedded in the notebook.
# NOTE(review): the keys that were previously hardcoded in this cell should be
# treated as exposed and rotated in Azure.
def _require_env(name):
    """Return the value of environment variable `name`.

    Raises:
        RuntimeError: if the variable is unset or empty.
    """
    value = os.environ.get(name)
    if not value:
        raise RuntimeError(
            f"Environment variable {name} is not set; export it before running this notebook."
        )
    return value


# Landing zone access key (Blob endpoint of the "kho" account).
spark.conf.set(
    "fs.azure.account.key.kho.blob.core.windows.net",
    _require_env("AZURE_LANDING_ACCOUNT_KEY"),
)

# Bronze access key (dfs endpoint of the "mybronze" account).
spark.conf.set(
    "fs.azure.account.key.mybronze.dfs.core.windows.net",
    _require_env("AZURE_BRONZE_ACCOUNT_KEY"),
)

In [None]:
# Storage accounts: CSVs are read from the landing account, Delta tables are
# written to the bronze account.
storage_account_land = "kho"
storage_account_bronze = "mybronze"

# Containers inside those accounts.
landing_container = "online-store"
bronze_container = "bronze-delta"

# Tables to convert, in processing order.
tables = [
    "Sellers",
    "Customers",
    "ProductCategories",
    "Products",
    "OrderStatus",
    "Orders",
    "Reasons",
    "OrderItems",
    "ShoppingCarts",
    "CartItems",
    "PaymentMethods",
    "Payments",
    "Reviews",
    "Addresses",
    "Inventory",
]

print(f"📋 Ready to convert {len(tables)} tables from CSV to Delta Lake format")

In [None]:
# Convert each landing-zone CSV into a Delta Lake table in the bronze container.
# A failure on one table is reported and counted, then the loop moves on to the
# next table (best-effort conversion).
success_count = 0
error_count = 0
failed_tables = []  # names of the tables whose conversion raised

for index, table in enumerate(tables):
    csv_path = f"wasbs://{landing_container}@{storage_account_land}.blob.core.windows.net/{table}"
    delta_path = f"abfss://{bronze_container}@{storage_account_bronze}.dfs.core.windows.net/{table}"

    print(f"🔄 Processing {table}...")

    try:
        # Read the CSV; the first row supplies the column names. No schema is
        # given or inferred here, so Spark's CSV reader defaults apply.
        df = spark.read.option("header", "true").csv(csv_path)

        # Show schema for first table as example
        if index == 0:
            print(f"   📋 Sample schema for {table}:")
            df.printSchema()

        # Write as Delta Lake table, replacing whatever is already at delta_path.
        df.write.format("delta").mode("overwrite").save(delta_path)

        # Count the rows from the table that was just written instead of calling
        # df.count() before the write: the DataFrame is lazy and uncached, so a
        # pre-write count would parse the whole CSV a second time. Counting the
        # written table also reports what actually landed in bronze.
        record_count = spark.read.format("delta").load(delta_path).count()

        print(f"   ✅ Saved {table} as Delta Lake table ({record_count:,} records)")
        success_count += 1

    except Exception as e:
        print(f"   ❌ Error processing {table}: {e}")
        error_count += 1
        failed_tables.append(table)

print("\n Conversion completed")
print(f"   Success: {success_count} tables")
print(f"    Errors: {error_count} tables")
if failed_tables:
    # Name the failures so they can be retried without scrolling through the log.
    print(f"    Failed: {', '.join(failed_tables)}")
print("\n All successful tables are now in Delta Lake format and ready for CDC streaming")