# CSV to Delta Lake Converter

This notebook converts CSV files from the landing zone to **Delta Lake format** in the bronze layer.

**Important**: This creates Delta Lake tables that are compatible with your streaming CDC pipeline.

In [None]:
import os
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Directory that holds the connector jars listed below
jar_dir = "/path/to/your/jars"

# Jar files passed to PySpark via --jars for the Azure Blob Storage
# integration (hadoop-azure, azure-storage and the jetty jars shipped with them)
azure_jar_files = [
    "hadoop-azure-3.3.6.jar",
    "azure-storage-8.6.6.jar",
    "jetty-client-9.4.43.v20210629.jar",
    "jetty-http-9.4.43.v20210629.jar",
    "jetty-io-9.4.43.v20210629.jar",
    "jetty-util-9.4.43.v20210629.jar",
    "jetty-util-ajax-9.4.43.v20210629.jar",
]

# PYSPARK_SUBMIT_ARGS format: "--jars <comma-separated paths> pyspark-shell"
os.environ['PYSPARK_SUBMIT_ARGS'] = (
    "--jars "
    + ",".join(f"{jar_dir}/{jar_file}" for jar_file in azure_jar_files)
    + " pyspark-shell"
)

In [None]:
# Build the Spark session with Delta Lake support.
# Settings are applied in the order listed: adaptive query execution flags,
# the Delta package coordinates, then the Delta SQL extension and catalog.
delta_session_settings = {
    "spark.sql.adaptive.enabled": "true",
    "spark.sql.adaptive.coalescePartitions.enabled": "true",
    "spark.jars.packages": "io.delta:delta-spark_2.12:3.0.0",
    "spark.sql.extensions": "io.delta.sql.DeltaSparkSessionExtension",
    "spark.sql.catalog.spark_catalog": "org.apache.spark.sql.delta.catalog.DeltaCatalog",
}

session_builder = SparkSession.builder.appName("CSV to Delta Lake Converter")
for setting_name, setting_value in delta_session_settings.items():
    session_builder = session_builder.config(setting_name, setting_value)

# Reuses an already-running session if one exists, otherwise starts a new one
spark = session_builder.getOrCreate()

print("Spark session with Delta Lake support created!")

In [None]:
# Register the storage account access keys on the Spark session.
#
# The account names embedded in the setting names must be the same accounts
# used when building the read/write paths (storage_account_land and
# storage_account_bronze), so the placeholders below use the same spelling.
# NOTE: avoid committing real access keys with the notebook.

# Landing zone access key.
# The CSV files are read through wasbs:// on the Blob endpoint
# (<account>.blob.core.windows.net), so the key is registered for that host.
spark.conf.set(
    "fs.azure.account.key.YOUR_LANDING_STORAGE_ACCOUNT.blob.core.windows.net",
    "access_key_for_landing_zone"  # Replace with your actual access key
)

# Bronze access key.
# The Delta tables are written through abfss:// on the DFS endpoint
# (<account>.dfs.core.windows.net). The key is looked up by host name, so a
# key registered only for the Blob endpoint is not found for abfss:// paths.
# Register it for both endpoints so wasbs:// and abfss:// access both work.
bronze_access_key = "access_key_for_bronze"  # Replace with your actual access key
for endpoint in ("blob", "dfs"):
    spark.conf.set(
        f"fs.azure.account.key.YOUR_BRONZE_STORAGE_ACCOUNT.{endpoint}.core.windows.net",
        bronze_access_key
    )

In [None]:
# Storage account names (placeholders to be replaced with real accounts)
storage_account_land = "YOUR_LANDING_STORAGE_ACCOUNT"
storage_account_bronze = "YOUR_BRONZE_STORAGE_ACCOUNT"

# Container names inside the landing and bronze accounts
landing_container = "online-store"
bronze_container = "bronze-delta"

# Names of the tables to convert, processed in this order
tables = [
    "Sellers",
    "Customers",
    "ProductCategories",
    "Products",
    "OrderStatus",
    "Orders",
    "Reasons",
    "OrderItems",
    "ShoppingCarts",
    "CartItems",
    "Addresses",
    "Inventory",
    "Payments",
    "PaymentMethods",
    "Reviews",
]

# Announce what is about to be converted
print("Ready to convert {} tables from CSV to Delta Lake format".format(len(tables)))
print("Source: Landing zone (CSV files)")
print("Target: Bronze layer (Delta Lake format)")

In [None]:
# Convert CSV files to Delta Lake format.
#
# For every name in `tables`, the CSV data under the landing container is read
# with a header row and an inferred schema, then written (overwriting any
# existing data) as a Delta table under the bronze container.
#
# Outcome counters:
#   success_count - tables written to Delta
#   skipped_count - tables whose CSV source had no rows (nothing is written)
#   error_count   - tables whose read or write raised an exception
success_count = 0
error_count = 0
skipped_count = 0

for table in tables:
    csv_path = f"wasbs://{landing_container}@{storage_account_land}.blob.core.windows.net/{table}"
    delta_path = f"abfss://{bronze_container}@{storage_account_bronze}.dfs.core.windows.net/{table}"

    print(f"Processing {table}...")

    try:
        # Read CSV file
        df = (
            spark.read
            .option("header", "true")
            .option("inferSchema", "true")
            .csv(csv_path)
        )

        # count() and the Delta write are two separate Spark actions; caching
        # lets the write reuse the rows loaded by count() instead of reading
        # the CSV from remote storage a second time.
        df.cache()
        try:
            record_count = df.count()

            if record_count == 0:
                # An empty source is not a failure: report it and move on
                # without creating a Delta table for it.
                print(f"   Warning: {table} is empty")
                skipped_count += 1
                continue

            print(f"   Sample schema for {table}:")
            df.printSchema()

            # Write to Delta Lake
            df.write.format("delta").mode("overwrite").save(delta_path)
            print(f"   Successfully converted {record_count} records to Delta format")
            success_count += 1
        finally:
            # Release the cached data before the next table is processed
            df.unpersist()

    except Exception as e:
        # Per-table boundary: report the failure and continue with the
        # remaining tables rather than aborting the whole run.
        print(f"   Error processing {table}: {e}")
        error_count += 1

print("\nConversion Summary:")
print(f"Successfully converted: {success_count} tables")
print(f"Skipped (empty): {skipped_count} tables")
print(f"Errors encountered: {error_count} tables")