# Load Bronze Data to Table - ProductCategory

## Overview
Load ProductCategory data from the Bronze CSV file to the Delta table in the lakehouse.

## Data Flow
- **Source**: MAAG_LH_Bronze/Files/samples_fabric/shared/ProductCategory_Samples.csv
- **Target**: shared.ProductCategory (Delta table in lakehouse)
- **Process**: Read CSV, validate schema, load to Delta table

In [None]:
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, sum as spark_sum
import os

# Configuration: OneLake (abfss) location of the Bronze CSV and the name of the
# Delta table this notebook loads into.
WORKSPACE_NAME = "Fabric_MAAG"
SOURCE_LAKEHOUSE_NAME = "MAAG_LH_Bronze"
SOURCE_PATH = (
    f"abfss://{WORKSPACE_NAME}@onelake.dfs.fabric.microsoft.com/"
    f"{SOURCE_LAKEHOUSE_NAME}.Lakehouse/"
    "Files/samples_fabric/shared/ProductCategory_Samples.csv"
)

TARGET_SCHEMA = "shared"
TARGET_TABLE = "ProductCategory"
TARGET_FULL_PATH = ".".join([TARGET_SCHEMA, TARGET_TABLE])

print("🔄 Loading ProductCategory data")
print(f"📂 Source: {SOURCE_PATH}")
print(f"🎯 Target: {TARGET_FULL_PATH}")

# Read the CSV from the Bronze lakehouse: the first row is the header and
# column types are inferred from the data.
csv_reader = spark.read.options(header="true", inferSchema="true")
df = csv_reader.csv(SOURCE_PATH)

print("✅ Data loaded successfully")
print(f"📊 Records: {df.count()}")
print(f"📋 Columns: {df.columns}")

# Preview the first 10 rows without truncating long values
print("\n📖 Sample data:")
df.show(10, truncate=False)

In [None]:
# Data quality check: count NULLs in the key columns with one aggregation pass.
# Assumes CategoryID, CategoryName and IsActive are present in `df`.
nulls = df.select(
    spark_sum(col("CategoryID").isNull().cast("int")).alias("null_CategoryID"),
    spark_sum(col("CategoryName").isNull().cast("int")).alias("null_CategoryName"),
    spark_sum(col("IsActive").isNull().cast("int")).alias("null_IsActive")
).collect()[0]

# SUM over zero rows is NULL, which arrives in Python as None. Normalise to 0
# so an empty source file reports "0 nulls" instead of raising TypeError on
# the `> 0` comparison below.
key_null_counts = {
    key: (nulls[f"null_{key}"] or 0)
    for key in ("CategoryID", "CategoryName", "IsActive")
}

print("🔍 Null check results:")
print(f"  CategoryID nulls:   {key_null_counts['CategoryID']}")
print(f"  CategoryName nulls: {key_null_counts['CategoryName']}")
print(f"  IsActive nulls:     {key_null_counts['IsActive']}")

if any(count > 0 for count in key_null_counts.values()):
    print("⚠️ Warning: Nulls found in key columns!")
else:
    print("✅ No nulls in key columns.")

In [None]:
# Required columns from Model_Shared_Data.ipynb ProductCategory table (BrandLogoUrl is NOT required)
from pyspark.sql import functions as F
from pyspark.sql.types import StringType, BooleanType

required_columns = [
    "CategoryID", "ParentCategoryId", "CategoryName", "CategoryDescription",
    "BrandName", "IsActive"
]

# Normalise header names BEFORE validating them. A header such as
# " CategoryName" would otherwise be reported as missing, replaced with a
# default value below, and its real data dropped by the final select.
print("\n🕵️ Checking DataFrame columns for whitespace:")
print([f"'{c}'" for c in df.columns])
for c in df.columns:
    new_c = c.strip()
    # Skip the rename when the stripped name already exists, to avoid
    # producing two columns with the same name.
    if new_c != c and new_c not in df.columns:
        df = df.withColumnRenamed(c, new_c)

# Validate required columns
# NOTE(review): this membership test is case-sensitive, while Spark resolves
# column names case-insensitively by default - confirm the source header
# casing matches `required_columns`.
missing_columns = [c for c in required_columns if c not in df.columns]
if missing_columns:
    print(f"⚠️ Warning: Missing columns in source data: {missing_columns}")
else:
    print("✅ All required columns present in source data.")

# Add missing columns with default values: True for IsActive, empty string
# for every other (string-typed) column.
for col_name in missing_columns:
    default_value = True if col_name == "IsActive" else ""
    df = df.withColumn(col_name, F.lit(default_value))

# Explicitly cast all columns to match the target Delta table schema:
# every required column is a string except the boolean IsActive flag.
for col_name in required_columns:
    target_type = BooleanType() if col_name == "IsActive" else StringType()
    df = df.withColumn(col_name, col(col_name).cast(target_type))

# Keep only the required columns, in the exact order listed above
df = df.select(required_columns)

# Print DataFrame schema and Delta table schema for debugging
print("\n📋 DataFrame schema:")
df.printSchema()
print("\n📋 Delta table schema:")
spark.table(TARGET_FULL_PATH).printSchema()

# Data quality checks: report the NULL count of every required column, using
# a single aggregation pass over the aligned DataFrame.
print(f"\n📊 Data Quality Check:")
null_counts = df.select([F.sum(col(c).isNull().cast("int")).alias(c) for c in required_columns]).collect()[0]
for col_name in required_columns:
    # SUM over zero rows is NULL (None in Python); treat it as 0 so an empty
    # DataFrame does not raise TypeError on the comparison below.
    null_count = null_counts[col_name] or 0
    if null_count > 0:
        print(f"  {col_name}: {null_count} null values")
    else:
        print(f"  {col_name}: ✅ No nulls")

# Show value distributions for CategoryID (row count per CategoryID value)
print(f"\n🎯 CategoryID Distribution:")
df.groupBy("CategoryID").count().orderBy("CategoryID").show()

# Append the aligned DataFrame to the target Delta table, then read the table
# back to confirm the load. Schema overwrite and schema merge are both
# switched off on the writer.
print(f"💾 Writing data to Delta table: {TARGET_FULL_PATH}")

try:
    delta_writer = (
        df.write
        .format("delta")
        .mode("append")
        .option("overwriteSchema", "false")
        .option("mergeSchema", "false")
    )
    delta_writer.saveAsTable(TARGET_FULL_PATH)

    print("✅ Data written to Delta table")

    # Verify the load: total row count of the target table after the append
    count_rows = spark.sql(f"SELECT COUNT(*) as count FROM {TARGET_FULL_PATH}").collect()
    result_count = count_rows[0]["count"]
    print(f"📊 Records in Delta table: {result_count}")

    print("\n📖 Sample from Delta table:")
    sample_df = spark.sql(f"SELECT * FROM {TARGET_FULL_PATH} ORDER BY CategoryID")
    sample_df.show(10, truncate=False)

    print("🎉 Bronze to Delta table load complete!")

except Exception as exc:
    # Report the failure in the notebook output, then re-raise so the run
    # still fails.
    print(f"❌ Error loading data to table: {exc}")
    raise