# Load Bronze Data to Silver Table - Payment (Fabric & ADB)

## Overview
Loads Payment sample data from the Bronze lakehouse files into the Silver lakehouse table, once for the Fabric channel and once for the ADB channel.

## Data Flow
- **Source (Fabric)**: MAAG_LH_Bronze/Files/samples_fabric/finance/Payment_Samples_Fabric.csv
- **Source (ADB)**: MAAG_LH_Bronze/Files/samples_databricks/finance/Payment_Samples_ADB.csv
- **Target**: MAAG_LH_Silver.finance.Payment table (or any attached default lakehouse)
- **Process**: Read the CSV, validate the schema (adding placeholder values for any missing required columns), check data quality, show value distributions, append to the Delta table, and verify the load

---

In [1]:
# --- Fabric Channel ---
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, sum as spark_sum
from pyspark.sql import functions as F

# Fabric channel: read the Payment sample CSV from the Bronze lakehouse,
# conform it to the Silver column contract, report basic data quality, and
# append the rows to the Silver Delta table.
WORKSPACE_NAME = "Fabric_MAAG"
SOURCE_LAKEHOUSE_NAME = "MAAG_LH_Bronze"
FABRIC_SOURCE_PATH = (
    f"abfss://{WORKSPACE_NAME}@onelake.dfs.fabric.microsoft.com/"
    f"{SOURCE_LAKEHOUSE_NAME}.Lakehouse/Files/samples_fabric/finance/Payment_Samples_Fabric.csv"
)

TARGET_SCHEMA = "finance"
TARGET_TABLE = "Payment"
TARGET_FULL_PATH = f"{TARGET_SCHEMA}.{TARGET_TABLE}"

# Silver column contract, in output order:
#   column name -> (Spark type, placeholder used when the source lacks the column)
# PaymentDate uses a typed NULL placeholder: an empty string only casts to a
# date when ANSI SQL mode is off, and the resulting value (NULL) is the same.
PAYMENT_COLUMN_SPECS = {
    "PaymentId": (StringType(), ""),
    "PaymentNumber": (StringType(), ""),
    "InvoiceId": (StringType(), ""),
    "OrderId": (StringType(), ""),
    "PaymentDate": (DateType(), None),
    "PaymentAmount": (DoubleType(), 0.0),
    "PaymentStatus": (StringType(), ""),
    "PaymentMethod": (StringType(), ""),
    "CreatedBy": (StringType(), ""),
}
required_columns = list(PAYMENT_COLUMN_SPECS)

print("🔄 Loading Fabric Payment data")
print(f"📂 Source: {FABRIC_SOURCE_PATH}")
print(f"🎯 Target: {TARGET_FULL_PATH}")

# Read CSV from Bronze lakehouse
payment_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(FABRIC_SOURCE_PATH)
)

print("✅ Data loaded successfully")
print(f"📊 Records: {payment_df.count()}")
print(f"📋 Columns: {payment_df.columns}")

# Display sample data
print("\n📖 Sample data:")
payment_df.show(10, truncate=False)

# Schema validation: report which required columns the source does not provide.
source_columns = set(payment_df.columns)
missing_columns = [c for c in required_columns if c not in source_columns]
if missing_columns:
    print(f"⚠️ Warning: Missing columns in source data: {missing_columns}")
else:
    print("✅ All required columns present in source data.")

# Conform to the Silver contract in a single projection: take each required
# column from the source (or its placeholder when missing), cast it to the
# target type, and drop every column that is not part of the contract.
payment_df = payment_df.select([
    (col(name) if name in source_columns else F.lit(placeholder))
    .cast(spark_type)
    .alias(name)
    for name, (spark_type, placeholder) in PAYMENT_COLUMN_SPECS.items()
])

# Data quality checks
# count(when(isNull, 1)) yields 0 on an empty DataFrame, whereas sum(...) would
# yield NULL and break the `> 0` comparison below.
# NOTE: columns filled with "" / 0.0 placeholders are not NULL, so they are
# reported as "No nulls" here; rely on the warning above for missing columns.
print("\n📊 Data Quality Check:")
null_counts = payment_df.select(
    [F.count(F.when(col(c).isNull(), 1)).alias(c) for c in required_columns]
).collect()[0]
for col_name in required_columns:
    null_count = null_counts[col_name]
    if null_count > 0:
        print(f"  {col_name}: {null_count} null values")
    else:
        print(f"  {col_name}: ✅ No nulls")

# Show value distributions for PaymentStatus
print("\n🎯 PaymentStatus Distribution:")
payment_df.groupBy("PaymentStatus").count().orderBy("PaymentStatus").show()

# Ensure the target schema exists
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {TARGET_SCHEMA}")

# Load data to Silver table
# NOTE: the write appends, so re-running this cell adds the same rows again.
# The former `overwriteSchema` option was dropped: Delta only honors it for
# overwrite-mode writes, so it had no effect on an append.
print(f"💾 Loading data to Silver table: {TARGET_FULL_PATH}")
try:
    (
        payment_df.write
        .format("delta")
        .mode("append")
        .saveAsTable(TARGET_FULL_PATH)
    )
    print(f"✅ Data loaded successfully to {TARGET_FULL_PATH}")
    # Verify the load (counts every row in the table, not only this batch)
    result_count = spark.sql(f"SELECT COUNT(*) as count FROM {TARGET_FULL_PATH}").collect()[0]["count"]
    print(f"📊 Records in target table: {result_count}")
    # Show sample of loaded data
    print("\n📖 Sample from Silver table:")
    spark.sql(f"SELECT * FROM {TARGET_FULL_PATH} ORDER BY PaymentId").show(10, truncate=False)
    print("🎉 Payment data load complete!")
except Exception as e:
    # Surface the failure in the cell output, then re-raise so the run fails.
    print(f"❌ Error loading data to table: {str(e)}")
    raise
# --- End Fabric Channel ---

StatementMeta(, 20ddaee1-ec73-4388-ac20-f47963819a56, 3, Finished, Available, Finished)

🔄 Loading Fabric Payment data
📂 Source: abfss://Fabric_MAAG@onelake.dfs.fabric.microsoft.com/MAAG_LH_Bronze.Lakehouse/Files/samples_fabric/finance/Payment_Samples_Fabric.csv
🎯 Target: finance.Payment
✅ Data loaded successfully
📊 Records: 1808
📋 Columns: ['PaymentId', 'PaymentNumber', 'InvoiceId', 'CustomerId', 'PaymentDate', 'PaymentAmount', 'PaymentMethod', 'PaymentStatus']

📖 Sample data:
+------------------------------------+-------------+------------------------------------+----------+-----------+-------------+-------------+-------------+
|PaymentId                           |PaymentNumber|InvoiceId                           |CustomerId|PaymentDate|PaymentAmount|PaymentMethod|PaymentStatus|
+------------------------------------+-------------+------------------------------------+----------+-----------+-------------+-------------+-------------+
|7c82dbb4-99bb-4f83-a7d6-f9aa62282c1a|PM-F100000   |547fb079-2c37-456d-b7fc-e0a081bbf04f|CID-001   |2024-03-05 |16696.36     |MC           |C

In [None]:
# --- ADB Channel ---
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, sum as spark_sum
from pyspark.sql import functions as F

# ADB channel: read the Payment sample CSV from the Bronze lakehouse,
# conform it to the Silver column contract, report basic data quality, and
# append the rows to the Silver Delta table.
WORKSPACE_NAME = "Fabric_MAAG"
SOURCE_LAKEHOUSE_NAME = "MAAG_LH_Bronze"
ADB_SOURCE_PATH = (
    f"abfss://{WORKSPACE_NAME}@onelake.dfs.fabric.microsoft.com/"
    f"{SOURCE_LAKEHOUSE_NAME}.Lakehouse/Files/samples_databricks/finance/Payment_Samples_ADB.csv"
)

TARGET_SCHEMA = "finance"
TARGET_TABLE = "Payment"
TARGET_FULL_PATH = f"{TARGET_SCHEMA}.{TARGET_TABLE}"

# Silver column contract, in output order:
#   column name -> (Spark type, placeholder used when the source lacks the column)
# PaymentDate uses a typed NULL placeholder: an empty string only casts to a
# date when ANSI SQL mode is off, and the resulting value (NULL) is the same.
PAYMENT_COLUMN_SPECS = {
    "PaymentId": (StringType(), ""),
    "PaymentNumber": (StringType(), ""),
    "InvoiceId": (StringType(), ""),
    "OrderId": (StringType(), ""),
    "PaymentDate": (DateType(), None),
    "PaymentAmount": (DoubleType(), 0.0),
    "PaymentStatus": (StringType(), ""),
    "PaymentMethod": (StringType(), ""),
    "CreatedBy": (StringType(), ""),
}
required_columns = list(PAYMENT_COLUMN_SPECS)

print("🔄 Loading ADB Payment data")
print(f"📂 Source: {ADB_SOURCE_PATH}")
print(f"🎯 Target: {TARGET_FULL_PATH}")

# Read CSV from Bronze lakehouse
payment_df = (
    spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(ADB_SOURCE_PATH)
)

print("✅ Data loaded successfully")
print(f"📊 Records: {payment_df.count()}")
print(f"📋 Columns: {payment_df.columns}")

# Display sample data
print("\n📖 Sample data:")
payment_df.show(10, truncate=False)

# Schema validation: report which required columns the source does not provide.
source_columns = set(payment_df.columns)
missing_columns = [c for c in required_columns if c not in source_columns]
if missing_columns:
    print(f"⚠️ Warning: Missing columns in source data: {missing_columns}")
else:
    print("✅ All required columns present in source data.")

# Conform to the Silver contract in a single projection: take each required
# column from the source (or its placeholder when missing), cast it to the
# target type, and drop every column that is not part of the contract.
payment_df = payment_df.select([
    (col(name) if name in source_columns else F.lit(placeholder))
    .cast(spark_type)
    .alias(name)
    for name, (spark_type, placeholder) in PAYMENT_COLUMN_SPECS.items()
])

# Data quality checks
# count(when(isNull, 1)) yields 0 on an empty DataFrame, whereas sum(...) would
# yield NULL and break the `> 0` comparison below.
# NOTE: columns filled with "" / 0.0 placeholders are not NULL, so they are
# reported as "No nulls" here; rely on the warning above for missing columns.
print("\n📊 Data Quality Check:")
null_counts = payment_df.select(
    [F.count(F.when(col(c).isNull(), 1)).alias(c) for c in required_columns]
).collect()[0]
for col_name in required_columns:
    null_count = null_counts[col_name]
    if null_count > 0:
        print(f"  {col_name}: {null_count} null values")
    else:
        print(f"  {col_name}: ✅ No nulls")

# Show value distributions for PaymentStatus
print("\n🎯 PaymentStatus Distribution:")
payment_df.groupBy("PaymentStatus").count().orderBy("PaymentStatus").show()

# Ensure the target schema exists
spark.sql(f"CREATE SCHEMA IF NOT EXISTS {TARGET_SCHEMA}")

# Load data to Silver table
# NOTE: the write appends, so re-running this cell adds the same rows again.
# The former `overwriteSchema` option was dropped: Delta only honors it for
# overwrite-mode writes, so it had no effect on an append.
print(f"💾 Loading data to Silver table: {TARGET_FULL_PATH}")
try:
    (
        payment_df.write
        .format("delta")
        .mode("append")
        .saveAsTable(TARGET_FULL_PATH)
    )
    print(f"✅ Data loaded successfully to {TARGET_FULL_PATH}")
    # Verify the load (counts every row in the table, not only this batch)
    result_count = spark.sql(f"SELECT COUNT(*) as count FROM {TARGET_FULL_PATH}").collect()[0]["count"]
    print(f"📊 Records in target table: {result_count}")
    # Show sample of loaded data
    print("\n📖 Sample from Silver table:")
    spark.sql(f"SELECT * FROM {TARGET_FULL_PATH} ORDER BY PaymentId").show(10, truncate=False)
    print("🎉 Payment data load complete!")
except Exception as e:
    # Surface the failure in the cell output, then re-raise so the run fails.
    print(f"❌ Error loading data to table: {str(e)}")
    raise
# --- End ADB Channel ---

StatementMeta(, , -1, SessionError, , SessionError)

InvalidHttpRequestToLivy: [TooManyRequestsForCapacity] This spark job can't be run because you have hit a spark compute or API rate limit. To run this spark job, cancel an active Spark job through the Monitoring hub, choose a larger capacity SKU, or try again later. HTTP status code: 430 {Learn more} HTTP status code: 430.