# Load Bronze Data to ADB Table - Orders (ADB Channel)

## Overview
Load Order sample data from a CSV file in the Bronze lakehouse into a Silver Delta table in the Databricks workspace catalog.

## Data Flow
- **Source (CSV)**: DBFS `<base_path>/Order_Samples_ADB.csv` (default `base_path`: `/FileStore/tables/sales`)
- **Target**: Workspace catalog table `<catalog_name>.<schema_name>.order` (default: `maag_adb2.sales.order`)
- **Process**: Read CSV, validate schema, check data quality, load to Delta table, verify load

---

In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, sum as spark_sum
import os

In [None]:
# Notebook parameters for catalog, schema and source folder.
# Each widget is registered with its default so the value can be overridden
# from the notebook UI or from a job run.
_widget_defaults = {
    "catalog_name": "maag_adb2",
    "schema_name": "sales",
    "base_path": "/FileStore/tables/sales",
}
for _widget_name, _widget_default in _widget_defaults.items():
    dbutils.widgets.text(_widget_name, _widget_default)

# Resolve the effective widget values (dict order matches the names below).
catalog_name, schema_name, base_path = (
    dbutils.widgets.get(_widget_name) for _widget_name in _widget_defaults
)

# Derived names: target table, source CSV path, fully qualified table name.
table_order = 'order'
order_csv = base_path + '/Order_Samples_ADB.csv'
target_full_path = '.'.join([catalog_name, schema_name, table_order])

In [None]:
# Diagnostic: check that the source CSV is present under base_path in DBFS.
# The listing is captured and printed explicitly: a bare expression in the
# middle of a notebook cell is not displayed, so a discarded
# dbutils.fs.ls(...) call shows nothing.
print(f"Checking files in base_path: {base_path}")
listed_files = dbutils.fs.ls(base_path)  # NOTE: expected to raise if base_path is missing
for file_info in listed_files:
    print(f"  {file_info.path}")

print(f"Looking for file: {order_csv}")
# Compare by file name: listed paths may carry a scheme prefix (e.g. dbfs:)
# that the configured path does not.
expected_name = order_csv.rsplit('/', 1)[-1]
if any(file_info.name == expected_name for file_info in listed_files):
    print(f"✅ Found: {order_csv}")
else:
    # Diagnostic only: warn here and leave the hard failure to the read step.
    print(f"⚠️ Not found: {order_csv}")

In [None]:
# Read CSV into DataFrame
print(f'🔄 Loading Order data')
print(f'📂 Source: {base_path}')
print(f'📂 Source: {order_csv}')
print(f'🎯 Target: {target_full_path}')

# Read every column as a string instead of inferring types. The validation
# step casts each required column to its target type explicitly, so inference
# is redundant, and it can corrupt identifier columns before they are cast
# back to strings (e.g. "00123" inferred as the integer 123). Skipping
# inference also avoids an extra pass over the file.
order_df = (
    spark.read
    .option('header', True)
    .option('inferSchema', False)
    .csv(order_csv)
)

print(f'✅ Data loaded successfully')
print(f'📊 Records: {order_df.count()}')
print(f'📋 Columns: {order_df.columns}')

# Display sample data
print(f"\n📖 Sample data:")
order_df.show(10, truncate=False)

In [None]:
# Validate and conform to target schema
print(f'🔍 Validating data quality...')

# Columns the target table expects, in target order.
required_columns = [
    'OrderId', 'SalesChannelId', 'OrderNumber', 'CustomerId', 'CustomerAccountId',
    'OrderDate', 'OrderStatus', 'SubTotal', 'TaxAmount', 'OrderTotal',
    'PaymentMethod', 'IsoCurrencyCode', 'CreatedBy'
]

# Compare names the way Spark resolves them: case-insensitively unless
# spark.sql.caseSensitive is enabled. A case-sensitive check would report a
# source column such as `orderid` as missing, and adding the "missing" column
# with withColumn would then replace the real data with a blank default.
_case_sensitive = spark.conf.get('spark.sql.caseSensitive', 'false').lower() == 'true'
_normalize = (lambda name: name) if _case_sensitive else str.lower
_source_names = {_normalize(c) for c in order_df.columns}

missing_columns = [c for c in required_columns if _normalize(c) not in _source_names]
if missing_columns:
    print(f'⚠️ Warning: Missing columns in source data: {missing_columns}')
else:
    print(f'✅ All required columns present in source data.')

from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DoubleType, DateType

# Add every column absent from the source with a placeholder value:
# 0.0 for the monetary amounts, 'Script' for CreatedBy, '' for anything else.
_placeholder_values = {
    'SubTotal': 0.0,
    'TaxAmount': 0.0,
    'OrderTotal': 0.0,
    'CreatedBy': 'Script',
}
for absent_column in missing_columns:
    order_df = order_df.withColumn(
        absent_column, F.lit(_placeholder_values.get(absent_column, ''))
    )

# Target type for each plainly cast column; CreatedBy is handled separately.
_target_types = [
    ('OrderId', StringType()),
    ('SalesChannelId', StringType()),
    ('OrderNumber', StringType()),
    ('CustomerId', StringType()),
    ('CustomerAccountId', StringType()),
    ('OrderDate', DateType()),
    ('OrderStatus', StringType()),
    ('SubTotal', DoubleType()),
    ('TaxAmount', DoubleType()),
    ('OrderTotal', DoubleType()),
    ('PaymentMethod', StringType()),
    ('IsoCurrencyCode', StringType()),
]
for column_name, target_type in _target_types:
    order_df = order_df.withColumn(column_name, col(column_name).cast(target_type))

# CreatedBy: null or empty values become 'Script', then cast to string.
_created_by = col('CreatedBy')
_created_by_blank = _created_by.isNull() | (_created_by == '')
order_df = order_df.withColumn(
    'CreatedBy',
    F.when(_created_by_blank, 'Script').otherwise(_created_by).cast(StringType()),
)

# Keep only the required columns, in target order.
order_df = order_df.select(required_columns)

print(f'\n📊 Data Quality Check:')
# One aggregation pass: per required column, sum a 1/0 "is null" flag.
null_counts = order_df.select([F.sum(col(c).isNull().cast('int')).alias(c) for c in required_columns]).collect()[0]
for col_name in required_columns:
    # SUM over zero rows is NULL, so an empty DataFrame yields None here.
    # Treat that as zero rather than failing on `None > 0`.
    null_count = null_counts[col_name] or 0
    if null_count > 0:
        print(f'  {col_name}: {null_count} null values')
    else:
        print(f'  {col_name}: ✅ No nulls')

# Row count per order status, sorted by status.
print(f'\n🎯 OrderStatus Distribution:')
order_df.groupBy('OrderStatus').count().orderBy('OrderStatus').show()

print(f'💾 Loading data to databricks table: {target_full_path}')
try:
    # Replace the table contents and schema with the conformed DataFrame.
    order_df.write \
        .format('delta') \
        .mode('overwrite') \
        .option('overwriteSchema', 'true') \
        .saveAsTable(target_full_path)
    print(f'✅ Data loaded successfully to {target_full_path}')

    # Verify through the DataFrame API rather than interpolating the
    # widget-supplied table name into SQL text.
    silver_df = spark.table(target_full_path)
    result_count = silver_df.count()
    print(f'📊 Records in target table: {result_count}')
    print(f'\n📖 Sample from Silver table:')

    silver_df.orderBy('OrderId').show(10, truncate=False)
    print(f'🎉 Order data load complete!')
except Exception as e:
    # Report the failure in the notebook output, then re-raise so the
    # notebook or job run is marked as failed.
    print(f'❌ Error loading data to table: {e}')
    raise