# Load Bronze Data to Silver Table - OrderLine (ADB Channel)

## Overview
Load OrderLine sample data from a CSV file in the Bronze layer into the Silver `orderline` table in the Databricks workspace catalog.

## Data Flow
- **Source (CSV)**: DBFS `/FileStore/tables/sales/OrderLine_Samples_ADB.csv` (directory configurable via the `base_path` widget)
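- **Target**: Workspace catalog table: `<catalog_name>.sales.orderline` (catalog and schema configurable via the `catalog_name` and `schema_name` widgets; default catalog `maag_adb2`)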
- **Process**: Read CSV, validate schema, check data quality, load to Delta table, verify load

---

In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.types import *
from pyspark.sql.functions import col, sum as spark_sum
import os

In [None]:
# Notebook parameters, declared as widgets so a job or caller can override them.
# Maps widget name -> default value shown when no override is supplied.
widget_defaults = {
    "catalog_name": "maag_adb2",
    "schema_name": "sales",
    "base_path": "/FileStore/tables/sales",
}
for widget_name, default_value in widget_defaults.items():
    dbutils.widgets.text(widget_name, default_value)

# Read the effective values back, in the same order the widgets were declared.
catalog_name, schema_name, base_path = (
    dbutils.widgets.get(widget_name) for widget_name in widget_defaults
)

# Derived locations: source CSV under base_path, and the three-part target table name.
table_orderline = 'orderline'
orderline_csv = '{}/OrderLine_Samples_ADB.csv'.format(base_path)
target_full_path = '{}.{}.{}'.format(catalog_name, schema_name, table_orderline)

In [None]:
# Load the OrderLine CSV into a DataFrame, announcing source and target first.
print('🔄 Loading OrderLine data')
print(f'📂 Source: {orderline_csv}')
print(f'🎯 Target: {target_full_path}')

# First row is the header; column types are inferred from the data.
orderline_df = spark.read.csv(orderline_csv, header=True, inferSchema=True)

print('✅ Data loaded successfully')
record_total = orderline_df.count()
print(f'📊 Records: {record_total}')
column_names = orderline_df.columns
print(f'📋 Columns: {column_names}')

# Preview the first 10 rows without truncating long cell values.
print('📖 Sample data:')
orderline_df.show(n=10, truncate=False)

In [None]:
# Validate and conform to target schema
print('🔍 Validating data quality...')

from pyspark.sql import functions as F
from pyspark.sql.types import StringType, DoubleType

# Target schema: column name -> (Spark type, default used when the source lacks the column).
target_schema = {
    'OrderLineId': (StringType(), ''),
    'OrderId': (StringType(), ''),
    'ProductId': (StringType(), ''),
    'Quantity': (DoubleType(), 0.0),
    'UnitPrice': (DoubleType(), 0.0),
    'LineTotal': (DoubleType(), 0.0),
    'CreatedBy': (StringType(), 'Script'),
}
required_columns = list(target_schema)

# Map each required column to the actual source column, preferring an exact
# match and falling back to a case-insensitive one. A plain `name in df.columns`
# test is case-sensitive, while withColumn/col resolve names case-insensitively
# by default, so a header such as `orderid` used to be reported as missing and
# then had its data overwritten by the default literal.
source_columns = orderline_df.columns
source_by_lower = {name.lower(): name for name in source_columns}
resolved_sources = {
    name: name if name in source_columns else source_by_lower.get(name.lower())
    for name in required_columns
}

missing_columns = [name for name, source in resolved_sources.items() if source is None]
if missing_columns:
    print(f'⚠️ Warning: Missing columns in source data: {missing_columns}')
else:
    print('✅ All required columns present in source data.')

# Build the target projection in one pass: take the source column when it
# exists, otherwise the default literal; cast to the target type and give the
# column its canonical name. Output order follows required_columns.
projection = []
for name, (spark_type, default_value) in target_schema.items():
    source_name = resolved_sources[name]
    value = F.lit(default_value) if source_name is None else col(source_name)
    projection.append(value.cast(spark_type).alias(name))
orderline_df = orderline_df.select(projection)

# Report per-column NULL counts, then the row distribution by ProductId.
print('📊 Data Quality Check:')

# Single aggregation pass: for each required column, sum a 0/1 "is NULL" flag.
null_counts = orderline_df.select(
    [F.sum(col(c).isNull().cast('int')).alias(c) for c in required_columns]
).collect()[0]

for col_name in required_columns:
    # SUM over zero rows yields NULL (None in Python); treat that as 0 so an
    # empty source file reports "No nulls" instead of raising TypeError on `None > 0`.
    null_count = null_counts[col_name] or 0
    if null_count > 0:
        print(f'  {col_name}: {null_count} null values')
    else:
        print(f'  {col_name}: ✅ No nulls')

print('🎯 ProductId Distribution:')
orderline_df.groupBy('ProductId').count().orderBy('ProductId').show()
# Ensure the schema exists in the *target* catalog. An unqualified schema name
# is created in the session's current catalog, which is not necessarily
# `catalog_name`; the write below would then fail because the target schema
# is absent from the target catalog.
spark.sql(f'CREATE SCHEMA IF NOT EXISTS {catalog_name}.{schema_name}')
print(f'💾 Loading data to databricks table: {target_full_path}')

try:
    # Full refresh: replace both the data and the schema of the Delta table.
    (
        orderline_df.write
        .format('delta')
        .mode('overwrite')
        .option('overwriteSchema', 'true')
        .saveAsTable(target_full_path)
    )
    print(f'✅ Data loaded successfully to {target_full_path}')

    # Verify the load by reading the row count back from the table itself.
    result_count = spark.sql(f'SELECT COUNT(*) as count FROM {target_full_path}').collect()[0]['count']
    print(f'📊 Records in target table: {result_count}')
    print('\n📖 Sample from Silver table:')

    spark.sql(f'SELECT * FROM {target_full_path} ORDER BY OrderLineId').show(10, truncate=False)
    print('🎉 OrderLine data load complete!')
except Exception as e:
    # Top-level notebook boundary: surface a readable message, then re-raise so
    # the cell (and any job running it) is marked as failed.
    print(f'❌ Error loading data to table: {str(e)}')
    raise