# Databricks Job Demo: Complete ETL Pipeline
## 4 Notebook Series for Automated Data Processing

### Overview
This demo creates a complete ETL pipeline with 4 connected notebooks that can be orchestrated as a Databricks Job. The pipeline simulates a real-world e-commerce data processing scenario.

---

## **Notebook 1: Data Generation and Ingestion**
**File: `01_data_ingestion.py`**

```python
# Databricks notebook source
# MAGIC %md
# MAGIC # Data Ingestion Pipeline - Step 1
# MAGIC ## Generate and Ingest Raw E-commerce Data
# MAGIC 
# MAGIC **Purpose:** Create realistic e-commerce data and store in Bronze layer
# MAGIC **Output:** Raw customer, product, order and order-item data saved as Bronze Delta tables
# MAGIC **Job Role:** Data Ingestion

# COMMAND ----------

# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime, timedelta
import random
import uuid

# COMMAND ----------

# MAGIC %md
# MAGIC ## Setup and Configuration

# COMMAND ----------

# Job-overridable settings: every widget is declared with a default value so the
# notebook also runs interactively without parameters.
for _widget, _default, _label in (
    ("base_path", "/tmp/demo_pipeline", "Base Path"),
    ("num_customers", "1000", "Number of Customers"),
    ("num_orders", "5000", "Number of Orders"),
    ("run_date", str(datetime.now().date()), "Run Date"),
):
    dbutils.widgets.text(_widget, _default, _label)

# Read the effective values back; the two counts are converted to int.
base_path = dbutils.widgets.get("base_path")
num_customers = int(dbutils.widgets.get("num_customers"))
num_orders = int(dbutils.widgets.get("num_orders"))
run_date = dbutils.widgets.get("run_date")

print(
    "Pipeline Configuration:",
    f"Base Path: {base_path}",
    f"Customers: {num_customers}, Orders: {num_orders}",
    f"Run Date: {run_date}",
    sep="\n",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Create Raw Data Generators

# COMMAND ----------

def generate_customers(num_customers):
    """Generate synthetic customer records.

    Args:
        num_customers: Number of customer dictionaries to create.

    Returns:
        A list with ``num_customers`` dicts. Each dict has the keys
        ``customer_id``, ``first_name``, ``last_name``, ``email``, ``phone``,
        ``city``, ``state``, ``zip_code``, ``registration_date`` (ISO string),
        ``customer_tier`` and ``total_lifetime_value``.
    """
    # This notebook star-imports pyspark.sql.functions, which rebinds ``round``
    # to the Spark column function. Plain floats need the Python builtin.
    import builtins

    # City and state are kept as pairs so a customer never gets a state that
    # does not belong to the chosen city.
    locations = [
        ("New York", "NY"),
        ("Los Angeles", "CA"),
        ("Chicago", "IL"),
        ("Houston", "TX"),
        ("Phoenix", "AZ"),
        ("Philadelphia", "PA"),
        ("San Antonio", "TX"),
        ("San Diego", "CA"),
        ("Dallas", "TX"),
        ("San Jose", "CA"),
    ]
    first_names = ["John", "Jane", "Mike", "Sarah", "David", "Lisa", "Chris", "Emma", "Alex", "Maria"]
    last_names = ["Smith", "Johnson", "Williams", "Brown", "Jones", "Garcia", "Miller", "Davis", "Rodriguez", "Martinez"]
    tiers = ["Bronze", "Silver", "Gold", "Platinum"]

    customers = []
    for i in range(num_customers):
        city, state = random.choice(locations)
        customers.append({
            "customer_id": f"CUST_{i+1:06d}",
            "first_name": random.choice(first_names),
            "last_name": random.choice(last_names),
            "email": f"customer{i+1}@email.com",
            "phone": f"+1-{random.randint(100,999)}-{random.randint(100,999)}-{random.randint(1000,9999)}",
            "city": city,
            "state": state,
            "zip_code": f"{random.randint(10000, 99999)}",
            # Registration falls somewhere within the last 365 days.
            "registration_date": (datetime.now() - timedelta(days=random.randint(1, 365))).isoformat(),
            "customer_tier": random.choice(tiers),
            "total_lifetime_value": builtins.round(random.uniform(100, 10000), 2),
        })

    return customers

def generate_products():
    """Return the fixed ten-item product catalog as a list of dicts.

    Each dict carries ``product_id``, ``name``, ``category``, ``price`` and
    ``cost``, in that key order.
    """
    field_names = ("product_id", "name", "category", "price", "cost")
    catalog_rows = (
        ("PROD_001", "Wireless Headphones", "Electronics", 129.99, 65.00),
        ("PROD_002", "Smart Watch", "Electronics", 299.99, 150.00),
        ("PROD_003", "Laptop Bag", "Accessories", 79.99, 25.00),
        ("PROD_004", "Bluetooth Speaker", "Electronics", 89.99, 35.00),
        ("PROD_005", "Phone Case", "Accessories", 24.99, 8.00),
        ("PROD_006", "Tablet Stand", "Accessories", 39.99, 12.00),
        ("PROD_007", "USB Cable", "Accessories", 19.99, 5.00),
        ("PROD_008", "Power Bank", "Electronics", 49.99, 20.00),
        ("PROD_009", "Wireless Mouse", "Electronics", 34.99, 15.00),
        ("PROD_010", "Keyboard", "Electronics", 79.99, 30.00),
    )

    # Pair every row with the shared field names to rebuild the dict records.
    return [dict(zip(field_names, row)) for row in catalog_rows]

def generate_orders(num_orders, customers, products):
    """Generate synthetic orders together with their line items.

    Args:
        num_orders: Number of orders to create.
        customers: Sequence of dicts; only ``customer_id`` is read.
        products: Sequence of dicts; ``product_id``, ``name``, ``category``
            and ``price`` are read.

    Returns:
        A tuple ``(orders, order_items)`` of two lists of dicts. Every order
        has between 1 and 5 line items, and ``order_total`` is the rounded
        sum of the unrounded line totals.

    Raises:
        IndexError: If ``num_orders`` is positive and ``customers`` or
            ``products`` is empty (raised by ``random.choice``).
    """
    # This notebook star-imports pyspark.sql.functions, which rebinds ``round``
    # to the Spark column function. Plain floats need the Python builtin.
    import builtins

    round_amount = builtins.round

    statuses = ["Pending", "Processing", "Shipped", "Delivered", "Cancelled"]
    payment_methods = ["Credit Card", "Debit Card", "PayPal", "Cash"]

    orders = []
    order_items = []

    for i in range(num_orders):
        order_id = f"ORDER_{i+1:08d}"
        customer_id = random.choice(customers)["customer_id"]
        # Orders are spread over the last 30 days.
        order_date = datetime.now() - timedelta(days=random.randint(0, 30))

        # Generate 1-5 items per order
        num_items = random.randint(1, 5)
        order_total = 0

        for item_seq in range(num_items):
            product = random.choice(products)
            quantity = random.randint(1, 3)
            item_total = product["price"] * quantity
            order_total += item_total

            order_items.append({
                "order_id": order_id,
                "item_sequence": item_seq + 1,
                "product_id": product["product_id"],
                "product_name": product["name"],
                "category": product["category"],
                "unit_price": product["price"],
                "quantity": quantity,
                "item_total": round_amount(item_total, 2),
            })

        orders.append({
            "order_id": order_id,
            "customer_id": customer_id,
            "order_date": order_date.isoformat(),
            "order_status": random.choice(statuses),
            "payment_method": random.choice(payment_methods),
            "shipping_address": f"{random.randint(100, 999)} Main St",
            "order_total": round_amount(order_total, 2),
            # Discount is a random amount of up to 20% of the order total.
            "discount_amount": round_amount(random.uniform(0, order_total * 0.2), 2),
            # Flat 8% tax on the order total.
            "tax_amount": round_amount(order_total * 0.08, 2),
            "created_timestamp": datetime.now().isoformat(),
        })

    return orders, order_items

# COMMAND ----------

# MAGIC %md
# MAGIC ## Generate and Save Raw Data

# COMMAND ----------

# Build the in-memory source datasets; orders draw on the customers and
# products created just before them.
print("Generating customers...")
customers_data = generate_customers(num_customers)

print("Generating products...")
products_data = generate_products()

print("Generating orders...")
orders_data, order_items_data = generate_orders(
    num_orders,
    customers_data,
    products_data,
)

print(
    f"Generated: {len(customers_data)} customers, "
    f"{len(orders_data)} orders, "
    f"{len(order_items_data)} order items"
)

# COMMAND ----------

# Turn each list of dicts into a Spark DataFrame, in the same order as before.
customers_df, products_df, orders_df, order_items_df = (
    spark.createDataFrame(_records)
    for _records in (customers_data, products_data, orders_data, order_items_data)
)

print("DataFrames created successfully")
# Each count() triggers a Spark action on the corresponding DataFrame.
for _label, _frame in (
    ("Customers", customers_df),
    ("Products", products_df),
    ("Orders", orders_df),
    ("Order Items", order_items_df),
):
    print(f"{_label}: {_frame.count()} rows")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Save to Bronze Layer (Raw Data)

# COMMAND ----------

# Root folder for all Bronze tables.
bronze_path = f"{base_path}/bronze"

print("Saving to Bronze layer...")

# Each entity is overwritten at <bronze_path>/<entity> and registered as
# table bronze_<entity>.
for _entity, _frame in (
    ("customers", customers_df),
    ("products", products_df),
    ("orders", orders_df),
    ("order_items", order_items_df),
):
    (
        _frame.write
        .mode("overwrite")
        .option("path", f"{bronze_path}/{_entity}")
        .saveAsTable(f"bronze_{_entity}")
    )

print("Bronze layer data saved successfully!")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Data Quality Checks

# COMMAND ----------

# Basic data quality checks: row counts are read back from the saved tables.
print("=== DATA QUALITY REPORT ===")
print(f"Customers table: {spark.table('bronze_customers').count()} records")
print(f"Products table: {spark.table('bronze_products').count()} records")
print(f"Orders table: {spark.table('bronze_orders').count()} records")
print(f"Order Items table: {spark.table('bronze_order_items').count()} records")

# Check for duplicates: a key is duplicated when dropping duplicates on it
# reduces the row count. Each total is computed once and reused.
customer_rows = customers_df.count()
order_rows = orders_df.count()
duplicate_customers = customer_rows - customers_df.dropDuplicates(['customer_id']).count()
duplicate_orders = order_rows - orders_df.dropDuplicates(['order_id']).count()

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print(f"\nDuplicate customers: {duplicate_customers}")
print(f"Duplicate orders: {duplicate_orders}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Job Success Metrics

# COMMAND ----------

# Summarise this run for monitoring; row counts come from the in-memory DataFrames.
job_metrics = dict(
    job_name="data_ingestion",
    run_date=run_date,
    records_processed={
        entity: frame.count()
        for entity, frame in (
            ("customers", customers_df),
            ("products", products_df),
            ("orders", orders_df),
            ("order_items", order_items_df),
        )
    },
    execution_time=datetime.now().isoformat(),
    status="SUCCESS",
)

# The metrics are only printed here (they could be written to a monitoring table).
print("Job Metrics:", job_metrics)

# End the notebook and hand the status string back to the caller.
dbutils.notebook.exit("SUCCESS")
```

---

## **Notebook 2: Data Cleansing and Validation**
**File: `02_data_cleansing.py`**

```python
# Databricks notebook source
# MAGIC %md
# MAGIC # Data Cleansing Pipeline - Step 2
# MAGIC ## Clean and Validate Bronze Data
# MAGIC 
# MAGIC **Purpose:** Apply data quality rules and create Silver layer
# MAGIC **Input:** Bronze layer tables
# MAGIC **Output:** Cleansed Silver tables
# MAGIC **Job Role:** Data Cleansing

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime
import re

# COMMAND ----------

# MAGIC %md
# MAGIC ## Configuration and Parameters

# COMMAND ----------

# Job-overridable settings, declared with defaults for interactive runs.
for _widget, _default, _label in (
    ("base_path", "/tmp/demo_pipeline", "Base Path"),
    ("data_quality_threshold", "0.95", "Data Quality Threshold"),
):
    dbutils.widgets.text(_widget, _default, _label)

base_path = dbutils.widgets.get("base_path")
# The threshold is read as text and converted to a float fraction.
quality_threshold = float(dbutils.widgets.get("data_quality_threshold"))

print(
    "Cleansing Configuration:",
    f"Base Path: {base_path}",
    f"Quality Threshold: {quality_threshold}",
    sep="\n",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Read Bronze Layer Data

# COMMAND ----------

# Load the four Bronze tables by their registered names.
bronze_customers, bronze_products, bronze_orders, bronze_order_items = (
    spark.table(f"bronze_{_entity}")
    for _entity in ("customers", "products", "orders", "order_items")
)

print("Bronze data loaded:")
for _label, _frame in (
    ("Customers", bronze_customers),
    ("Products", bronze_products),
    ("Orders", bronze_orders),
    ("Order Items", bronze_order_items),
):
    print(f"{_label}: {_frame.count()}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Data Cleansing Functions

# COMMAND ----------

def clean_customers(df):
    """Filter and standardise customer rows.

    Rows with a null ``customer_id``, ``email``, ``first_name`` or
    ``last_name`` are removed. The remaining rows get a lower-cased email,
    initcap-ed names, a digits-only ``phone_cleaned`` column, the
    ``is_valid_email`` and ``is_valid_phone`` flags and a ``processed_date``
    timestamp.

    Args:
        df: DataFrame with the customer columns named above plus ``phone``.

    Returns:
        The filtered DataFrame with the added or overwritten columns.
    """
    # AND together the not-null requirement for every mandatory column.
    has_required_fields = col("customer_id").isNotNull()
    for required in ("email", "first_name", "last_name"):
        has_required_fields = has_required_fields & col(required).isNotNull()

    email_pattern = "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"

    result = df.filter(has_required_fields)
    # Casing is normalised first, so the email flag below reads the lower-cased value.
    result = result.withColumn("email", lower(col("email")))
    result = result.withColumn("first_name", initcap(col("first_name")))
    result = result.withColumn("last_name", initcap(col("last_name")))
    # Strip everything except digits before judging the phone length.
    result = result.withColumn("phone_cleaned", regexp_replace(col("phone"), "[^0-9]", ""))
    result = result.withColumn("is_valid_email", col("email").rlike(email_pattern))
    result = result.withColumn("is_valid_phone", length(col("phone_cleaned")) >= 10)

    return result.withColumn("processed_date", current_timestamp())

def clean_orders(df):
    """Filter invalid orders and add derived order columns.

    Rows with a null ``order_id`` or ``customer_id``, or with an
    ``order_total`` that is not greater than zero, are removed. Added columns:
    ``order_date_parsed``, ``net_amount``, ``is_valid_status``,
    ``is_large_order``, ``order_age_days`` and ``processed_date``.

    Args:
        df: DataFrame with ``order_id``, ``customer_id``, ``order_total``,
            ``order_date``, ``discount_amount``, ``tax_amount`` and
            ``order_status`` columns.

    Returns:
        The filtered DataFrame with the derived columns appended.
    """
    cleaned_df = df.filter(
        # Remove invalid orders.
        # The comparison must be parenthesised: Python's ``&`` binds tighter
        # than ``>``, so without parentheses the whole conjunction would be
        # compared against 0.
        col("order_id").isNotNull() &
        col("customer_id").isNotNull() &
        (col("order_total") > 0)
    ).withColumn(
        # Parse order date
        "order_date_parsed", to_timestamp(col("order_date"))
    ).withColumn(
        # Calculate net amount: total minus discount plus tax
        "net_amount", col("order_total") - col("discount_amount") + col("tax_amount")
    ).withColumn(
        # Validate order status against the known set of statuses
        "is_valid_status", col("order_status").isin(["Pending", "Processing", "Shipped", "Delivered", "Cancelled"])
    ).withColumn(
        # Add business rules
        "is_large_order", col("order_total") > 500
    ).withColumn(
        "order_age_days", datediff(current_date(), col("order_date_parsed"))
    ).withColumn(
        "processed_date", current_timestamp()
    )

    return cleaned_df

def clean_order_items(df):
    """Filter invalid order items and flag inconsistent line totals.

    Rows with a null ``order_id`` or ``product_id``, or with a ``quantity``
    or ``unit_price`` that is not greater than zero, are removed. Added
    columns: ``calculated_total``, ``total_variance``, ``is_total_accurate``
    and ``processed_date``.

    Args:
        df: DataFrame with ``order_id``, ``product_id``, ``quantity``,
            ``unit_price`` and ``item_total`` columns.

    Returns:
        The filtered DataFrame with the validation columns appended.
    """
    cleaned_df = df.filter(
        # Remove invalid items.
        # Each comparison is parenthesised: ``&`` binds tighter than ``>``, so
        # the unparenthesised form becomes a chained comparison that tries to
        # convert a Column to bool.
        col("order_id").isNotNull() &
        col("product_id").isNotNull() &
        (col("quantity") > 0) &
        (col("unit_price") > 0)
    ).withColumn(
        # Recompute the line total from price and quantity
        "calculated_total", round(col("unit_price") * col("quantity"), 2)
    ).withColumn(
        "total_variance", abs(col("item_total") - col("calculated_total"))
    ).withColumn(
        # The stored total counts as accurate when it is within one cent
        "is_total_accurate", col("total_variance") < 0.01
    ).withColumn(
        "processed_date", current_timestamp()
    )

    return cleaned_df

# COMMAND ----------

# MAGIC %md
# MAGIC ## Apply Data Cleansing

# COMMAND ----------

print("Applying data cleansing rules...")

# Build the four Silver DataFrames; products only receive a processing timestamp.
silver_customers, silver_products, silver_orders, silver_order_items = (
    clean_customers(bronze_customers),
    bronze_products.withColumn("processed_date", current_timestamp()),
    clean_orders(bronze_orders),
    clean_order_items(bronze_order_items),
)

print("Data cleansing completed")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Data Quality Assessment

# COMMAND ----------

# Calculate quality metrics
def calculate_quality_metrics(original_df, cleaned_df, entity_name, threshold=None):
    """Compare row counts before and after cleansing and print a short report.

    Args:
        original_df: Object exposing ``count()`` for the uncleansed data.
        cleaned_df: Object exposing ``count()`` for the cleansed data.
        entity_name: Label used in the printed report and the result.
        threshold: Minimum retention rate required to pass. When omitted,
            the notebook-level ``quality_threshold`` is used.

    Returns:
        A dict with ``entity``, ``original_count``, ``cleaned_count``,
        ``retention_rate`` and ``quality_passed``.
    """
    if threshold is None:
        threshold = quality_threshold

    original_count = original_df.count()
    cleaned_count = cleaned_df.count()
    # An empty input yields a retention rate of 0 instead of dividing by zero.
    retention_rate = cleaned_count / original_count if original_count > 0 else 0

    # "\n" (not "\\n") so a blank line separates the reports.
    print(f"\n{entity_name} Quality Metrics:")
    print(f"  Original records: {original_count}")
    print(f"  Cleaned records: {cleaned_count}")
    print(f"  Retention rate: {retention_rate:.2%}")
    print(f"  Records removed: {original_count - cleaned_count}")

    return {
        "entity": entity_name,
        "original_count": original_count,
        "cleaned_count": cleaned_count,
        "retention_rate": retention_rate,
        "quality_passed": retention_rate >= threshold
    }

# One metrics dict per cleansed entity, in customers / orders / order-items order.
quality_report = [
    calculate_quality_metrics(_before, _after, _label)
    for _before, _after, _label in (
        (bronze_customers, silver_customers, "Customers"),
        (bronze_orders, silver_orders, "Orders"),
        (bronze_order_items, silver_order_items, "Order Items"),
    )
]

# COMMAND ----------

# MAGIC %md
# MAGIC ## Additional Data Quality Checks

# COMMAND ----------

# Email validation check
invalid_emails = silver_customers.filter(col("is_valid_email") == False).count()
# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print("\nAdditional Quality Checks:")
print(f"Invalid email addresses: {invalid_emails}")

# Order total validation
order_total_issues = silver_order_items.filter(col("is_total_accurate") == False).count()
print(f"Order total calculation issues: {order_total_issues}")

# Orphaned order items: line items whose order is missing from silver_orders,
# or whose order references a customer missing from silver_customers.
# Items link to orders by order_id and orders link to customers by customer_id;
# an order_id never starts with a customer_id, so prefix matching cannot be used.
orders_with_customer = silver_orders.join(
    silver_customers.select("customer_id"),
    "customer_id",
    "left_semi"
).select("order_id")
orphaned_items = silver_order_items.join(
    orders_with_customer,
    "order_id",
    "left_anti"
).count()
print(f"Orphaned order items: {orphaned_items}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Save Silver Layer Data

# COMMAND ----------

# Root folder for all Silver tables.
silver_path = f"{base_path}/silver"

print("Saving to Silver layer...")

# Each entity is overwritten at <silver_path>/<entity> and registered as
# table silver_<entity>.
for _entity, _frame in (
    ("customers", silver_customers),
    ("products", silver_products),
    ("orders", silver_orders),
    ("order_items", silver_order_items),
):
    (
        _frame.write
        .mode("overwrite")
        .option("path", f"{silver_path}/{_entity}")
        .saveAsTable(f"silver_{_entity}")
    )

print("Silver layer data saved successfully!")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Quality Gate Check

# COMMAND ----------

# Determine if quality standards are met: every entity in the report must
# have cleared the retention threshold.
overall_quality_passed = all(report["quality_passed"] for report in quality_report)

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print("\n=== QUALITY GATE ASSESSMENT ===")
print(f"Quality Threshold: {quality_threshold:.1%}")
print(f"Overall Quality Status: {'PASSED' if overall_quality_passed else 'FAILED'}")

if not overall_quality_passed:
    print("\nQuality issues detected:")
    # List only the entities that fell below the threshold.
    for report in quality_report:
        if not report["quality_passed"]:
            print(f"  - {report['entity']}: {report['retention_rate']:.1%} retention rate")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Job Output and Metrics

# COMMAND ----------

# Collect everything this run produced into one result dictionary.
job_result = dict(
    job_name="data_cleansing",
    execution_time=datetime.now().isoformat(),
    quality_passed=overall_quality_passed,
    quality_report=quality_report,
    silver_tables_created=[
        "silver_customers",
        "silver_products",
        "silver_orders",
        "silver_order_items",
    ],
    additional_checks=dict(
        invalid_emails=invalid_emails,
        order_total_issues=order_total_issues,
        orphaned_items=orphaned_items,
    ),
)

print("Job execution completed")
print("Result:", job_result)

# End the notebook with a status string that reflects the quality gate.
# NOTE(review): this only returns a value; confirm the orchestrating job
# inspects it, otherwise a failed gate will not stop downstream tasks.
if overall_quality_passed:
    dbutils.notebook.exit("SUCCESS")
else:
    dbutils.notebook.exit("QUALITY_FAILED")
```

---

## **Notebook 3: Data Transformation and Enrichment**
**File: `03_data_transformation.py`**

```python
# Databricks notebook source
# MAGIC %md
# MAGIC # Data Transformation Pipeline - Step 3
# MAGIC ## Transform and Enrich Silver Data for Analytics
# MAGIC 
# MAGIC **Purpose:** Create business-ready Gold layer tables
# MAGIC **Input:** Silver layer tables
# MAGIC **Output:** Aggregated and enriched Gold tables
# MAGIC **Job Role:** Data Transformation

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from datetime import datetime

# COMMAND ----------

# MAGIC %md
# MAGIC ## Configuration

# COMMAND ----------

# Job-overridable settings, declared with defaults for interactive runs.
for _widget, _default, _label in (
    ("base_path", "/tmp/demo_pipeline", "Base Path"),
    ("analysis_date", str(datetime.now().date()), "Analysis Date"),
):
    dbutils.widgets.text(_widget, _default, _label)

base_path = dbutils.widgets.get("base_path")
analysis_date = dbutils.widgets.get("analysis_date")

print(
    "Transformation Configuration:",
    f"Base Path: {base_path}",
    f"Analysis Date: {analysis_date}",
    sep="\n",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Read Silver Layer Data

# COMMAND ----------

# Load the four Silver tables by their registered names.
silver_customers, silver_products, silver_orders, silver_order_items = (
    spark.table(f"silver_{_entity}")
    for _entity in ("customers", "products", "orders", "order_items")
)

print("Silver data loaded:")
for _label, _frame in (
    ("Customers", silver_customers),
    ("Products", silver_products),
    ("Orders", silver_orders),
    ("Order Items", silver_order_items),
):
    print(f"{_label}: {_frame.count()}")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Business Logic Transformations

# COMMAND ----------

# Customer analytics
def create_customer_analytics():
    """Build the per-customer analytics DataFrame.

    Aggregates ``silver_orders`` per customer, left-joins the result onto
    ``silver_customers``, applies ``fillna(0)`` and adds the columns
    ``customer_segment``, ``days_since_last_order``,
    ``customer_lifetime_days`` and ``is_active_customer``.

    Returns:
        The enriched customer DataFrame.
    """
    # Per-customer order statistics.
    order_stats = silver_orders.groupBy("customer_id").agg(
        count("order_id").alias("total_orders"),
        sum("order_total").alias("total_spent"),
        avg("order_total").alias("avg_order_value"),
        max("order_date_parsed").alias("last_order_date"),
        min("order_date_parsed").alias("first_order_date"),
    )

    # Segments are decided by order count, highest tier first.
    segment = (
        when(col("total_orders") >= 10, "VIP")
        .when(col("total_orders") >= 5, "Loyal")
        .when(col("total_orders") >= 2, "Regular")
        .otherwise("New")
    )

    # The left join keeps customers that have no orders at all.
    enriched = silver_customers.join(order_stats, "customer_id", "left").fillna(0)
    enriched = enriched.withColumn("customer_segment", segment)
    enriched = enriched.withColumn(
        "days_since_last_order", datediff(current_date(), col("last_order_date"))
    )
    enriched = enriched.withColumn(
        "customer_lifetime_days", datediff(current_date(), col("first_order_date"))
    )

    # A customer is active when the last order is at most 90 days old.
    return enriched.withColumn("is_active_customer", col("days_since_last_order") <= 90)

# Product performance analytics
def create_product_analytics():
    """Build the per-product performance DataFrame.

    Aggregates ``silver_order_items`` per product, left-joins the result onto
    ``silver_products``, applies ``fillna(0)`` and adds ``revenue_rank``,
    ``quantity_rank``, ``profit_margin`` and ``performance_category``.

    Returns:
        The enriched product DataFrame.
    """
    # Calculate product metrics
    product_sales = silver_order_items.groupBy("product_id").agg(
        count("order_id").alias("total_orders"),
        sum("quantity").alias("total_quantity_sold"),
        sum("item_total").alias("total_revenue"),
        avg("unit_price").alias("avg_selling_price"),
        countDistinct("order_id").alias("unique_orders")
    )

    # Add product information and rankings
    product_analytics = silver_products.join(product_sales, "product_id", "left").fillna(0).withColumn(
        "revenue_rank",
        row_number().over(Window.orderBy(desc("total_revenue")))
    ).withColumn(
        "quantity_rank",
        row_number().over(Window.orderBy(desc("total_quantity_sold")))
    ).withColumn(
        # Margin as a percentage of the list price
        "profit_margin",
        round((col("price") - col("cost")) / col("price") * 100, 2)
    ).withColumn(
        "performance_category",
        # Unsold products are classified first. They still receive a rank, so
        # checking the rank first would label them Top/High Performer whenever
        # the catalog has ten products or fewer.
        when(col("total_revenue") <= 0, "No Sales")
        .when(col("revenue_rank") <= 3, "Top Performer")
        .when(col("revenue_rank") <= 10, "High Performer")
        .otherwise("Regular Performer")
    )

    return product_analytics

# Sales analytics by time period
def create_sales_analytics():
    """Build the daily sales summary with trailing seven-row averages.

    Orders from ``silver_orders`` are grouped by calendar date. Each day gets
    order count, revenue, average order value, distinct customer count, the
    weekday number and name, and averages of revenue and order count over the
    current row plus the six preceding rows.

    Returns:
        The daily sales DataFrame.
    """
    orders_by_day = silver_orders.withColumn(
        "order_date", to_date("order_date_parsed")
    ).groupBy("order_date")

    daily = orders_by_day.agg(
        count("order_id").alias("daily_orders"),
        sum("order_total").alias("daily_revenue"),
        avg("order_total").alias("avg_order_value"),
        countDistinct("customer_id").alias("unique_customers"),
    )
    daily = daily.withColumn("day_of_week", dayofweek("order_date"))

    # Map weekday numbers 1..7 to names, starting with Sunday for 1.
    weekday = col("day_of_week")
    day_name = when(weekday == 1, "Sunday")
    for number, label in enumerate(
        ("Monday", "Tuesday", "Wednesday", "Thursday", "Friday", "Saturday"), start=2
    ):
        day_name = day_name.when(weekday == number, label)
    daily = daily.withColumn("day_name", day_name)

    # Window covering the current row and the six rows before it, by date.
    trailing_rows = Window.orderBy("order_date").rowsBetween(-6, 0)

    with_revenue_avg = daily.withColumn(
        "seven_day_avg_revenue", round(avg("daily_revenue").over(trailing_rows), 2)
    )
    return with_revenue_avg.withColumn(
        "seven_day_avg_orders", round(avg("daily_orders").over(trailing_rows), 2)
    )

# Category performance
def create_category_analytics():
    """Build the per-category performance DataFrame.

    Order items are joined to the product catalog to obtain each product's
    category, then aggregated per category. ``revenue_percentage`` is each
    category's share of the revenue summed over all categories.

    Returns:
        A DataFrame with one row per category.
    """
    # The order items already carry their own ``category`` column. It is
    # dropped before the join so only the catalog's column remains; otherwise
    # the join output holds two ``category`` columns and the groupBy below
    # is ambiguous.
    items_without_category = silver_order_items.drop("category")

    category_performance = items_without_category.join(
        silver_products.select("product_id", "category"), "product_id"
    ).groupBy("category").agg(
        sum("item_total").alias("category_revenue"),
        sum("quantity").alias("category_quantity"),
        count("order_id").alias("category_orders"),
        countDistinct("order_id").alias("unique_orders"),
        avg("unit_price").alias("avg_price")
    ).withColumn(
        "revenue_percentage",
        # An empty partitionBy() makes the window span every category row
        round(col("category_revenue") / sum("category_revenue").over(Window.partitionBy()) * 100, 2)
    )

    return category_performance

# COMMAND ----------

# MAGIC %md
# MAGIC ## Execute Transformations

# COMMAND ----------

print("Creating business analytics tables...")

# Build the four analytics DataFrames in customer / product / sales / category order.
(
    gold_customer_analytics,
    gold_product_analytics,
    gold_sales_analytics,
    gold_category_analytics,
) = (
    create_customer_analytics(),
    create_product_analytics(),
    create_sales_analytics(),
    create_category_analytics(),
)

print("Transformations completed")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Data Enrichment

# COMMAND ----------

# Create a comprehensive order fact table
def create_order_fact():
    """Build the enriched order fact DataFrame.

    Each order from ``silver_orders`` is inner-joined with selected customer
    attributes and with a per-order roll-up of its line items. The result is
    projected to the fact columns and extended with ``order_complexity`` and
    ``discount_percentage``.

    Returns:
        The order fact DataFrame.
    """
    customer_attrs = silver_customers.select(
        "customer_id", "customer_tier", "state"
    ).alias("c")

    # One row per order: total units and number of distinct products.
    item_rollup = silver_order_items.groupBy("order_id").agg(
        sum("quantity").alias("total_items"),
        countDistinct("product_id").alias("unique_products"),
    ).alias("oi")

    joined = (
        silver_orders.alias("o")
        .join(customer_attrs, "customer_id")
        .join(item_rollup, "order_id")
    )

    fact_columns = [
        col("o.order_id"),
        col("o.customer_id"),
        col("c.customer_tier"),
        col("c.state"),
        col("o.order_date_parsed").alias("order_date"),
        col("o.order_status"),
        col("o.payment_method"),
        col("o.order_total"),
        col("o.discount_amount"),
        col("o.tax_amount"),
        col("o.net_amount"),
        col("oi.total_items"),
        col("oi.unique_products"),
        col("o.is_large_order"),
        col("o.order_age_days"),
    ]
    fact = joined.select(*fact_columns)

    # Complexity is graded by how many different products the order contains.
    complexity = (
        when(col("unique_products") >= 5, "Complex")
        .when(col("unique_products") >= 3, "Moderate")
        .otherwise("Simple")
    )
    fact = fact.withColumn("order_complexity", complexity)

    return fact.withColumn(
        "discount_percentage",
        round(col("discount_amount") / col("order_total") * 100, 2),
    )

gold_order_fact = create_order_fact()

print("Order fact table created")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Save Gold Layer Tables

# COMMAND ----------

# Root folder for all Gold tables.
gold_path = f"{base_path}/gold"

print("Saving to Gold layer...")

# Each table is overwritten at <gold_path>/<name> and registered as gold_<name>.
for _name, _frame in (
    ("customer_analytics", gold_customer_analytics),
    ("product_analytics", gold_product_analytics),
    ("sales_analytics", gold_sales_analytics),
    ("category_analytics", gold_category_analytics),
    ("order_fact", gold_order_fact),
):
    (
        _frame.write
        .mode("overwrite")
        .option("path", f"{gold_path}/{_name}")
        .saveAsTable(f"gold_{_name}")
    )

print("Gold layer tables saved successfully!")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Business Insights Summary

# COMMAND ----------

# Generate business insights
print("=== BUSINESS INSIGHTS SUMMARY ===")

# Customer insights
total_customers = gold_customer_analytics.count()
vip_customers = gold_customer_analytics.filter(col("customer_segment") == "VIP").count()
active_customers = gold_customer_analytics.filter(col("is_active_customer") == True).count()

# Shares are reported as 0% when there are no customers, instead of raising
# ZeroDivisionError.
vip_share = vip_customers / total_customers if total_customers else 0
active_share = active_customers / total_customers if total_customers else 0

# "\n" (not "\\n") so a blank line is printed rather than a literal backslash-n.
print("\nCustomer Analysis:")
print(f"  Total customers: {total_customers}")
print(f"  VIP customers: {vip_customers} ({vip_share:.1%})")
print(f"  Active customers: {active_customers} ({active_share:.1%})")

# Product insights
top_product = gold_product_analytics.orderBy(desc("total_revenue")).first()
print("\nProduct Analysis:")
# first() returns None on an empty table
if top_product is not None:
    print(f"  Top revenue product: {top_product['name']} (${top_product['total_revenue']:,.2f})")
else:
    print("  Top revenue product: n/a")

# Sales insights: both aggregates come from a single pass. They are None when
# there are no sales rows, so they fall back to 0.0 to stay printable and
# convertible with float() later on.
sales_totals = gold_sales_analytics.agg(sum("daily_revenue"), avg("daily_revenue")).collect()[0]
total_revenue = sales_totals[0] if sales_totals[0] is not None else 0.0
avg_daily_revenue = sales_totals[1] if sales_totals[1] is not None else 0.0

print("\nSales Analysis:")
print(f"  Total revenue: ${total_revenue:,.2f}")
print(f"  Average daily revenue: ${avg_daily_revenue:,.2f}")

# Category insights
top_category = gold_category_analytics.orderBy(desc("category_revenue")).first()
print("\nCategory Analysis:")
if top_category is not None:
    print(f"  Top category: {top_category['category']} (${top_category['category_revenue']:,.2f})")
else:
    print("  Top category: n/a")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Job Completion

# COMMAND ----------

# Summarise the transformation run, reusing the figures computed for the
# business insights summary.
transformation_metrics = dict(
    job_name="data_transformation",
    execution_time=datetime.now().isoformat(),
    gold_tables_created=[
        "gold_customer_analytics",
        "gold_product_analytics",
        "gold_sales_analytics",
        "gold_category_analytics",
        "gold_order_fact",
    ],
    business_metrics=dict(
        total_customers=total_customers,
        vip_customers=vip_customers,
        active_customers=active_customers,
        # Revenue figures are converted to plain floats for the metrics dict.
        total_revenue=float(total_revenue),
        avg_daily_revenue=float(avg_daily_revenue),
    ),
    status="SUCCESS",
)

print("Transformation job completed successfully")
print("Metrics:", transformation_metrics)

# End the notebook and hand the status string back to the caller.
dbutils.notebook.exit("SUCCESS")
```

---

## **Notebook 4: Data Quality Monitoring and Reporting**
**File: `04_data_monitoring.py`**

```python
# Databricks notebook source
# MAGIC %md
# MAGIC # Data Monitoring and Reporting - Step 4
# MAGIC ## Monitor Pipeline Health and Generate Reports
# MAGIC 
# MAGIC **Purpose:** Monitor data quality and generate business reports
# MAGIC **Input:** All layer tables (Bronze, Silver, Gold)
# MAGIC **Output:** Quality reports and business dashboards
# MAGIC **Job Role:** Data Monitoring & Reporting

# COMMAND ----------

from pyspark.sql.functions import *
from pyspark.sql.types import *
from datetime import datetime, timedelta
import json

# COMMAND ----------

# MAGIC %md
# MAGIC ## Configuration

# COMMAND ----------

# Job-overridable settings, declared with defaults for interactive runs.
for _widget, _default, _label in (
    ("base_path", "/tmp/demo_pipeline", "Base Path"),
    ("report_date", str(datetime.now().date()), "Report Date"),
    ("alert_threshold", "0.9", "Alert Threshold"),
):
    dbutils.widgets.text(_widget, _default, _label)

base_path = dbutils.widgets.get("base_path")
report_date = dbutils.widgets.get("report_date")
# The threshold is read as text and converted to a float fraction.
alert_threshold = float(dbutils.widgets.get("alert_threshold"))

print(
    "Monitoring Configuration:",
    f"Base Path: {base_path}",
    f"Report Date: {report_date}",
    f"Alert Threshold: {alert_threshold}",
    sep="\n",
)

# COMMAND ----------

# MAGIC %md
# MAGIC ## Data Quality Monitoring

# COMMAND ----------

def check_table_health(table_name, expected_min_rows=0):
    """Check basic health metrics for a table.

    Counts the rows and, for every column, the NULL values, then derives a
    health score of ``1 - (average NULL fraction across columns)``.

    Args:
        table_name: Name of the table, resolved with ``spark.table``.
        expected_min_rows: Minimum number of rows the table must contain.

    Returns:
        On success, a dict with ``table_name``, ``row_count``,
        ``column_count``, ``health_score``, ``meets_min_rows``,
        ``null_analysis`` (one ``column`` / ``null_count`` /
        ``null_percentage`` entry per column) and ``status``: "HEALTHY" when
        the score reaches the notebook-level ``alert_threshold`` and the row
        minimum is met, otherwise "UNHEALTHY".
        If anything raises, a dict with ``table_name``, ``error`` and
        ``status`` "ERROR" is returned instead of propagating the exception.
    """

    try:
        df = spark.table(table_name)
        column_names = df.columns
        total_columns = len(column_names)

        # A single aggregation returns the row count and every per-column
        # NULL count together, instead of launching one Spark job per column.
        # Results are read by position so that table column names can never
        # clash with the names of the aggregate outputs.
        stats = df.agg(
            count(lit(1)),
            *[count(when(col(name).isNull(), 1)) for name in column_names],
        ).first()
        row_count = stats[0]

        null_checks = []
        null_fraction_total = 0.0
        for position, name in enumerate(column_names, start=1):
            null_count = stats[position]
            null_percentage = null_count / row_count if row_count > 0 else 0
            null_fraction_total += null_percentage
            null_checks.append({
                "column": name,
                "null_count": null_count,
                "null_percentage": null_percentage
            })

        # Overall health score.
        # The running total above deliberately avoids calling ``sum(...)``:
        # the notebook's ``from pyspark.sql.functions import *`` rebinds
        # ``sum`` to the Spark column aggregate, which rejects a Python list.
        avg_null_percentage = null_fraction_total / total_columns if total_columns else 0
        health_score = 1 - avg_null_percentage
        meets_min_rows = row_count >= expected_min_rows

        return {
            "table_name": table_name,
            "row_count": row_count,
            "column_count": total_columns,
            "health_score": health_score,
            "meets_min_rows": meets_min_rows,
            "null_analysis": null_checks,
            "status": "HEALTHY" if health_score >= alert_threshold and meets_min_rows else "UNHEALTHY"
        }

    except Exception as e:
        # Best-effort monitoring: a missing or unreadable table is reported
        # as an ERROR entry so the remaining tables are still checked.
        return {
            "table_name": table_name,
            "error": str(e),
            "status": "ERROR"
        }

# COMMAND ----------

print("Checking table health across all layers...")

# Tables to inspect, grouped by pipeline layer
bronze_tables = ["bronze_customers", "bronze_products", "bronze_orders", "bronze_order_items"]
silver_tables = ["silver_customers", "silver_products", "silver_orders", "silver_order_items"]
gold_tables = ["gold_customer_analytics", "gold_product_analytics", "gold_sales_analytics", "gold_category_analytics", "gold_order_fact"]

# Run the checks layer by layer (Bronze, then Silver, then Gold) so the
# report keeps that order; every table must hold at least one row.
table_health_report = []
for tables_in_layer in (bronze_tables, silver_tables, gold_tables):
    for table in tables_in_layer:
        health = check_table_health(table, expected_min_rows=1)
        table_health_report.append(health)

print("Table health checks completed")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Data Quality Report

# COMMAND ----------

# `from pyspark.sql.functions import *` at the top of this notebook rebinds
# `sum` to the Spark column aggregate, so the Python builtin is reached
# explicitly through the `builtins` module.
import builtins

# Generate comprehensive quality report
print("=== DATA QUALITY REPORT ===")
print(f"Report Date: {report_date}")
print(f"Report Time: {datetime.now()}")

# Split the per-table results by status; these lists are reused by the
# summary and alerting cells further down.
healthy_tables = [report for report in table_health_report if report.get("status") == "HEALTHY"]
unhealthy_tables = [report for report in table_health_report if report.get("status") == "UNHEALTHY"]
error_tables = [report for report in table_health_report if report.get("status") == "ERROR"]

print("\nOverall Status:")
print(f"  Healthy tables: {len(healthy_tables)}")
print(f"  Unhealthy tables: {len(unhealthy_tables)}")
print(f"  Error tables: {len(error_tables)}")
print(f"  Total tables: {len(table_health_report)}")

# Detailed report for each layer
for layer in ["bronze", "silver", "gold"]:
    layer_tables = [report for report in table_health_report if report["table_name"].startswith(layer)]
    # Tables whose check errored carry no health_score and count as 0 here.
    layer_health = (
        builtins.sum(report.get("health_score", 0) for report in layer_tables) / len(layer_tables)
        if layer_tables
        else 0
    )

    print(f"\n{layer.upper()} Layer Health: {layer_health:.2%}")
    for table_report in layer_tables:
        status_icon = "✅" if table_report.get("status") == "HEALTHY" else "❌"
        print(f"  {status_icon} {table_report['table_name']}: {table_report.get('row_count', 'N/A')} rows, {table_report.get('health_score', 0):.2%} health")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Business Metrics Dashboard

# COMMAND ----------

def generate_business_dashboard():
    """Print key business metrics from the Gold tables and return them.

    Reads ``gold_customer_analytics``, ``gold_sales_analytics``,
    ``gold_product_analytics`` and ``gold_category_analytics``.

    Returns:
        A dict with ``customers``, ``sales``, ``products`` and ``categories``
        sections.  ``categories["top_category"]`` is ``None`` when the
        category table is empty.  If anything raises, the error is printed
        and ``{"error": <message>}`` is returned instead.
    """

    def _share(part, whole):
        # Fraction of `whole`; 0.0 for an empty table instead of raising
        # ZeroDivisionError and discarding the whole dashboard.
        return part / whole if whole else 0.0

    print("\n=== BUSINESS DASHBOARD ===")

    try:
        # Customer metrics
        customer_metrics = spark.table("gold_customer_analytics")
        total_customers = customer_metrics.count()
        active_customers = customer_metrics.filter(col("is_active_customer") == True).count()
        vip_customers = customer_metrics.filter(col("customer_segment") == "VIP").count()

        print("\n📊 Customer Metrics:")
        print(f"   Total Customers: {total_customers:,}")
        print(f"   Active Customers: {active_customers:,} ({_share(active_customers, total_customers):.1%})")
        print(f"   VIP Customers: {vip_customers:,} ({_share(vip_customers, total_customers):.1%})")

        # Sales metrics.  `sum` here is the Spark aggregate brought in by the
        # notebook's star import; both totals come from one aggregation.
        sales_metrics = spark.table("gold_sales_analytics")
        sales_totals = sales_metrics.agg(sum("daily_revenue"), sum("daily_orders")).first()
        # The aggregates are NULL for an empty table, hence the `or 0`.
        total_revenue = sales_totals[0] or 0
        total_orders = sales_totals[1] or 0
        avg_order_value = total_revenue / total_orders if total_orders > 0 else 0

        print("\n💰 Sales Metrics:")
        print(f"   Total Revenue: ${total_revenue:,.2f}")
        print(f"   Total Orders: {total_orders:,}")
        print(f"   Average Order Value: ${avg_order_value:.2f}")

        # Product metrics
        product_metrics = spark.table("gold_product_analytics")
        total_products = product_metrics.count()
        top_performer_count = product_metrics.filter(col("performance_category") == "Top Performer").count()
        no_sales_count = product_metrics.filter(col("performance_category") == "No Sales").count()

        print("\n🛍️ Product Metrics:")
        print(f"   Total Products: {total_products:,}")
        print(f"   Top Performers: {top_performer_count:,}")
        print(f"   Products with No Sales: {no_sales_count:,}")

        # Category metrics.  `first()` returns None for an empty table, so
        # the top-category fields fall back to None / 0.0 in that case.
        category_metrics = spark.table("gold_category_analytics")
        top_category = category_metrics.orderBy(desc("category_revenue")).first()
        if top_category is not None:
            top_category_name = top_category["category"]
            top_category_revenue = float(top_category["category_revenue"] or 0)
        else:
            top_category_name = None
            top_category_revenue = 0.0

        print("\n📦 Category Metrics:")
        print(f"   Top Category: {top_category_name or 'N/A'}")
        print(f"   Top Category Revenue: ${top_category_revenue:,.2f}")
        print(f"   Total Categories: {category_metrics.count()}")

        return {
            "customers": {
                "total": total_customers,
                "active": active_customers,
                "vip": vip_customers
            },
            "sales": {
                "total_revenue": float(total_revenue),
                "total_orders": int(total_orders),
                "avg_order_value": float(avg_order_value)
            },
            "products": {
                "total": total_products,
                "top_performers": top_performer_count,
                "no_sales": no_sales_count
            },
            "categories": {
                "top_category": top_category_name,
                "top_revenue": top_category_revenue
            }
        }

    except Exception as e:
        # Reporting is best-effort: surface the failure in the output and in
        # the returned dict rather than failing the monitoring job here.
        print(f"Error generating dashboard: {str(e)}")
        return {"error": str(e)}

business_metrics = generate_business_dashboard()

# COMMAND ----------

# MAGIC %md
# MAGIC ## Anomaly Detection

# COMMAND ----------

def detect_anomalies():
    """Run simple anomaly checks against the Gold tables.

    Checks performed:
      * orders in ``gold_order_fact`` above 5x the average ``order_total``
      * customers in ``gold_customer_analytics`` above 10x the average
        ``total_orders``
      * latest ``order_date`` in ``gold_sales_analytics`` older than 7 days
      * days in ``gold_sales_analytics`` with zero ``daily_revenue``

    Returns:
        A list of human-readable anomaly messages (empty when none were
        found).  If anything raises, a one-element list holding the error
        message is returned instead.
    """

    print("\n=== ANOMALY DETECTION ===")
    anomalies = []

    try:
        # Check for unusual order patterns
        order_fact = spark.table("gold_order_fact")

        # Detect very high order values (outliers).  The average is NULL for
        # an empty table; skip the check then instead of failing on
        # `None * 5` and losing the remaining checks.
        avg_order_value = order_fact.agg(avg("order_total")).collect()[0][0]
        if avg_order_value is not None:
            high_value_threshold = avg_order_value * 5
            high_value_orders = order_fact.filter(col("order_total") > high_value_threshold).count()

            if high_value_orders > 0:
                anomalies.append(f"Found {high_value_orders} orders with unusually high values (>${high_value_threshold:.2f})")

        # Check for customers with unusual activity
        customer_analytics = spark.table("gold_customer_analytics")
        avg_orders_per_customer = customer_analytics.agg(avg("total_orders")).collect()[0][0]
        if avg_orders_per_customer is not None:
            high_activity_threshold = avg_orders_per_customer * 10
            high_activity_customers = customer_analytics.filter(col("total_orders") > high_activity_threshold).count()

            if high_activity_customers > 0:
                anomalies.append(f"Found {high_activity_customers} customers with unusually high activity (>{high_activity_threshold:.0f} orders)")

        # Check for data freshness.  `max` is the Spark aggregate from the
        # notebook's star import.
        sales_analytics = spark.table("gold_sales_analytics")
        latest_date = sales_analytics.agg(max("order_date")).collect()[0][0]
        if isinstance(latest_date, datetime):
            # A timestamp column yields a datetime; reduce it to a date so
            # the subtraction below is date - date.
            latest_date = latest_date.date()
        # 999 is a sentinel meaning "no order date available at all".
        days_since_latest = (datetime.now().date() - latest_date).days if latest_date else 999

        if days_since_latest > 7:
            anomalies.append(f"Data may be stale - latest order date is {days_since_latest} days old")

        # Check for zero revenue days
        zero_revenue_days = sales_analytics.filter(col("daily_revenue") == 0).count()
        if zero_revenue_days > 0:
            anomalies.append(f"Found {zero_revenue_days} days with zero revenue")

        if anomalies:
            print("⚠️ Anomalies detected:")
            for anomaly in anomalies:
                print(f"   - {anomaly}")
        else:
            print("✅ No anomalies detected")

        return anomalies

    except Exception as e:
        # A failed detection run is itself reported as an anomaly so that it
        # shows up in the alerting step.
        error_msg = f"Error in anomaly detection: {str(e)}"
        print(f"❌ {error_msg}")
        return [error_msg]

detected_anomalies = detect_anomalies()

# COMMAND ----------

# MAGIC %md
# MAGIC ## Pipeline Health Summary

# COMMAND ----------

# `from pyspark.sql.functions import *` at the top of this notebook rebinds
# `sum` to the Spark column aggregate, so the Python builtin is reached
# explicitly through the `builtins` module.
import builtins

# Generate overall pipeline health: the mean health score over the tables
# whose check produced one (tables that errored have no score).
scored_reports = [report for report in table_health_report if "health_score" in report]
overall_health_score = (
    builtins.sum(report["health_score"] for report in scored_reports) / len(scored_reports)
    if scored_reports
    else 0.0  # every check errored: report 0% instead of dividing by zero
)
pipeline_status = "HEALTHY" if overall_health_score >= alert_threshold and len(error_tables) == 0 else "UNHEALTHY"

print("\n=== PIPELINE HEALTH SUMMARY ===")
print(f"Overall Health Score: {overall_health_score:.2%}")
print(f"Pipeline Status: {pipeline_status}")
print(f"Tables Status: {len(healthy_tables)}/{len(table_health_report)} healthy")

if detected_anomalies:
    print(f"Anomalies: {len(detected_anomalies)} detected")
else:
    print("Anomalies: None detected")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Save Monitoring Results

# COMMAND ----------

# Create monitoring report
monitoring_report = {
    "report_metadata": {
        "report_date": report_date,
        "report_timestamp": datetime.now().isoformat(),
        "pipeline_status": pipeline_status,
        "overall_health_score": overall_health_score
    },
    "table_health": table_health_report,
    "business_metrics": business_metrics,
    "anomalies": detected_anomalies,
    "summary": {
        "healthy_tables": len(healthy_tables),
        "unhealthy_tables": len(unhealthy_tables),
        "error_tables": len(error_tables),
        "total_tables": len(table_health_report)
    }
}

# Build a one-row DataFrame for the report (could be saved to a Delta table
# for historical tracking).  The nested report is stored as a JSON string
# under an explicit schema: letting Spark infer a schema from the raw nested
# dict is unreliable because the values have mixed types and `anomalies` may
# be an empty list, whose element type cannot be inferred.
monitoring_path = f"{base_path}/monitoring"
report_df = spark.createDataFrame(
    [(
        report_date,
        pipeline_status,
        float(overall_health_score),
        # default=str is deliberate: it renders any non-JSON-native values
        # (e.g. Decimal or date) as text instead of failing the dump.
        json.dumps(monitoring_report, default=str),
    )],
    schema="report_date string, pipeline_status string, overall_health_score double, report_json string",
)

# In a real scenario, you would append to a monitoring table for historical tracking
# NOTE(review): nothing in this cell writes report_df or uses monitoring_path,
# so the messages below describe the intended behaviour only - confirm
# whether the report should actually be persisted here.
print("\nSaving monitoring report...")
print("Report saved to monitoring layer")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Alerting Logic

# COMMAND ----------

# Generate alerts based on conditions
alerts = []

if pipeline_status == "UNHEALTHY":
    alerts.append("Pipeline health is below threshold")

if error_tables:
    alerts.append(f"{len(error_tables)} tables have errors")

if detected_anomalies:
    alerts.append(f"{len(detected_anomalies)} anomalies detected")

# A failed dashboard run returns {"error": ...} with no sales figures; report
# that failure as such rather than as a misleading "zero revenue" alert.
if "error" in business_metrics:
    alerts.append(f"Business dashboard failed: {business_metrics['error']}")
elif business_metrics.get("sales", {}).get("total_revenue", 0) == 0:
    alerts.append("Zero revenue detected")

print("\n=== ALERTS ===")
if alerts:
    print("🚨 Active alerts:")
    for alert in alerts:
        print(f"   - {alert}")
else:
    print("✅ No alerts")

# COMMAND ----------

# MAGIC %md
# MAGIC ## Job Completion

# COMMAND ----------

# Create final monitoring output
monitoring_result = {
    "job_name": "data_monitoring",
    "execution_time": datetime.now().isoformat(),
    "pipeline_status": pipeline_status,
    "overall_health_score": overall_health_score,
    "alerts": alerts,
    "summary": {
        "tables_monitored": len(table_health_report),
        "healthy_tables": len(healthy_tables),
        "anomalies_detected": len(detected_anomalies)
    }
}

print("\n=== MONITORING JOB COMPLETED ===")
print("Status:", pipeline_status)
print("Health Score:", f"{overall_health_score:.2%}")
print("Result:", monitoring_result)

# Exit with appropriate status.  The value is driven by pipeline_status only:
# a HEALTHY pipeline exits with "SUCCESS" even when `alerts` is non-empty
# (for example when anomalies were detected).
exit_status = "SUCCESS" if pipeline_status == "HEALTHY" else "ALERTS_DETECTED"
dbutils.notebook.exit(exit_status)
```

---

## **How to Set Up the Databricks Job**

### **Step 1: Upload Notebooks**
1. Create a folder in Databricks workspace: `/Shared/ETL_Pipeline_Demo`
2. Upload all 4 notebook files to this folder

### **Step 2: Create the Job**
1. Go to **Workflows** → **Jobs** → **Create Job**
2. Configure job with these tasks:
3. The job JSON is available in the _Demojob.json_ file.

#### **Task Configuration:**
```json
{
  "job_name": "ETL_Pipeline_Demo",
  "tasks": [
    {
      "task_key": "data_ingestion",
      "notebook_path": "/Shared/ETL_Pipeline_Demo/01_data_ingestion",
      "cluster_spec": "job_cluster",
      "timeout_seconds": 3600
    },
    {
      "task_key": "data_cleansing", 
      "notebook_path": "/Shared/ETL_Pipeline_Demo/02_data_cleansing",
      "depends_on": ["data_ingestion"],
      "cluster_spec": "job_cluster"
    },
    {
      "task_key": "data_transformation",
      "notebook_path": "/Shared/ETL_Pipeline_Demo/03_data_transformation", 
      "depends_on": ["data_cleansing"],
      "cluster_spec": "job_cluster"
    },
    {
      "task_key": "data_monitoring",
      "notebook_path": "/Shared/ETL_Pipeline_Demo/04_data_monitoring",
      "depends_on": ["data_transformation"],
      "cluster_spec": "job_cluster"
    }
  ],
  "schedule": {
    "cron_expression": "0 2 * * *",
    "timezone": "UTC"
  }
}
```
#### Configure the job using the Databricks CLI
You can also configure jobs using the Databricks CLI. Steps for using the CLI:
- Install the Databricks CLI on your laptop
- Get a CLI token and your Databricks workspace URL
- Configure your Databricks token
```
databricks configure --token
```
- Test the cli
```
databricks workspace ls /
```
- Get the cluster IDs of your workspace
```
databricks clusters list
```
- Update the cluster ID in the JSON file
- Use the command below to create the job
```
databricks jobs create --json '{
  "name": "Demo-ETL-Pipeline",
  "email_notifications": {},
  .... Full json file
  '
```

### **Step 3: Demo Features**
- **Data Lineage**: Bronze → Silver → Gold progression
- **Error Handling**: Quality gates and validation
- **Monitoring**: Health checks and anomaly detection
- **Alerting**: Automated status reporting
- **Parameterization**: Configurable via widgets
- **Dependencies**: Tasks run in sequence with proper error handling

This complete pipeline demonstrates all key Databricks Jobs features and provides a realistic ETL scenario for your training demo.