# E-Commerce Medallion Pipeline
## Bronze → Silver → Gold

This notebook processes raw e-commerce data through the Medallion architecture:
- **Bronze**: Raw data ingestion (as-is from source)
- **Silver**: Cleaned and standardized data
- **Gold**: Joined table for analytics (Customer 360° View)

---
# SETUP

In [None]:
import pandas as pd
import numpy as np
import re
import os

# Configuration
# The lakehouse root defaults to the original local mount; set the
# MEDALLION_BASE_PATH environment variable to run the notebook on another
# machine without editing this cell.
BASE_PATH = os.environ.get(
    "MEDALLION_BASE_PATH",
    "/Volumes/SSD-CRUCIAL/Medallion/Local_Lakehouse_2",
)
BRONZE = f"{BASE_PATH}/bronze"
SILVER = f"{BASE_PATH}/silver"
GOLD = f"{BASE_PATH}/gold"

# Create one directory per layer (no-op when it already exists)
for layer_dir in (BRONZE, SILVER, GOLD):
    os.makedirs(layer_dir, exist_ok=True)

print("Directories ready!")

---
# HELPER FUNCTIONS

In [None]:
def clean_date(date_str):
    """
    Parse multiple date formats into standardized YYYY-MM-DD.
    Handles: 2022/07/15, 15-07-2022, 20220719, etc.

    Returns '' for missing values, for the placeholders nan / n/a / na
    (any case, surrounding whitespace ignored) and for anything that cannot
    be parsed.
    """
    if pd.isna(date_str):
        return ''
    # Strip BEFORE the placeholder check so padded values such as ' N/A '
    # are recognised instead of being handed to the parser.
    date_str = str(date_str).strip()
    if date_str.lower() in ('nan', 'n/a', 'na', ''):
        return ''

    # Only parsing failures are swallowed; a bare `except:` would also hide
    # KeyboardInterrupt and genuine programming errors.
    parse_errors = (ValueError, TypeError, OverflowError)

    # Explicit formats are tried first, in this order, so an ambiguous value
    # such as 01-02-2023 is always read day-first.
    formats = ('%Y-%m-%d', '%Y/%m/%d', '%d-%m-%Y', '%d/%m/%Y', '%Y%m%d')
    for fmt in formats:
        try:
            return pd.to_datetime(date_str, format=fmt).strftime('%Y-%m-%d')
        except parse_errors:
            continue

    # Last resort: let pandas infer the format.
    try:
        return pd.to_datetime(date_str).strftime('%Y-%m-%d')
    except parse_errors:
        return ''


def validate_email(email):
    """
    Normalise an e-mail address and check its shape.

    The value is lower-cased and stripped; the normalised address is returned
    when it looks like local@domain.tld. Missing or malformed input gives None.
    """
    if not pd.isna(email):
        candidate = str(email).lower().strip()
        email_shape = r'^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$'
        if re.match(email_shape, candidate) is not None:
            return candidate
    return None


def clean_gender(g):
    """
    Map free-text gender values onto 'Male' / 'Female'.

    Matching ignores case and surrounding whitespace; missing or
    unrecognised values yield None.
    """
    if pd.isna(g):
        return None
    labels = {'m': 'Male', 'male': 'Male', 'f': 'Female', 'female': 'Female'}
    return labels.get(str(g).lower().strip())


print("Helper functions loaded!")

---
# BRONZE LAYER - Raw Data
Copy raw files to bronze folder (no transformations)

In [None]:
# Source path for raw data
# Defaults to the original local dataset folder; set the MEDALLION_SOURCE_PATH
# environment variable to ingest from another location without editing this cell.
SOURCE_PATH = os.environ.get(
    "MEDALLION_SOURCE_PATH",
    "/Volumes/SSD-CRUCIAL/Medallion/DATASET-2",
)

# Copy raw files to bronze
import shutil

files = ['customers.csv', 'orders.csv', 'payments.csv', 'support_tickets.csv', 'web_activities.csv']

# Bronze keeps the files exactly as delivered: a plain file copy, no parsing.
for f in files:
    shutil.copy(f"{SOURCE_PATH}/{f}", f"{BRONZE}/{f}")
    print(f"✓ Copied {f} to bronze/")

print("\nBronze layer complete!")

---
# SILVER LAYER - Data Cleaning

## 1. Clean customers.csv

In [None]:
# Show the untouched bronze copy so the cleaning cell below can be compared
# against the raw values.
df = pd.read_csv(f"{BRONZE}/customers.csv")
print("=== RAW CUSTOMERS ===", df.to_string(), sep="\n")

In [None]:
# Clean customers
def _title_or_none(value):
    # Trim and Title-Case a text value; missing values stay missing (None)
    return str(value).strip().title() if pd.notna(value) else None


def clean_dob(dob):
    # Standardize one DOB value to YYYY-MM-DD; None when missing or unparseable
    if pd.isna(dob):
        return None
    dob_text = str(dob).strip()
    if dob_text.lower() in ['not available', 'n/a', 'na', '']:
        return None
    # clean_date signals failure with '', which is stored as None here
    return clean_date(dob_text) or None


# Load the bronze copy and switch to the silver column names in one step
df = pd.read_csv(f"{BRONZE}/customers.csv").rename(columns={
    'customer_id': 'CustomerID',
    'name': 'Name',
    'EMAIL': 'Email',
    'gender': 'Gender',
    'dob': 'DOB',
    'location': 'City'
})

# One cleaner per column, applied value by value:
#   Name / City -> Title Case, Email -> validated lowercase,
#   Gender -> Male/Female, DOB -> YYYY-MM-DD
column_cleaners = {
    'Name': _title_or_none,
    'Email': validate_email,
    'Gender': clean_gender,
    'DOB': clean_dob,
    'City': _title_or_none,
}
for column, cleaner in column_cleaners.items():
    df[column] = df[column].apply(cleaner)

# Save to silver
df.to_csv(f"{SILVER}/customers_clean.csv", index=False)

print("=== CLEANED CUSTOMERS ===")
print(df.to_string())
print(f"\n✓ Saved to: silver/customers_clean.csv")

## 2. Clean orders.csv

In [None]:
# Show the untouched bronze copy so the cleaning cell below can be compared
# against the raw values.
df = pd.read_csv(f"{BRONZE}/orders.csv")
print("=== RAW ORDERS ===", df.to_string(), sep="\n")

In [None]:
# Clean orders
df = pd.read_csv(f"{BRONZE}/orders.csv")

# 1. Rename columns
df = df.rename(columns={
    'order_id': 'OrderID',
    'customer_id': 'CustomerID',
    'order_date': 'OrderDate',
    'amount': 'Amount',
    'status': 'Status'
})

# 2. Standardize OrderDate (YYYY-MM-DD); unparseable or missing → blank
df['OrderDate'] = df['OrderDate'].apply(clean_date)

# 3. Clean Amount (plain number, 2 decimals, missing or non-numeric → 0.00)
# Coerce to numeric first: pandas loads a column that contains any
# non-numeric text as strings, and rounding such a column does not yield
# numeric amounts. Coercing then filling mirrors the PySpark version in this
# notebook (cast to double, then fillna 0.0). Already-numeric columns are
# unaffected.
df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce').fillna(0.0).round(2)

# 4. Standardize Status (Title Case); missing stays None
df['Status'] = df['Status'].apply(
    lambda x: str(x).strip().title() if pd.notna(x) else None
)

# Save to silver
df.to_csv(f"{SILVER}/orders_clean.csv", index=False)

print("=== CLEANED ORDERS ===")
print(df.to_string())
print(f"\n✓ Saved to: silver/orders_clean.csv")

## 3. Clean payments.csv

In [None]:
# Show the untouched bronze copy so the cleaning cell below can be compared
# against the raw values.
df = pd.read_csv(f"{BRONZE}/payments.csv")
print("=== RAW PAYMENTS ===", df.to_string(), sep="\n")

In [None]:
# Clean payments
df = pd.read_csv(f"{BRONZE}/payments.csv")

# 1. Rename columns
df = df.rename(columns={
    'payment_id': 'PaymentID',
    'customer_id': 'CustomerID',
    'payment_date': 'PaymentDate',
    'payment_method': 'PaymentMethod',
    'payment_status': 'PaymentStatus',
    'amount': 'Amount'
})

# 2. Standardize PaymentDate (YYYY-MM-DD), NaN → blank
df['PaymentDate'] = df['PaymentDate'].apply(clean_date)

# 3. Standardize PaymentMethod (Title Case, merge creditcard → Credit Card)
def clean_payment_method(method):
    # Missing stays None; both spellings of credit card collapse to one label
    if pd.isna(method):
        return None
    method = str(method).strip().lower()
    if method in ['creditcard', 'credit card']:
        return 'Credit Card'
    return method.title()

df['PaymentMethod'] = df['PaymentMethod'].apply(clean_payment_method)

# 4. Standardize PaymentStatus (Title Case); missing stays None
df['PaymentStatus'] = df['PaymentStatus'].apply(
    lambda x: str(x).strip().title() if pd.notna(x) else None
)

# 5. Amount (2 decimal places, missing or non-numeric → 0.00)
# Coerce to numeric first: pandas loads a column that contains any
# non-numeric text as strings, and rounding such a column does not yield
# numeric amounts. Coercing then filling mirrors the PySpark version in this
# notebook (cast to double, then fillna 0.0). Already-numeric columns are
# unaffected.
df['Amount'] = pd.to_numeric(df['Amount'], errors='coerce').fillna(0.0).round(2)

# Save to silver
df.to_csv(f"{SILVER}/payments_clean.csv", index=False)

print("=== CLEANED PAYMENTS ===")
print(df.to_string())
print(f"\n✓ Saved to: silver/payments_clean.csv")

## 4. Clean support_tickets.csv

In [None]:
# Show the untouched bronze copy so the cleaning cell below can be compared
# against the raw values.
df = pd.read_csv(f"{BRONZE}/support_tickets.csv")
print("=== RAW SUPPORT TICKETS ===", df.to_string(), sep="\n")

In [None]:
# Clean support_tickets
df = pd.read_csv(f"{BRONZE}/support_tickets.csv")

# 1. Rename columns
df = df.rename(columns={
    'ticket_id': 'TicketID',
    'customer_id': 'CustomerID',
    'issue_type': 'IssueType',
    'ticket_date': 'TicketDate',
    'resolution_status': 'ResolutionStatus'
})

# 2. Standardize TicketDate (YYYY-MM-DD), NaN → blank
df['TicketDate'] = df['TicketDate'].apply(clean_date)


def _title_or_blank(value):
    # Trim and Title-Case a text value. Missing values and the placeholders
    # nan / na / n/a become ''. The value is stripped BEFORE the placeholder
    # test so that padded placeholders such as ' NA ' are caught as well
    # (otherwise they would come out as the label 'Na').
    if pd.isna(value):
        return ''
    text = str(value).strip()
    if text.lower() in ('nan', 'na', 'n/a', ''):
        return ''
    return text.title()


# 3. Standardize IssueType (Title Case), NaN → blank
def clean_issue_type(issue):
    return _title_or_blank(issue)

df['IssueType'] = df['IssueType'].apply(clean_issue_type)

# 4. Standardize ResolutionStatus (Title Case), NaN → blank
def clean_status(status):
    return _title_or_blank(status)

df['ResolutionStatus'] = df['ResolutionStatus'].apply(clean_status)

# Save to silver
df.to_csv(f"{SILVER}/support_tickets_clean.csv", index=False)

print("=== CLEANED SUPPORT TICKETS ===")
print(df.to_string())
print(f"\n✓ Saved to: silver/support_tickets_clean.csv")

## 5. Clean web_activities.csv

In [None]:
# Show the untouched bronze copy so the cleaning cell below can be compared
# against the raw values.
df = pd.read_csv(f"{BRONZE}/web_activities.csv")
print("=== RAW WEB ACTIVITIES ===", df.to_string(), sep="\n")

In [None]:
# Clean web_activities
def _page_name(value):
    # Trimmed, lower-cased page name; '' when the value is missing
    if pd.isna(value):
        return ''
    return str(value).strip().lower()


def _device_label(value):
    # Trimmed, Title-Cased device type; '' when the value is missing
    if pd.isna(value):
        return ''
    return str(value).strip().title()


# Load the bronze copy and switch to the silver column names in one step
df = pd.read_csv(f"{BRONZE}/web_activities.csv").rename(columns={
    'session_id': 'SessionID',
    'customer_id': 'CustomerID',
    'page_viewed': 'PageViewed',
    'session_time': 'SessionTime',
    'device_type': 'DeviceType'
})

# SessionTime is reduced to a calendar date (YYYY-MM-DD) by clean_date
df['SessionTime'] = df['SessionTime'].apply(clean_date)
df['PageViewed'] = df['PageViewed'].apply(_page_name)
df['DeviceType'] = df['DeviceType'].apply(_device_label)

# Save to silver
df.to_csv(f"{SILVER}/web_activities_clean.csv", index=False)

print("=== CLEANED WEB ACTIVITIES ===")
print(df.to_string())
print(f"\n✓ Saved to: silver/web_activities_clean.csv")

---
# GOLD LAYER - Join Tables
Create Customer 360° View by joining all Silver tables

In [None]:
# Load all Silver tables
def _load_silver(table_name):
    # Read silver/<table_name>_clean.csv into a DataFrame
    return pd.read_csv(f"{SILVER}/{table_name}_clean.csv")


customers = _load_silver("customers")
orders = _load_silver("orders")
payments = _load_silver("payments")
tickets = _load_silver("support_tickets")
web = _load_silver("web_activities")

# Row counts per table, as a quick sanity check before joining
print("=== SILVER TABLES ===")
for label, table in (
    ("Customers", customers),
    ("Orders", orders),
    ("Payments", payments),
    ("Support Tickets", tickets),
    ("Web Activities", web),
):
    print(f"{label}: {len(table)} rows")

In [None]:
# Orders and payments both carry an Amount column (and orders a generic
# Status); give them table-specific names before joining.
orders = orders.rename(columns={'Amount': 'OrderAmount', 'Status': 'OrderStatus'})
payments = payments.rename(columns={'Amount': 'PaymentAmount'})

# LEFT JOIN every table onto customers by CustomerID, one after another, so
# customers without orders/payments/tickets/sessions are still kept.
# NOTE(review): each merge matches on CustomerID alone. If a customer has
# several rows in more than one of these tables, the result contains every
# combination of those rows, so sums of OrderAmount / PaymentAmount over the
# gold table would count values more than once - confirm this is intended
# for the downstream KPIs.
gold = customers
for right_table in (orders, payments, tickets, web):
    gold = gold.merge(right_table, on='CustomerID', how='left')

gold_columns = list(gold.columns)
print(f"\n=== GOLD TABLE ===")
print(f"Rows: {len(gold)}")
print(f"Columns ({len(gold_columns)}): {gold_columns}")

In [None]:
# Dump the whole joined table for a visual check
print("=== GOLD TABLE PREVIEW ===", gold.to_string(), sep="\n")

In [None]:
# Persist the joined table as a single CSV in the gold folder
gold_csv_path = f"{GOLD}/gold_joined.csv"
gold.to_csv(gold_csv_path, index=False)

print("✓ Saved to: gold/gold_joined.csv", "\nGold table ready for Power BI!", sep="\n")

---
# SUMMARY

In [None]:
# Closing banner plus a recap of what each layer produced and what to do next
banner = "=" * 70
summary_text = """
Bronze Layer (Raw):
  - customers.csv
  - orders.csv
  - payments.csv
  - support_tickets.csv
  - web_activities.csv

Silver Layer (Cleaned):
  - customers_clean.csv
  - orders_clean.csv
  - payments_clean.csv
  - support_tickets_clean.csv
  - web_activities_clean.csv

Gold Layer (Joined):
  - gold_joined.csv (Customer 360° View)

Next Steps:
  1. Upload gold_joined.csv to Fabric Lakehouse
  2. Create Semantic Model
  3. Build Power BI Dashboard with 5 KPIs:
     - Revenue by Payment Method (Pie Chart)
     - Order Status Distribution (Bar Chart)
     - Support Tickets by Issue Type (Bar Chart)
     - Customer Activity by Device (Pie Chart)
     - Revenue by City (Bar Chart / Map)
"""

print(banner)
print("MEDALLION PIPELINE COMPLETE!")
print(banner)
print(summary_text)

---
# PYSPARK VERSION (For Microsoft Fabric)
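The next cell only stores the Fabric code as text in the variable `pyspark_code`. Run the cell, then `print(pyspark_code)` and paste the printed text into a Fabric Notebook.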

In [None]:
# =============================================================================
# PYSPARK VERSION FOR FABRIC
# This cell does not run Spark: it only stores the Fabric pipeline as text in
# `pyspark_code`. Paste the text between the triple quotes (or the output of
# print(pyspark_code)) into a Fabric Notebook cell to run it there.
#
# The string is a RAW literal so that the regex backslash in the e-mail
# pattern reads the same in this cell's source and in the stored text.
#
# Cleaning rules are kept in line with the pandas cells of this notebook:
#   - every date column accepts the same five formats (parse_date helper)
#   - DOB is standardized to a yyyy-MM-dd string (null when unparseable)
#   - Gender / PaymentMethod are trimmed before matching
#   - amounts are rounded to 2 decimals
# =============================================================================

pyspark_code = r'''
from pyspark.sql.functions import *
from pyspark.sql.types import *

# Configuration
LAKEHOUSE_PATH = "Files/"
BRONZE = f"{LAKEHOUSE_PATH}bronze/"
SILVER = f"{LAKEHOUSE_PATH}silver/"
GOLD = f"{LAKEHOUSE_PATH}gold/"

# Date formats accepted for every date column, tried in this order
DATE_FORMATS = ["yyyy-MM-dd", "yyyy/MM/dd", "dd-MM-yyyy", "dd/MM/yyyy", "yyyyMMdd"]


def parse_date(column_name):
    """Coalesce one to_date attempt per entry of DATE_FORMATS for the given column."""
    return coalesce(*[to_date(col(column_name), fmt) for fmt in DATE_FORMATS])


# =============================================================================
# BRONZE - Read Raw Data
# =============================================================================
df_customers_raw = spark.read.option("header", "true").csv(f"{BRONZE}customers.csv")
df_orders_raw = spark.read.option("header", "true").csv(f"{BRONZE}orders.csv")
df_payments_raw = spark.read.option("header", "true").csv(f"{BRONZE}payments.csv")
df_tickets_raw = spark.read.option("header", "true").csv(f"{BRONZE}support_tickets.csv")
df_web_raw = spark.read.option("header", "true").csv(f"{BRONZE}web_activities.csv")

# =============================================================================
# SILVER - Clean Customers
# =============================================================================
df_customers = (
    df_customers_raw
    .withColumnRenamed("customer_id", "CustomerID")
    .withColumnRenamed("name", "Name")
    .withColumnRenamed("EMAIL", "Email")
    .withColumnRenamed("gender", "Gender")
    .withColumnRenamed("dob", "DOB")
    .withColumnRenamed("location", "City")
    .withColumn("Name", initcap(trim(col("Name"))))
    .withColumn("Email", lower(trim(col("Email"))))
    .withColumn("Email", when(col("Email").rlike(r"^[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}$"), col("Email")).otherwise(lit(None)))
    .withColumn("Gender", when(lower(trim(col("Gender"))).isin("m", "male"), lit("Male")).when(lower(trim(col("Gender"))).isin("f", "female"), lit("Female")).otherwise(lit(None)))
    # DOB stays a string column, normalized to yyyy-MM-dd
    .withColumn("DOB", date_format(parse_date("DOB"), "yyyy-MM-dd"))
    .withColumn("City", initcap(trim(col("City"))))
)
df_customers.write.mode("overwrite").format("delta").saveAsTable("silver_customers")

# =============================================================================
# SILVER - Clean Orders
# =============================================================================
df_orders = (
    df_orders_raw
    .withColumnRenamed("order_id", "OrderID")
    .withColumnRenamed("customer_id", "CustomerID")
    .withColumnRenamed("order_date", "OrderDate")
    .withColumnRenamed("amount", "OrderAmount")
    .withColumnRenamed("status", "OrderStatus")
    .withColumn("OrderDate", parse_date("OrderDate"))
    .withColumn("OrderAmount", round(col("OrderAmount").cast(DoubleType()), 2))
    .fillna({"OrderAmount": 0.0})
    .withColumn("OrderStatus", initcap(trim(col("OrderStatus"))))
)
df_orders.write.mode("overwrite").format("delta").saveAsTable("silver_orders")

# =============================================================================
# SILVER - Clean Payments
# =============================================================================
df_payments = (
    df_payments_raw
    .withColumnRenamed("payment_id", "PaymentID")
    .withColumnRenamed("customer_id", "CustomerID")
    .withColumnRenamed("payment_date", "PaymentDate")
    .withColumnRenamed("payment_method", "PaymentMethod")
    .withColumnRenamed("payment_status", "PaymentStatus")
    .withColumnRenamed("amount", "PaymentAmount")
    .withColumn("PaymentDate", parse_date("PaymentDate"))
    .withColumn("PaymentMethod", when(lower(trim(col("PaymentMethod"))).isin("creditcard", "credit card"), lit("Credit Card")).otherwise(initcap(trim(col("PaymentMethod")))))
    .withColumn("PaymentStatus", initcap(trim(col("PaymentStatus"))))
    .withColumn("PaymentAmount", round(col("PaymentAmount").cast(DoubleType()), 2))
    .fillna({"PaymentAmount": 0.0})
)
df_payments.write.mode("overwrite").format("delta").saveAsTable("silver_payments")

# =============================================================================
# SILVER - Clean Support Tickets
# =============================================================================
df_tickets = (
    df_tickets_raw
    .withColumnRenamed("ticket_id", "TicketID")
    .withColumnRenamed("customer_id", "CustomerID")
    .withColumnRenamed("issue_type", "IssueType")
    .withColumnRenamed("ticket_date", "TicketDate")
    .withColumnRenamed("resolution_status", "ResolutionStatus")
    .withColumn("TicketDate", parse_date("TicketDate"))
    .withColumn("IssueType", initcap(trim(col("IssueType"))))
    .withColumn("ResolutionStatus", initcap(trim(col("ResolutionStatus"))))
)
df_tickets.write.mode("overwrite").format("delta").saveAsTable("silver_tickets")

# =============================================================================
# SILVER - Clean Web Activities
# =============================================================================
df_web = (
    df_web_raw
    .withColumnRenamed("session_id", "SessionID")
    .withColumnRenamed("customer_id", "CustomerID")
    .withColumnRenamed("page_viewed", "PageViewed")
    .withColumnRenamed("session_time", "SessionTime")
    .withColumnRenamed("device_type", "DeviceType")
    .withColumn("SessionTime", parse_date("SessionTime"))
    .withColumn("PageViewed", lower(trim(col("PageViewed"))))
    .withColumn("DeviceType", initcap(trim(col("DeviceType"))))
)
df_web.write.mode("overwrite").format("delta").saveAsTable("silver_web")

# =============================================================================
# GOLD - Join All Tables
# =============================================================================
customers = spark.table("silver_customers").alias("c")
orders = spark.table("silver_orders").alias("o")
payments = spark.table("silver_payments").alias("p")
tickets = spark.table("silver_tickets").alias("t")
web = spark.table("silver_web").alias("w")

gold = (
    customers
    .join(orders, col("c.CustomerID") == col("o.CustomerID"), "left")
    .join(payments, col("c.CustomerID") == col("p.CustomerID"), "left")
    .join(tickets, col("c.CustomerID") == col("t.CustomerID"), "left")
    .join(web, col("c.CustomerID") == col("w.CustomerID"), "left")
    .select(
        col("c.CustomerID"), col("c.Name"), col("c.Email"), col("c.Gender"), col("c.DOB"), col("c.City"),
        col("o.OrderID"), col("o.OrderDate"), col("o.OrderAmount"), col("o.OrderStatus"),
        col("p.PaymentID"), col("p.PaymentDate"), col("p.PaymentMethod"), col("p.PaymentStatus"), col("p.PaymentAmount"),
        col("t.TicketID"), col("t.IssueType"), col("t.TicketDate"), col("t.ResolutionStatus"),
        col("w.SessionID"), col("w.PageViewed"), col("w.SessionTime"), col("w.DeviceType")
    )
)

gold.write.mode("overwrite").format("delta").saveAsTable("gold_customer_360")
print("Pipeline Complete! Gold table: gold_customer_360")
'''

print("PySpark code for Fabric is stored in variable: pyspark_code")
print("Copy and paste into a Fabric Notebook to run.")