# Data Transformation Notebook
This notebook transforms data from the Bronze layer into the Silver layer.

## Transformation Steps
1. Data cleaning and standardization
2. Data type conversions
3. Business rule applications
4. Data enrichment (derived measures, business flags, and customer segmentation)
5. Write to Silver layer

In [None]:
# Import required libraries
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import *
from pyspark.sql.window import Window
from datetime import datetime
import re

In [None]:
# Initialize Spark session
# getOrCreate() returns the session that is already active when there is one,
# so re-running this cell does not start a second session.
spark = (
    SparkSession.builder
    .appName("DataTransformation")
    .getOrCreate()
)
print(f"Transformation started at: {datetime.now()}")

In [None]:
# Configuration
# Placeholder account name: replace with the real storage account before running.
storage_account_name = "your_storage_account_name"
container_name = "synapsefs"

# Bronze and Silver share one container; only the trailing folder differs.
adls_root = f"abfss://{container_name}@{storage_account_name}.dfs.core.windows.net"
bronze_data_path = f"{adls_root}/bronze/"
silver_data_path = f"{adls_root}/silver/"

print(f"Bronze data path: {bronze_data_path}")
print(f"Silver data path: {silver_data_path}")

## Read Bronze Data

In [None]:
# Read bronze data
# Each Bronze table is read as Parquet from its own sub-folder of bronze_data_path.
customer_bronze = spark.read.parquet(f"{bronze_data_path}customers/")
product_bronze = spark.read.parquet(f"{bronze_data_path}products/")
sales_bronze = spark.read.parquet(f"{bronze_data_path}sales/")

# Row counts as a quick sanity check; every count() is a Spark action.
print(f"Loaded Bronze data:")
customer_bronze_rows = customer_bronze.count()
print(f"Customers: {customer_bronze_rows}")
product_bronze_rows = product_bronze.count()
print(f"Products: {product_bronze_rows}")
sales_bronze_rows = sales_bronze.count()
print(f"Sales: {sales_bronze_rows}")

## Transform Customer Data

In [None]:
# Clean and standardize customer data

# Pattern is matched against the Email column after it has been trimmed and lower-cased.
email_pattern = "^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}$"

# A NULL Email/Phone makes the check itself NULL; coalesce() turns that into False,
# so both flags always hold a definite True/False.
email_valid_expr = coalesce(col("Email").rlike(email_pattern), lit(False))
phone_valid_expr = coalesce(length(col("Phone")) >= 10, lit(False))

# 50 points per passed check -> 0, 50 or 100.
quality_score_expr = (col("email_valid").cast("int") + col("phone_valid").cast("int")) * 50

customer_silver = (
    customer_bronze
    # One row per CustomerID.
    .dropDuplicates(["CustomerID"])
    # Whitespace / casing normalisation.
    .withColumn("CustomerName", trim(col("CustomerName")))
    .withColumn("Email", lower(trim(col("Email"))))
    # Strip everything that is not a digit from the phone number.
    .withColumn("Phone", regexp_replace(col("Phone"), "[^0-9]", ""))
    .withColumn("City", initcap(trim(col("City"))))
    .withColumn("State", upper(trim(col("State"))))
    .withColumn("Country", upper(trim(col("Country"))))
    .withColumn("Region", initcap(trim(col("Region"))))
    # Validation flags (computed on the cleaned Email / Phone values).
    .withColumn("email_valid", email_valid_expr)
    .withColumn("phone_valid", phone_valid_expr)
    # Transformation metadata.
    .withColumn("transformation_date", current_timestamp())
    .withColumn("data_quality_score", quality_score_expr)
)

customer_silver_rows = customer_silver.count()
print(f"Transformed {customer_silver_rows} customer records")
customer_silver.show(5)

## Transform Product Data

In [None]:
# Clean and standardize product data
product_silver = (
    product_bronze
    # One row per ProductID.
    .dropDuplicates(["ProductID"])
    .withColumn("ProductName", trim(col("ProductName")))
    .withColumn("Category", initcap(trim(col("Category"))))
    .withColumn("SubCategory", initcap(trim(col("SubCategory"))))
    .withColumn("Brand", initcap(trim(col("Brand"))))
)

# Add calculated columns
# A NULL UnitPrice makes every comparison below evaluate to NULL, so with a plain
# .otherwise(...) such rows would fall through to the last bucket (margin 0 /
# "Luxury"). The final branch is therefore guarded with isNotNull(): when no
# branch matches, when() yields NULL and a missing price stays "unknown".
unit_price_expr = col("UnitPrice")
product_silver = (
    product_silver
    # Margin as a percentage of the selling price; 0 for non-positive prices
    # (also avoids dividing by zero), NULL when the price is missing.
    .withColumn("profit_margin",
                when(unit_price_expr > 0,
                     ((unit_price_expr - col("StandardCost")) / unit_price_expr) * 100)
                .when(unit_price_expr.isNotNull(), 0))
    # Price buckets: <50 Budget, <200 Standard, <500 Premium, otherwise Luxury.
    .withColumn("price_category",
                when(unit_price_expr < 50, "Budget")
                .when(unit_price_expr < 200, "Standard")
                .when(unit_price_expr < 500, "Premium")
                .when(unit_price_expr.isNotNull(), "Luxury"))
)

# Add transformation metadata
product_silver = product_silver \
    .withColumn("transformation_date", current_timestamp())

print(f"Transformed {product_silver.count()} product records")
product_silver.show(5)

## Transform Sales Data

In [None]:
# Clean and enrich sales data

# Reusable column expressions built from the raw Bronze columns.
order_date_expr = col("OrderDate")
quantity_expr = col("Quantity")
gross_revenue_expr = quantity_expr * col("UnitPrice")
net_revenue_expr = gross_revenue_expr - col("DiscountAmount")
total_cost_expr = quantity_expr * col("UnitCost")

sales_silver = (
    sales_bronze
    # One row per order line.
    .dropDuplicates(["OrderID", "OrderLineNumber"])
    # Date components.
    .withColumn("order_year", year(order_date_expr))
    .withColumn("order_month", month(order_date_expr))
    .withColumn("order_quarter", quarter(order_date_expr))
    .withColumn("order_day_of_week", dayofweek(order_date_expr))
    .withColumn("order_day_name", date_format(order_date_expr, "EEEE"))
    # Calculated measures.
    .withColumn("gross_revenue", gross_revenue_expr)
    .withColumn("net_revenue", net_revenue_expr)
    .withColumn("total_cost", total_cost_expr)
    .withColumn("gross_profit", net_revenue_expr - total_cost_expr)
    # Margin relative to net revenue; 0 when net revenue is not positive.
    .withColumn("profit_margin_pct",
                when(col("net_revenue") > 0,
                     (col("gross_profit") / col("net_revenue")) * 100)
                .otherwise(0))
    # Business flags. Spark's dayofweek() numbers Sunday as 1 and Saturday as 7.
    .withColumn("is_weekend",
                when(col("order_day_of_week").isin(1, 7), True).otherwise(False))
    .withColumn("is_high_value",
                when(col("net_revenue") > 1000, True).otherwise(False))
    .withColumn("has_discount",
                when(col("DiscountAmount") > 0, True).otherwise(False))
    # Transformation metadata.
    .withColumn("transformation_date", current_timestamp())
)

sales_silver_rows = sales_silver.count()
print(f"Transformed {sales_silver_rows} sales records")
sales_silver.show(5)

## Add Customer Segmentation

In [None]:
# Calculate customer metrics for segmentation
# sum/avg/max/min below are the Spark SQL aggregate functions that the wildcard
# import of pyspark.sql.functions puts in scope, not the Python builtins.
#
# sales_silver holds one row per order LINE (it is de-duplicated on
# OrderID + OrderLineNumber), so orders must be counted as DISTINCT OrderIDs;
# counting rows would inflate total_orders for multi-line orders. For the same
# reason the average order value is revenue per distinct order rather than the
# average revenue of a single line.
distinct_orders_expr = countDistinct("OrderID")
total_revenue_expr = sum("net_revenue")

customer_metrics = (
    sales_silver
    .groupBy("CustomerKey")
    .agg(
        distinct_orders_expr.alias("total_orders"),
        total_revenue_expr.alias("total_revenue"),
        # Guarded so a customer without any non-NULL OrderID yields NULL
        # instead of a division by zero.
        when(distinct_orders_expr > 0,
             total_revenue_expr / distinct_orders_expr).alias("avg_order_value"),
        max("OrderDate").alias("last_order_date"),
        min("OrderDate").alias("first_order_date"),
    )
    # Both day counts are measured against the date the notebook runs.
    .withColumn("customer_lifetime_days",
                datediff(current_date(), col("first_order_date")))
    .withColumn("days_since_last_order",
                datediff(current_date(), col("last_order_date")))
)

# Add customer segment based on RFM-like analysis
# Rules are evaluated top to bottom; the first match wins, and anything that
# matches none of them is labelled "New".
customer_metrics = customer_metrics \
    .withColumn("customer_segment",
                when((col("days_since_last_order") < 90) & (col("total_orders") >= 10) & (col("total_revenue") > 5000), "Champion")
                .when((col("days_since_last_order") < 90) & (col("total_orders") >= 5), "Loyal")
                .when((col("days_since_last_order") < 180) & (col("total_revenue") > 2000), "Potential Loyalist")
                .when((col("days_since_last_order") < 90) & (col("total_orders") < 5), "Recent")
                .when((col("days_since_last_order") >= 180) & (col("days_since_last_order") < 365), "At Risk")
                .when(col("days_since_last_order") >= 365, "Lost")
                .otherwise("New"))

print("Customer segmentation completed")
customer_metrics.groupBy("customer_segment").count().show()

## Write to Silver Layer

In [None]:
# Write customer data to Silver layer
# Saved in Delta format with overwrite mode, partitioned by Country and Region.
(
    customer_silver.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("Country", "Region")
    .save(f"{silver_data_path}customers/")
)

print("Customer data written to Silver layer")

In [None]:
# Write product data to Silver layer
# Saved in Delta format with overwrite mode, partitioned by Category.
(
    product_silver.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("Category")
    .save(f"{silver_data_path}products/")
)

print("Product data written to Silver layer")

In [None]:
# Write sales data to Silver layer
# Saved in Delta format with overwrite mode, partitioned by order year and month.
(
    sales_silver.write
    .format("delta")
    .mode("overwrite")
    .partitionBy("order_year", "order_month")
    .save(f"{silver_data_path}sales/")
)

print("Sales data written to Silver layer")

In [None]:
# Write customer metrics to Silver layer
# Saved in Delta format with overwrite mode; no partitioning is applied.
(
    customer_metrics.write
    .format("delta")
    .mode("overwrite")
    .save(f"{silver_data_path}customer_metrics/")
)

print("Customer metrics written to Silver layer")

## Data Quality Report

In [None]:
# Generate data quality report
print("=" * 70)
print("Data Transformation Summary")
print("=" * 70)
print(f"Transformation completed at: {datetime.now()}")

# Gather all figures for a table in ONE aggregation instead of one count() per
# figure: the Silver DataFrames are not cached, so every extra action would
# re-run the whole transformation from the Bronze files.
# count(when(flag, 1)) counts only rows whose flag is True and returns 0 (not
# NULL) for an empty table; an ungrouped agg() always returns exactly one row.
customer_stats = customer_silver.agg(
    count(lit(1)).alias("total"),
    count(when(col("email_valid"), 1)).alias("valid_emails"),
    count(when(col("phone_valid"), 1)).alias("valid_phones"),
).first()
product_total = product_silver.count()
sales_stats = sales_silver.agg(
    count(lit(1)).alias("total"),
    count(when(col("is_high_value"), 1)).alias("high_value"),
    count(when(col("has_discount"), 1)).alias("discounted"),
).first()

print("\nRecords Processed:")
print(f"  Customers: {customer_stats['total']}")
print(f"  Products: {product_total}")
print(f"  Sales: {sales_stats['total']}")
print("\nData Quality:")
print(f"  Valid emails: {customer_stats['valid_emails']}")
print(f"  Valid phones: {customer_stats['valid_phones']}")
print(f"  High-value orders: {sales_stats['high_value']}")
print(f"  Orders with discounts: {sales_stats['discounted']}")
print("=" * 70)