In [0]:
from pyspark.sql.functions import col, sum as _sum, row_number
from pyspark.sql.window import Window

# Sales input: read the catalog table "sales_delta_cleaned" into a DataFrame.
# If a cleaned sales DataFrame is already in memory, assign it to sales_df instead.
sales_df = spark.read.table("sales_delta_cleaned")


In [0]:
# Read the customer demographics CSV from DBFS. The first line of the file is
# treated as the header, and Spark infers each column's type from the data.
customers_df = (
    spark.read.format("csv")
    .option("inferSchema", True)
    .option("header", True)
    .load("dbfs:/FileStore/shared_uploads/parveen.r@live.com/customer_demographics.csv")
)

In [0]:
# Preview up to five customer rows to sanity-check the CSV load.
print("Customer demographics sample:")
customers_preview_df = customers_df.limit(5)
display(customers_preview_df)


In [0]:
# Inner-join sales with customer demographics on the shared CustomerName
# column; rows with no match on the other side are dropped.
# NOTE(review): assumes CustomerName is unique in customers_df — duplicate
# names would multiply the matching sales rows. Confirm against the data.
joined_df = sales_df.join(
    customers_df,
    on="CustomerName",
    how="inner",
)

print("Joined data sample:")
joined_preview_df = joined_df.limit(5)
display(joined_preview_df)

In [0]:
# Per-region totals: summed Quantity and summed sales amount
# (Quantity * UnitPrice), sorted so the largest TotalSales comes first.
line_sales = col("Quantity") * col("UnitPrice")

agg_region_df = (
    joined_df
    .groupBy("Region")
    .agg(
        _sum(col("Quantity")).alias("TotalQuantity"),
        _sum(line_sales).alias("TotalSales"),
    )
    .orderBy(col("TotalSales").desc())
)

# Because of the ordering above, this shows the (up to) five top regions.
print("Aggregated sales by region:")
display(agg_region_df.limit(5))

In [0]:
# Window function: rank customers by total sales within each region.

# Step 1: total sales (Quantity * UnitPrice) per customer within each region.
customer_sales_df = joined_df.groupBy("Region", "CustomerName").agg(
    _sum(col("Quantity") * col("UnitPrice")).alias("TotalSales")
)

# Step 2: order each region's customers by TotalSales, highest first.
# CustomerName is a secondary sort key so that row_number() assigns the same
# ranks on every run when customers tie on TotalSales; ordering by TotalSales
# alone leaves the relative order of tied rows arbitrary.
window_spec = Window.partitionBy("Region").orderBy(
    col("TotalSales").desc(),
    col("CustomerName").asc(),
)

# Step 3: SalesRank is 1 for the top customer in a region, 2 for the next, ...
ranked_df = customer_sales_df.withColumn("SalesRank", row_number().over(window_spec))

print("Top customers ranked by sales within each region:")
display(ranked_df.orderBy("Region", "SalesRank").limit(20))