# DataAnalytics — PySpark **DataFrame API** 

This notebook mirrors the analyses from *DataAnalytics.ipynb* but uses the **PySpark DataFrame API**.  
It assumes the **Gold** layer is available either as catalog tables/views or as Parquet under `GCS_BASE/gold/*`. The first cell normalizes the inputs into three DataFrames:
`dim_customers`, `dim_products`, `fact_sales`.

In [None]:
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window
# Get (or lazily create) the Spark session used by every cell in this notebook.
spark = (
    SparkSession.builder
    .appName("gold-analytics-dfapi")
    .getOrCreate()
)

# Source resolution: catalog tables/views are tried first; Parquet under
# GCS_BASE/gold is the fallback (override the root via the GCS_BASE env var).
import os
GCS_BASE = os.environ.get("GCS_BASE", "/tmp/dw_medallion")

def get_df(primary_table, fallback_view, parquet_name):
    """Resolve one Gold-layer dataset to a DataFrame.

    Sources are tried in order: the catalog name ``primary_table``, then the
    catalog name ``fallback_view``, then Parquet files at
    ``{GCS_BASE}/gold/{parquet_name}``. The first source that can be read wins.

    Args:
        primary_table: Preferred table/view name passed to ``spark.table``.
        fallback_view: Alternative table/view name tried if the first fails.
        parquet_name: Folder name under ``{GCS_BASE}/gold`` for the Parquet fallback.

    Returns:
        The DataFrame for the first source that could be read.

    Raises:
        RuntimeError: If neither catalog name nor the Parquet path is readable.
            The message lists the catalog lookup failures and the original
            Parquet error is attached as ``__cause__``.
    """
    catalog_errors = []
    for name in (primary_table, fallback_view):
        try:
            df = spark.table(name)
            # Run a tiny job so an unreadable source fails here rather than
            # in a later cell.
            df.limit(1).count()
        except Exception as exc:
            # Best-effort probe: remember why this name failed, try the next source.
            catalog_errors.append(f"{name}: {exc}")
            continue
        return df

    # Parquet fallback
    path = f"{GCS_BASE}/gold/{parquet_name}"
    try:
        return spark.read.parquet(path)
    except Exception as e:
        raise RuntimeError(
            f"Could not find {primary_table}/{fallback_view} or {path}. Error: {e}"
            f" Catalog lookup errors: {'; '.join(catalog_errors)}"
        ) from e

# Resolve the three Gold datasets (catalog table -> fallback view -> Parquet folder).
dim_customers = get_df(
    primary_table="gold.dim_customers",
    fallback_view="gold_dim_customers",
    parquet_name="dim_customers",
)
dim_products = get_df(
    primary_table="gold.dim_products",
    fallback_view="gold_dim_products",
    parquet_name="dim_products",
)
fact_sales = get_df(
    primary_table="gold.fact_sales",
    fallback_view="gold_fact_sales",
    parquet_name="fact_sales",
)

# Row counts double as a smoke test that each source is actually readable.
n_customers = dim_customers.count()
n_products = dim_products.count()
n_fact_rows = fact_sales.count()
print("DFs ready:", n_customers, "customers |", n_products, "products |", n_fact_rows, "fact rows")

## 02 — Dimensions Exploration

**Unique countries (customer origins).**

In [None]:
# Distinct customer countries, alphabetically (first 50 rows, untruncated).
unique_countries = dim_customers.select(F.col("country")).dropDuplicates()
unique_countries.sort(F.asc("country")).show(50, truncate=False)

**Unique category → subcategory → product triples.**

In [None]:
# Distinct (category, subcategory, product) combinations, sorted along the hierarchy.
hierarchy_cols = ["category", "subcategory", "product_name"]
(dim_products
 .select(*hierarchy_cols)
 .distinct()
 .orderBy(*hierarchy_cols)
 .show(50, truncate=False))

## 03 — Date Range Exploration

**First & last order date; coverage in months.**

In [None]:
# Single-row frame holding the earliest and latest order dates.
minmax = fact_sales.agg(
    F.min("order_date").alias("first_order_date"),
    F.max("order_date").alias("last_order_date"),
)

# Whole months between the two dates (fractional part dropped by floor).
months_covered = F.floor(
    F.months_between(F.col("last_order_date"), F.col("first_order_date"))
)
res = minmax.withColumn("order_range_months", months_covered)
res.show(truncate=False)

**Youngest & oldest customer (by birthdate) + approximate ages.**

In [None]:
# The earliest birthdate belongs to the oldest customer, the latest to the youngest.
minmax = dim_customers.agg(
    F.min("birthdate").alias("oldest_birthdate"),
    F.max("birthdate").alias("youngest_birthdate"),
)


def _age_in_years(birthdate_col):
    """Approximate age today: whole months since the birthdate, divided by 12 and floored."""
    return F.floor(F.months_between(F.current_date(), F.col(birthdate_col)) / 12)


res = minmax.select(
    "*",
    _age_in_years("oldest_birthdate").alias("oldest_age"),
    _age_in_years("youngest_birthdate").alias("youngest_age"),
)
res.show(truncate=False)

## 04 — Measures Exploration

**Total sales, total quantity, average price.**

In [None]:
# Headline measures computed in a single pass over the fact table.
sales_measures = [
    F.sum("sales_amount").alias("total_sales"),
    F.sum("quantity").alias("total_quantity"),
    F.avg("price").alias("avg_price"),
]
fact_sales.agg(*sales_measures).show(truncate=False)

**Total orders (raw & distinct).**

In [None]:
# Raw count = fact rows carrying an order number; distinct count = unique order numbers.
order_no = F.col("order_number")
order_counts = fact_sales.agg(
    F.count(order_no).alias("total_orders"),
    F.countDistinct(order_no).alias("total_orders_distinct"),
)
order_counts.show(truncate=False)

**Total products (distinct names).**

In [None]:
# Number of distinct product names in the product dimension.
distinct_product_names = F.countDistinct("product_name").alias("total_products")
dim_products.agg(distinct_product_names).show(truncate=False)

**Total customers (dimension rows) & active customers (placed ≥1 order).**

In [None]:
# All customers known to the dimension (rows with a non-null key).
total_customers = dim_customers.agg(F.count("customer_key").alias("total_customers"))
total_customers.show(truncate=False)

# Customers that appear on at least one fact row.
active_customers = fact_sales.agg(F.countDistinct("customer_key").alias("active_customers"))
active_customers.show(truncate=False)

**One-row KPI rollup.**

In [None]:
from pyspark.sql import functions as F

def _as_double(expr, label):
    """Cast an aggregate to double (stack() needs one common value type) and name it."""
    return expr.cast("double").alias(label)


# 1) One aggregation per source table, each yielding a single row.
fs_metrics = fact_sales.agg(
    _as_double(F.sum("sales_amount"), "Total Sales"),
    _as_double(F.sum("quantity"), "Total Quantity"),
    _as_double(F.avg("price"), "Average Price"),
    _as_double(F.countDistinct("order_number"), "Total Orders"),
)
prod_metrics = dim_products.agg(
    _as_double(F.countDistinct("product_name"), "Total Products")
)
cust_metrics = dim_customers.agg(
    _as_double(F.count("customer_key"), "Total Customers")
)

# 2) Each input has exactly one row, so the cross joins give one wide row.
all_metrics = fs_metrics.crossJoin(prod_metrics).crossJoin(cust_metrics)

# 3) Unpivot the wide row into (measure_name, measure_value) pairs with stack().
measure_labels = [
    "Total Sales",
    "Total Quantity",
    "Average Price",
    "Total Orders",
    "Total Products",
    "Total Customers",
]
stack_args = ", ".join(f"'{label}', `{label}`" for label in measure_labels)
kpis = all_metrics.selectExpr(
    f"stack({len(measure_labels)}, {stack_args}) as (measure_name, measure_value)"
)

# Display copy: values rendered as strings with thousands separators and 2 decimals.
kpis_display = kpis.withColumn(
    "measure_value",
    F.format_number(F.col("measure_value"), 2),
)

kpis_display.show(truncate=False)

## 05 — Magnitude Analysis

**Customers by country & gender.**

In [None]:
# Same customer count, sliced first by country and then by gender (largest groups first).
for dimension in ("country", "gender"):
    (dim_customers
     .groupBy(dimension)
     .agg(F.count("customer_key").alias("total_customers"))
     .orderBy(F.col("total_customers").desc())
     .show(50, truncate=False))

**Products by category & average cost by category.**

In [None]:
# Both summaries share the same grouping by category.
products_by_category = dim_products.groupBy("category")

# Product count per category, largest first.
(products_by_category
 .agg(F.count("product_key").alias("total_products"))
 .orderBy(F.col("total_products").desc())
 .show(50, truncate=False))

# Average product cost per category, most expensive first.
(products_by_category
 .agg(F.avg("cost").alias("avg_cost"))
 .orderBy(F.col("avg_cost").desc())
 .show(50, truncate=False))

**Revenue by category.**

In [None]:
# Left join keeps every fact row; sales without a matching product land in a NULL category.
sales_with_products = fact_sales.alias("f").join(
    dim_products.alias("p"),
    on=F.col("p.product_key") == F.col("f.product_key"),
    how="left",
)
(sales_with_products
 .groupBy("p.category")
 .agg(F.sum("f.sales_amount").alias("total_revenue"))
 .orderBy(F.col("total_revenue").desc())
 .show(50, truncate=False))

**Revenue by customer.**

In [None]:
# Attach customer attributes to each fact row (left join: unmatched sales are kept).
sales_with_customers = fact_sales.alias("f").join(
    dim_customers.alias("c"),
    on=F.col("c.customer_key") == F.col("f.customer_key"),
    how="left",
)

# Total revenue per customer, highest spenders first.
customer_identity = ["c.customer_key", "c.first_name", "c.last_name"]
(sales_with_customers
 .groupBy(*customer_identity)
 .agg(F.sum("f.sales_amount").alias("total_revenue"))
 .orderBy(F.col("total_revenue").desc())
 .show(50, truncate=False))

**Units sold by country.**

In [None]:
# Units sold per customer country; sales with no matching customer fall under NULL.
sales_by_country = (
    fact_sales.alias("f")
    .join(
        dim_customers.alias("c"),
        on=F.col("c.customer_key") == F.col("f.customer_key"),
        how="left",
    )
    .groupBy("c.country")
    .agg(F.sum("f.quantity").alias("total_sold_items"))
)
sales_by_country.orderBy(F.col("total_sold_items").desc()).show(50, truncate=False)

## 06 — Ranking Analysis

**Top 5 products by revenue (simple and via RANK window).**

In [None]:
# Revenue per product name (left join: sales without a matching product get a NULL name).
prod_rev = (
    fact_sales.alias("f")
    .join(
        dim_products.alias("p"),
        on=F.col("p.product_key") == F.col("f.product_key"),
        how="left",
    )
    .groupBy("p.product_name")
    .agg(F.sum("f.sales_amount").alias("total_revenue"))
)

# Simple variant: sort by revenue and keep the first five rows.
prod_rev.orderBy(F.col("total_revenue").desc()).limit(5).show(truncate=False)

# Window variant: RANK() assigns tied revenues the same rank, so the filter
# can return more than five rows.
w = Window.orderBy(F.col("total_revenue").desc())
ranked_products = prod_rev.withColumn("rank_products", F.rank().over(w))
(ranked_products
 .where(F.col("rank_products") <= 5)
 .orderBy("rank_products", "product_name")
 .show(truncate=False))

**Bottom 5 products by revenue; Top 10 customers by revenue; 3 customers with fewest orders.**

In [None]:
# Bottom 5 products: same per-product revenue frame, sorted ascending.
prod_rev.orderBy(F.col("total_revenue").asc()).limit(5).show(truncate=False)

# Top 10 customers by total revenue.
cust_rev = (
    fact_sales.alias("f")
    .join(
        dim_customers.alias("c"),
        on=F.col("c.customer_key") == F.col("f.customer_key"),
        how="left",
    )
    .groupBy("c.customer_key", "c.first_name", "c.last_name")
    .agg(F.sum("f.sales_amount").alias("total_revenue"))
)
cust_rev.orderBy(F.col("total_revenue").desc()).limit(10).show(truncate=False)

# 3 customers with the fewest distinct orders; names are looked up after aggregating.
cust_orders = fact_sales.groupBy("customer_key").agg(
    F.countDistinct("order_number").alias("total_orders")
)
fewest_orders = (
    cust_orders
    .join(dim_customers, on="customer_key", how="left")
    .orderBy(F.col("total_orders").asc())
    .select("customer_key", "first_name", "last_name", "total_orders")
    .limit(3)
)
fewest_orders.show(truncate=False)

## 07 — Change Over Time Analysis

**Year/Month rollup and month truncation & labels.**

In [None]:
# Rows without an order date cannot be placed on the timeline.
dated_sales = fact_sales.where(F.col("order_date").isNotNull())

# The same three measures are reported for every time grain below.
period_measures = [
    F.sum("sales_amount").alias("total_sales"),
    F.countDistinct("customer_key").alias("total_customers"),
    F.sum("quantity").alias("total_quantity"),
]

# Grain 1: separate year and month-number columns.
(dated_sales
 .groupBy(
     F.year("order_date").alias("order_year"),
     F.month("order_date").alias("order_month"),
 )
 .agg(*period_measures)
 .orderBy("order_year", "order_month")
 .show(100, truncate=False))

# Grain 2: a single column holding the first instant of each month.
(dated_sales
 .groupBy(F.date_trunc("month", "order_date").alias("order_month"))
 .agg(*period_measures)
 .orderBy("order_month")
 .show(100, truncate=False))

# Monthly rollup with a human-readable "yyyy-MMM" label.
# Sorting by the label itself would be alphabetical within a year (Apr, Aug, Dec, ...),
# so the truncated month is carried along purely as a chronological sort key and
# dropped again before display. Label and month start map 1:1, so the groups are
# the same as when grouping by the label alone.
(fact_sales
 .filter(F.col("order_date").isNotNull())
 .groupBy(F.date_trunc("month", "order_date").alias("order_month_start"),
          F.date_format("order_date", "yyyy-MMM").alias("order_month_label"))
 .agg(F.sum("sales_amount").alias("total_sales"),
      F.countDistinct("customer_key").alias("total_customers"),
      F.sum("quantity").alias("total_quantity"))
 .orderBy("order_month_start")
 .select("order_month_label", "total_sales", "total_customers", "total_quantity")
 .show(100, truncate=False))

## 08 — Cumulative Analysis

**Yearly sales with running total and moving (cumulative, from the first year onward) average price.**

In [None]:
# One row per calendar year: total sales and the average price in that year.
yearly = (
    fact_sales
    .where(F.col("order_date").isNotNull())
    .groupBy(F.date_trunc("year", "order_date").alias("order_year"))
    .agg(
        F.sum("sales_amount").alias("total_sales"),
        F.avg("price").alias("avg_price"),
    )
)

# Frame: every year from the first one up to and including the current row.
w = (
    Window
    .orderBy("order_year")
    .rowsBetween(Window.unboundedPreceding, Window.currentRow)
)

cumulative = yearly.select(
    "*",
    F.sum("total_sales").over(w).alias("running_total_sales"),
    F.avg("avg_price").over(w).alias("moving_average_price"),
)
cumulative.orderBy("order_year").show(100, truncate=False)

## 09 — Performance Analysis (YoY & vs Average)

**Yearly product sales with YoY deltas and above/below-average flags.**

In [None]:
# Sales per (year, product); fact rows without an order date are left out.
yps = (
    fact_sales.alias("f")
    .join(
        dim_products.alias("p"),
        on=F.col("f.product_key") == F.col("p.product_key"),
        how="left",
    )
    .where(F.col("f.order_date").isNotNull())
    .groupBy(F.year("f.order_date").alias("order_year"), F.col("p.product_name"))
    .agg(F.sum("f.sales_amount").alias("current_sales"))
)

# Per-product windows: ordered by year (for LAG) and unordered (for the overall average).
w_prod = Window.partitionBy("product_name").orderBy("order_year")
w_prod_all_years = Window.partitionBy("product_name")

# Comparison against the product's own average across all of its years.
vs_average = (
    yps
    .withColumn("avg_sales", F.avg("current_sales").over(w_prod_all_years))
    .withColumn("diff_avg", F.col("current_sales") - F.col("avg_sales"))
    .withColumn(
        "avg_change",
        F.when(F.col("diff_avg") > 0, "Above Avg")
         .when(F.col("diff_avg") < 0, "Below Avg")
         .otherwise("Avg"),
    )
)

# Comparison against the previous row of the same product. A product's first
# year has no previous row, so diff_py is NULL and falls through to "No Change".
vs_previous_year = (
    vs_average
    .withColumn("py_sales", F.lag("current_sales").over(w_prod))
    .withColumn("diff_py", F.col("current_sales") - F.col("py_sales"))
    .withColumn(
        "py_change",
        F.when(F.col("diff_py") > 0, "Increase")
         .when(F.col("diff_py") < 0, "Decrease")
         .otherwise("No Change"),
    )
)

vs_previous_year.orderBy("product_name", "order_year").show(200, truncate=False)

## 10 — Data Segmentation

**Product cost bands (counts by segment).**

In [None]:
# Label each product with a cost band. Bands are checked top to bottom; anything
# that matches none of them (including a NULL cost) ends up in "Above 1000".
cost = F.col("cost")
cost_band = (
    F.when(cost < 100, "Below 100")
     .when(cost.between(100, 500), "100-500")
     .when((cost > 500) & (cost <= 1000), "500-1000")
     .otherwise("Above 1000")
)
segments = dim_products.withColumn("cost_range", cost_band)

# Number of products per band, largest band first.
band_counts = segments.groupBy("cost_range").agg(
    F.count("product_key").alias("total_products")
)
band_counts.orderBy(F.col("total_products").desc()).show(truncate=False)

**Customer tenure & spend segmentation (VIP, Regular, New).**

In [None]:
# Per-customer spend plus the whole months between first and last order.
first_order = F.min("order_date")
last_order = F.max("order_date")
cust_spend = fact_sales.groupBy("customer_key").agg(
    F.sum("sales_amount").alias("total_spending"),
    first_order.alias("first_order"),
    last_order.alias("last_order"),
    F.floor(F.months_between(last_order, first_order)).alias("lifespan_months"),
)

# VIP: at least 12 months of history and spend above 5000.
# Regular: at least 12 months of history and spend of 5000 or less.
# New: everyone else.
has_long_history = F.col("lifespan_months") >= 12
spending = F.col("total_spending")
segment_label = (
    F.when(has_long_history & (spending > 5000), "VIP")
     .when(has_long_history & (spending <= 5000), "Regular")
     .otherwise("New")
)
seg = cust_spend.withColumn("customer_segment", segment_label)

# Customers per segment, largest segment first.
segment_sizes = seg.groupBy("customer_segment").agg(
    F.count("customer_key").alias("total_customers")
)
segment_sizes.orderBy(F.col("total_customers").desc()).show(truncate=False)

## 11 — Part-to-Whole Analysis

**Category share of total sales.**

In [None]:
# Sales per product category (left join keeps sales with no matching product).
cat_sales = (
    fact_sales.alias("f")
    .join(
        dim_products.alias("p"),
        on=F.col("p.product_key") == F.col("f.product_key"),
        how="left",
    )
    .groupBy("p.category")
    .agg(F.sum("f.sales_amount").alias("total_sales"))
)

# Unpartitioned frame spanning every row, so each category sees the grand total.
w_all = Window.rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

with_grand_total = cat_sales.withColumn("overall_sales", F.sum("total_sales").over(w_all))
share_pct = F.round((F.col("total_sales") / F.col("overall_sales")) * 100, 2)
res = (
    with_grand_total
    .withColumn("percentage_of_total", share_pct)
    .orderBy(F.col("total_sales").desc())
)
res.show(truncate=False)