In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # 02 — Feature Engineering (TPC‑DS SF1000)
# MAGIC
# MAGIC This notebook transforms Bronze TPC‑DS tables into engineered features for ML.
# MAGIC
# MAGIC **Enhancements**
# MAGIC - Uses reusable feature logic from `src/features.py`
# MAGIC - Imports the automated check `test_feature_columns_exist` from `tests/test_features.py` (imported below; not yet invoked in this notebook)
# MAGIC - Writes Silver and Gold feature tables
# MAGIC - Produces a clean ML‑ready dataset
# COMMAND ----------

# MAGIC %md
# MAGIC ## Imports & Setup

# COMMAND ----------


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Configuration

# COMMAND ----------
import sys

# Root of the repo checkout that contains the `src` and `tests` packages
# imported further down in this cell.
# NOTE(review): this is a user-specific absolute workspace path — consider
# making it configurable so other users/jobs can run the notebook.
REPO_ROOT = '/Workspace/Users/mcthurgood20@gmail.com/Databricks-MLflow-Workflow/tpc-ds'

# Guard the append so re-running this cell on a live session does not keep
# stacking duplicate entries onto sys.path.
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

# Import shared utilities
from src.features import build_customer_features
# Import tests (pytest-style functions)
from tests.test_features import test_feature_columns_exist
 
import pyspark.sql.functions as F


# Catalog and schema that every table name in this notebook is built from.
catalog, schema = "workspace", "ml_tpcds"

# Make that catalog/schema the session default, catalog first.
for use_statement in (f"USE CATALOG {catalog}", f"USE SCHEMA {schema}"):
    spark.sql(use_statement)

print(f"Using schema: {catalog}.{schema}")


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Load Bronze Tables
# COMMAND ----------

# Resolve the four Bronze inputs in one pass; the order of names on the left
# matches the order of table suffixes on the right.
store_sales, customer, item, date_dim = (
    spark.table(f"{catalog}.{schema}.{bronze_name}")
    for bronze_name in (
        "store_sales_bronze",
        "customer_bronze",
        "item_bronze",
        "date_dim_bronze",
    )
)

print("Loaded Bronze tables.")


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Build Unified Silver Sales Table
# MAGIC
# MAGIC This step remains notebook‑specific because it joins multiple tables
# MAGIC into a single sales view before feature engineering.
# COMMAND ----------

# Join predicates linking each sale (alias "s") to its customer ("c"),
# item ("i") and sold-date ("d") dimension rows.
customer_match = F.col("s.ss_customer_sk") == F.col("c.c_customer_sk")
item_match = F.col("s.ss_item_sk") == F.col("i.i_item_sk")
sold_date_match = F.col("s.ss_sold_date_sk") == F.col("d.d_date_sk")

# Columns kept in the Silver table, grouped by the table they come from.
silver_columns = [
    # customer
    "c.c_customer_sk",
    "c.c_first_name",
    "c.c_last_name",
    "c.c_birth_year",
    "c.c_birth_country",
    # item
    "i.i_category",
    "i.i_class",
    # store sales
    "s.ss_customer_sk",
    "s.ss_quantity",
    "s.ss_sales_price",
    "s.ss_ext_discount_amt",
    "s.ss_ext_list_price",
    # date dimension
    "d.d_date",
    "d.d_day_name",
    "d.d_month_seq",
    "d.d_year",
]

sales_joined = (
    store_sales.alias("s")
    .join(customer.alias("c"), on=customer_match, how="inner")
    .join(item.alias("i"), on=item_match, how="inner")
    .join(date_dim.alias("d"), on=sold_date_match, how="inner")
    # Restrict to customers born after 1980 to keep the job compute-friendly.
    .where(F.col("c.c_birth_year") > 1980)
    .select(*silver_columns)
)

# Persist the joined view as a Delta table, replacing any previous contents.
silver_table = f"{catalog}.{schema}.sales_silver"
sales_joined.write.format("delta").mode("overwrite").saveAsTable(silver_table)

print("Unified Silver sales table created.")

In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Build Customer‑Level Features (Using `src.features`)
# COMMAND ----------

# Reload the Silver sales table by name so the feature build reads the
# persisted Delta table rather than the in-memory join defined earlier.
sales_silver = spark.table(f"{catalog}.{schema}.sales_silver")

# Delegate the feature logic to the shared builder imported from src.features.
# NOTE(review): the builder is handed the unified Silver view in place of the
# raw store_sales table — confirm it only needs columns sales_silver carries.
customer_features = build_customer_features(
    store_sales=sales_silver,
    customer=customer,
    date_dim=date_dim,
)

# Fail fast, before the existing table is overwritten, if the builder no
# longer produces a column that the Gold dataset step joins on or selects.
required_feature_columns = {
    "c_customer_sk",
    "num_transactions",
    "total_quantity",
    "total_spend",
    "avg_sales_price",
    "avg_discount",
    "num_categories_bought",
    "days_since_last_purchase",
}
missing_feature_columns = required_feature_columns - set(customer_features.columns)
if missing_feature_columns:
    raise ValueError(
        f"customer_features is missing expected columns: {sorted(missing_feature_columns)}"
    )

# The table is fully rebuilt on every run, so replace its schema outright:
# `overwriteSchema` also handles dropped or retyped feature columns, whereas
# `mergeSchema` only adds new columns and would keep stale ones around.
(
    customer_features.write.format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
    .saveAsTable(f"{catalog}.{schema}.customer_features_silver")
)

print("Customer feature table created.")

In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Build Final ML‑Ready Gold Dataset
# COMMAND ----------

# Read the persisted customer features back from the Silver layer.
customer_features = spark.table(f"{catalog}.{schema}.customer_features_silver")

# Descriptive attributes taken from the customer table (alias "c").
profile_columns = [
    "c.c_customer_sk",
    "c.c_first_name",
    "c.c_last_name",
    "c.c_birth_year",
    "c.c_birth_country",
]

# Engineered features taken from the feature table (alias "f").
feature_columns = [
    "f.num_transactions",
    "f.total_quantity",
    "f.total_spend",
    "f.avg_sales_price",
    "f.avg_discount",
    "f.num_categories_bought",
    "f.days_since_last_purchase",
]

# Inner join: only customers that have a feature row reach the Gold table.
same_customer = F.col("c.c_customer_sk") == F.col("f.c_customer_sk")

ml_ready = (
    customer.alias("c")
    .join(customer_features.alias("f"), on=same_customer, how="inner")
    .select(*profile_columns, *feature_columns)
)

# Persist the result as a Delta table, replacing any previous contents.
gold_table = f"{catalog}.{schema}.customer_features_gold"
ml_ready.write.format("delta").mode("overwrite").saveAsTable(gold_table)

print("ML‑ready Gold feature table created.")


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Preview Final Features

# COMMAND ----------

# Render the first 20 rows of the ML-ready DataFrame for a quick visual check.
preview_rows = ml_ready.limit(20)
display(preview_rows)


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC # Feature Engineering Complete
# MAGIC 
# MAGIC Your Silver and Gold feature tables are now ready for model training.
