In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # 03 — Train Model (TPC‑DS Customer Spend Prediction)
# MAGIC 
# MAGIC This notebook trains a machine learning model using the engineered features
# MAGIC created in the previous step.
# MAGIC 
# MAGIC **Goals**
# MAGIC - Load Gold feature table
# MAGIC - Prepare train/test datasets
# MAGIC - Train a regression model (Random Forest)
# MAGIC - Track experiments with MLflow
# MAGIC - Log parameters, metrics, and artifacts
# MAGIC 
# MAGIC The output of this notebook will be a fully logged MLflow run that can be
# MAGIC registered in the Model Registry.

# COMMAND ----------


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Configuration

# COMMAND ----------

import matplotlib.pyplot as plt
import mlflow
import mlflow.sklearn
import pandas as pd

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Catalog and schema names that every later cell uses to qualify table names.
catalog, schema = "workspace", "ml_tpcds"

# Switch the Spark session to that catalog, then to that schema.
for use_statement in (f"USE CATALOG {catalog}", f"USE SCHEMA {schema}"):
    spark.sql(use_statement)

print(f"Using schema: {catalog}.{schema}")


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Load ML-Ready Gold Table

# COMMAND ----------

# Build the fully qualified table name once, then read it as a Spark DataFrame.
gold_table_name = f"{catalog}.{schema}.customer_features_gold"
gold_df = spark.table(gold_table_name)

# Report the row count and preview only the first ten rows.
gold_row_count = gold_df.count()
print(f"Loaded {gold_row_count:,} rows from Gold table.")
display(gold_df.limit(10))


In [0]:
# Collect the Gold table into a pandas DataFrame for quick inspection.
# NOTE(review): toPandas() materialises every row on the driver; confirm the
# table is small enough for that.
pdf = gold_df.toPandas()

# Only a cell's final expression is rendered automatically, so the preview has
# to be displayed explicitly; a bare `pdf.head()` here is computed and discarded.
display(pdf.head())
pdf.shape


In [0]:
# Column names of the collected table, shown as an unordered set.
{*pdf.columns}


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Prepare train/test

# COMMAND ----------

# Modelling columns: the regression target and the predictors used to learn it.
target = 'total_spend'
features = [
    'num_transactions',
    'total_quantity',
    'avg_sales_price',
    'avg_discount',
    'num_categories_bought',
    'days_since_last_purchase',
]

# Select only the modelling columns in Spark before converting to pandas.
model_columns = [*features, target]
gold_pd = gold_df.select(model_columns).toPandas()

# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable.
X, y = gold_pd[features], gold_pd[target]
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Train Model with MLflow

# COMMAND ----------
mlflow.set_experiment("/Shared/tpcds_ml_experiment")

# Single source of truth for the hyperparameters: the same dict configures the
# estimator and is logged to MLflow, so the logged values cannot drift from the
# values the model was actually trained with.
rf_params = {
    "n_estimators": 200,
    "max_depth": 10,
    "random_state": 42,
}

with mlflow.start_run(run_name="rf_customer_spend"):

    # Model choice: a Random Forest is used because the relationship between the
    # features and spend is expected to be non-linear and interaction-heavy,
    # which a linear regression would not capture. It also needs far less
    # tuning than a boosted model such as XGBoost.
    model = RandomForestRegressor(**rf_params)
    model.fit(X_train, y_train)

    # Predictions on the held-out test split
    preds = model.predict(X_test)

    # RMSE is computed as sqrt(MSE). The `squared=False` argument of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
    # so taking the square root explicitly works on every version.
    rmse = mean_squared_error(y_test, preds) ** 0.5

    # Log params + metrics
    mlflow.log_params(rf_params)
    mlflow.log_metric("rmse", rmse)

    # Log the fitted model as a run artifact without registering it
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        registered_model_name=None,  # registry happens in next notebook
    )

    print(f"RMSE: {rmse:.4f}")

In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Evaluate Model

# COMMAND ----------

# Predicted-vs-actual scatter for the held-out test split.
# `plt` comes from the notebook's top-level import cell rather than a mid-notebook
# import, so every dependency is declared in one place.
fig, ax = plt.subplots()
ax.scatter(y_test, preds, alpha=0.3)  # low alpha keeps overlapping points readable
ax.set_xlabel("Actual Spend")
ax.set_ylabel("Predicted Spend")
ax.set_title("Predicted vs Actual Spend")
plt.show()


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC # Model Training Complete
# MAGIC 
# MAGIC Your model is now logged in MLflow.  
# MAGIC Next step: Register it in the Model Registry and promote it to Staging/Production.
