In [0]:
# Databricks notebook source
# MAGIC %md
# MAGIC # 03 — Train Model (TPC‑DS Customer Spend Prediction)
# MAGIC
# MAGIC This notebook now uses:
# MAGIC - Reusable training logic from `src/train.py`
# MAGIC - Automated tests from `tests/test_train.py`
# MAGIC - MLflow experiment tracking
# MAGIC
# MAGIC The output is a fully logged MLflow run ready for Model Registry.


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Configuration

# COMMAND ----------

import mlflow
import mlflow.sklearn
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Catalog and schema that every table reference in this notebook is built from.
catalog, schema = "workspace", "ml_tpcds"

# Switch the Spark session to that catalog, then to the schema inside it.
spark.sql(f"USE CATALOG {catalog}")
spark.sql(f"USE SCHEMA {schema}")

# Echo the active location so the run output records where data came from.
print(f"Using schema: {catalog}.{schema}")


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Load ML-Ready Gold Table

# COMMAND ----------

# Read the Gold feature table by its fully qualified name.
gold_df = spark.table(f"{catalog}.{schema}.customer_features_gold")

# count() is a Spark action; keep the result in a name so the message below
# reads clearly.
row_count = gold_df.count()
print(f"Loaded {row_count:,} rows from Gold table.")

# Preview a handful of rows rather than the whole table.
display(gold_df.limit(10))


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Prepare train/test

# COMMAND ----------

# Model inputs and the column the model learns to predict.
features = [
    'num_transactions',
    'total_quantity',
    'avg_sales_price',
    'avg_discount',
    'num_categories_bought',
    'days_since_last_purchase',
]
target = 'total_spend'

# Bring only the modelling columns into pandas.
gold_pd = gold_df.select(features + [target]).toPandas()

# These columns arrive as Decimal values; cast them to float for serialization.
decimal_columns = ['avg_sales_price', 'avg_discount', 'total_spend']
gold_pd = gold_pd.astype({name: float for name in decimal_columns})

# 80/20 hold-out split with a fixed seed so the split is reproducible.
X, y = gold_pd[features], gold_pd[target]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Train Model with MLflow

# COMMAND ----------
mlflow.set_experiment("/Shared/tpcds_ml_experiment")

from mlflow.models import infer_signature

# Hyperparameters are declared once so the values used to build the model and
# the values logged to MLflow cannot drift apart.
rf_params = {
    "n_estimators": 200,
    "max_depth": 10,
    "random_state": 42,
}

# Build the frame used for the model signature and input example.
# Integer feature columns are cast to float FIRST and only then given missing
# values: writing a missing value straight into an integer column relies on
# silent upcasting, which newer pandas releases deprecate. Declaring these
# columns as float in the signature lets callers send rows with missing values.
int_feature_cols = X_train.select_dtypes(include='int').columns
X_train_with_missing = X_train.astype({col: 'float' for col in int_feature_cols})

# The same ~1% of rows (fixed seed) is blanked in every integer-derived column;
# the sample is loop-invariant, so it is drawn once.
missing_rows = X_train_with_missing.sample(frac=0.01, random_state=42).index
for col in int_feature_cols:
    X_train_with_missing.loc[missing_rows, col] = float("nan")

with mlflow.start_run(run_name="rf_customer_spend"):

    # Model
    model = RandomForestRegressor(**rf_params)
    model.fit(X_train, y_train)

    # Predictions on the held-out set
    preds = model.predict(X_test)

    # Metrics: RMSE is derived from MSE by hand because the `squared` argument
    # is not supported by the scikit-learn version in use.
    mse = mean_squared_error(y_test, preds)
    rmse = mse ** 0.5

    # Log params + metrics (params come from the same dict the model was built with)
    mlflow.log_params(rf_params)
    mlflow.log_metric("rmse", rmse)

    # Signature: inputs from the float-cast frame, outputs from the predictions
    # already computed above (avoids a second pass over the full training set).
    signature = infer_signature(X_train_with_missing, preds)
    input_example = X_train_with_missing.iloc[:5]

    # Log model
    mlflow.sklearn.log_model(
        sk_model=model,
        artifact_path="model",
        registered_model_name=None,  # registry happens in next notebook
        signature=signature,
        input_example=input_example
    )

    print(f"RMSE: {rmse:.4f}")

In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC ## Evaluate Model

# COMMAND ----------

import matplotlib.pyplot as plt

# Predicted vs. actual spend on the held-out set, drawn through the explicit
# Figure/Axes interface instead of the implicit pyplot state machine.
fig, ax = plt.subplots()
ax.scatter(y_test, preds, alpha=0.3)
ax.set_xlabel("Actual Spend")
ax.set_ylabel("Predicted Spend")
ax.set_title("Predicted vs Actual Spend")
plt.show()


In [0]:
# COMMAND ----------
# MAGIC %md
# MAGIC # Model Training Complete
# MAGIC 
# MAGIC Your model is now logged in MLflow.  
# MAGIC Next step: Register it in the Model Registry and promote it to Staging/Production.
