# Retail Customer Purchase Prediction
**COSC 4368 — Fundamentals of AI**

Team: Matthew Nguyen, Benjamin Tran, Victor Bui, Gustavo Buenrostro  


In [1]:
# 1) Reproducibility & environment
import os, sys, json, time, math, numpy as np, pandas as pd
from pathlib import Path
import matplotlib.pyplot as plt

import sklearn
from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve, StratifiedKFold, cross_val_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import (
    roc_auc_score, f1_score, precision_score, recall_score, confusion_matrix,
    roc_curve, auc, precision_recall_curve, classification_report, brier_score_loss,
    average_precision_score, ConfusionMatrixDisplay
)
from sklearn.calibration import calibration_curve
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

SEED = 42
np.random.seed(SEED)

# --- robust repo paths regardless of notebook location ---
NB_DIR = Path.cwd()  # where this notebook is actually running
DATA_DIR_CANDIDATES = [
    NB_DIR / "data",        # ./data  (if notebook at repo root)
    NB_DIR.parent / "data", # ../data (if notebook in ./notebook)
    Path("/mnt/data"),      # optional container path
]
for d in DATA_DIR_CANDIDATES:
    if d.exists():
        DATA_DIR = d
        break
else:
    raise FileNotFoundError(
        "Couldn't locate a data/ folder. Checked:\n  - " +
        "\n  - ".join(str(p) for p in DATA_DIR_CANDIDATES)
    )

# keep outputs under the notebook folder (matches your repo layout)
OUT_FIG_BASE = NB_DIR / "outputs" / "figures"
OUT_TAB_BASE = NB_DIR / "outputs" / "tables"
OUT_FIG_BASE.mkdir(parents=True, exist_ok=True)
OUT_TAB_BASE.mkdir(parents=True, exist_ok=True)
RUN_STAMP = time.strftime('%Y%m%d-%H%M%S')
OUT_FIG = OUT_FIG_BASE / RUN_STAMP; OUT_FIG.mkdir(parents=True, exist_ok=True)
OUT_TAB = OUT_TAB_BASE / RUN_STAMP; OUT_TAB.mkdir(parents=True, exist_ok=True)

print("sklearn:", sklearn.__version__)
print("NB_DIR:", NB_DIR)
print("DATA_DIR:", DATA_DIR)
print("Save figs →", OUT_FIG)
print("Save tables →", OUT_TAB)
plt.rcParams["figure.dpi"] = 150

# 0) Global plot polish (slide-ready)
import matplotlib as mpl
mpl.rcParams.update({
    "figure.dpi": 150,
    "savefig.dpi": 300,
    "font.size": 12,
    "axes.titlesize": 14,
    "axes.labelsize": 12,
    "legend.fontsize": 10,
    "axes.grid": True,
    "grid.alpha": 0.25,
})
def add_threshold_marker(ax, thr, label=None, ymin=0, ymax=1):
    ax.axvline(thr, linestyle="--")
    if label:
        ax.text(thr, ymax, label, ha="right", va="bottom")


sklearn: 1.7.2
NB_DIR: c:\Users\mttng\Downloads\retail-customer-purchase-prediction\notebook
DATA_DIR: c:\Users\mttng\Downloads\retail-customer-purchase-prediction\data
Save figs → c:\Users\mttng\Downloads\retail-customer-purchase-prediction\notebook\outputs\figures\20251130-232438
Save tables → c:\Users\mttng\Downloads\retail-customer-purchase-prediction\notebook\outputs\tables\20251130-232438


In [2]:
# Data Load + Audit
TARGET = "Revenue"
candidates = [
    DATA_DIR / "online_shoppers_intention.csv",
    DATA_DIR / "Online Shoppers Intention.csv",  # alt name
]
for c in candidates:
    if c.exists():
        path = c
        break
else:
    raise FileNotFoundError(f"online_shoppers_intention.csv not found in {DATA_DIR}")

print("Loading:", path)
df = pd.read_csv(path)
if df[TARGET].dtype != int:
    df[TARGET] = df[TARGET].astype(int)

audit = pd.DataFrame({
    "dtype": df.dtypes.astype(str),
    "n_null": df.isnull().sum(),
    "n_unique": df.nunique()
}).sort_index()
audit.to_csv(OUT_TAB / "data_audit.csv")
audit.head(20)


Loading: c:\Users\mttng\Downloads\retail-customer-purchase-prediction\data\online_shoppers_intention.csv


Unnamed: 0,dtype,n_null,n_unique
Administrative,int64,0,27
Administrative_Duration,float64,0,3335
BounceRates,float64,0,1872
Browser,int64,0,13
ExitRates,float64,0,4777
Informational,int64,0,17
Informational_Duration,float64,0,1258
Month,object,0,10
OperatingSystems,int64,0,8
PageValues,float64,0,2704


# EDA & Leakage Guardrails

In [3]:
# Class balance
counts = df[TARGET].value_counts().sort_index()
counts.to_csv(OUT_TAB / "class_counts.csv")
ax = counts.plot(kind="bar", title="Class Counts")
ax.figure.tight_layout(); ax.figure.savefig(OUT_FIG / "class_counts.png"); plt.close(ax.figure)

# Helpful rates if present
if "Month" in df.columns:
    pr_month = df.groupby("Month")[TARGET].mean().sort_values()
    ax = pr_month.plot(kind="bar", title="Purchase Rate by Month")
    ax.figure.tight_layout(); ax.figure.savefig(OUT_FIG / "purchase_rate_by_month.png"); plt.close(ax.figure)

if "VisitorType" in df.columns:
    pr_vtype = df.groupby("VisitorType")[TARGET].mean().sort_values()
    ax = pr_vtype.plot(kind="bar", title="Purchase Rate by VisitorType")
    ax.figure.tight_layout(); ax.figure.savefig(OUT_FIG / "purchase_rate_by_visitortype.png"); plt.close(ax.figure)

print("Leakage policy: only features known by session end; all transforms happen in Pipeline fit on TRAIN only.")


Leakage policy: only features known by session end; all transforms happen in Pipeline fit on TRAIN only.


In [4]:
#Preprocessing & split
NUM_COLS = df.drop(columns=[TARGET]).select_dtypes(include=["int64","float64","int32","float32"]).columns.tolist()
CAT_COLS = [c for c in df.drop(columns=[TARGET]).columns if c not in NUM_COLS]

pre = ColumnTransformer([
    ("num", StandardScaler(), NUM_COLS),
    ("cat", OneHotEncoder(handle_unknown="ignore"), CAT_COLS)
])

X = df.drop(columns=[TARGET]); y = df[TARGET].astype(int)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.30, stratify=y, random_state=SEED)
X_val, X_test, y_val, y_test   = train_test_split(X_temp, y_temp, test_size=0.50, stratify=y_temp, random_state=SEED)

from collections import OrderedDict
def fit_eval(pipe, name, Xtr=X_train, ytr=y_train, Xv=X_val, yv=y_val):
    pipe.fit(Xtr, ytr)
    y_score = pipe.predict_proba(Xv)[:,1] if hasattr(pipe.named_steps["clf"],"predict_proba") else pipe.decision_function(Xv)
    y_pred  = (y_score >= 0.5).astype(int)
    return pipe, OrderedDict(
        model=name,
        roc_auc_val=roc_auc_score(yv, y_score),
        f1_val=f1_score(yv, y_pred),
        precision_val=precision_score(yv, y_pred, zero_division=0),
        recall_val=recall_score(yv, y_pred)
    )

val_rows = []; fitted = {}


# Baselines (LR, DT) + quick visuals

In [5]:
# Logistic Regression
lr_pipe = Pipeline([("pre", pre), ("clf", LogisticRegression(max_iter=1000, class_weight="balanced", random_state=SEED))])
lr_pipe, m = fit_eval(lr_pipe, "LogisticRegression")
fitted["LogisticRegression"] = lr_pipe; val_rows.append(m)

# Decision Tree (small sweep)
best_dt, best_m_dt = None, None
for d in [3,5,10,None]:
    dt = Pipeline([("pre", pre), ("clf", DecisionTreeClassifier(max_depth=d, random_state=SEED))])
    dt, mdict = fit_eval(dt, f"DecisionTree(max_depth={d})")
    if best_m_dt is None or mdict["roc_auc_val"] > best_m_dt["roc_auc_val"]:
        best_dt, best_m_dt = dt, mdict
fitted["DecisionTree"] = best_dt; val_rows.append(best_m_dt)

val_df = pd.DataFrame(val_rows).sort_values("roc_auc_val", ascending=False)
val_df.to_csv(OUT_TAB / "validation_metrics.csv", index=False)
val_df

# Confusion matrix (validation) for LR
pipe = fitted["LogisticRegression"]
ys = pipe.predict_proba(X_val)[:,1] if hasattr(pipe.named_steps["clf"],"predict_proba") else pipe.decision_function(X_val)
yp = (ys >= 0.5).astype(int)
cm = confusion_matrix(y_val, yp)
fig, ax = plt.subplots()
ConfusionMatrixDisplay(cm, display_labels=[0,1]).plot(ax=ax, colorbar=False, values_format="d")
ax.set_title("Validation Confusion Matrix — LogisticRegression")
fig.tight_layout(); fig.savefig(OUT_FIG / "confusion_matrix_val_logreg.png"); plt.close(fig)

# Top |coef| for LR
pre_fit = pre.fit(X_train, y_train)
try:  cat_names = list(pre_fit.named_transformers_["cat"].get_feature_names_out(CAT_COLS))
except: cat_names = [f"{c}_{i}" for i,c in enumerate(CAT_COLS)]
feat_names = np.array(list(NUM_COLS) + cat_names)

clf_lr = fitted["LogisticRegression"].named_steps["clf"]
if hasattr(clf_lr,"coef_"):
    coefs = np.ravel(clf_lr.coef_); order = np.argsort(np.abs(coefs))[::-1][:25]
    fig, ax = plt.subplots(figsize=(6,8))
    ax.barh(range(len(order)), np.abs(coefs[order])[::-1])
    ax.set_yticks(range(len(order))); ax.set_yticklabels(feat_names[order][::-1])
    ax.set_title("Top |Coefficient| — Logistic Regression"); ax.set_xlabel("Absolute Weight")
    fig.tight_layout(); fig.savefig(OUT_FIG / "lr_top_coefs.png"); plt.close(fig)


# Ensembles (RF & GB) + feature importance

In [6]:
# Random Forest (coarse CV)
rf = RandomForestClassifier(random_state=SEED, n_estimators=300, n_jobs=-1)
rf_pipe = Pipeline([("pre", pre), ("clf", rf)])
rf_cv = GridSearchCV(rf_pipe, {"clf__max_depth":[None,10,20], "clf__min_samples_split":[2,5,10]},
                     scoring="roc_auc", cv=5, n_jobs=-1)
rf_cv.fit(X_train, y_train)
rf_best = rf_cv.best_estimator_
ys = rf_best.predict_proba(X_val)[:,1] if hasattr(rf_best.named_steps["clf"],"predict_proba") else rf_best.decision_function(X_val)
yp = (ys >= 0.5).astype(int)
val_rows.append(dict(model="RandomForest",
                     roc_auc_val=roc_auc_score(y_val, ys),
                     f1_val=f1_score(y_val, yp),
                     precision_val=precision_score(y_val, yp, zero_division=0),
                     recall_val=recall_score(y_val, yp)))
fitted["RandomForest"] = rf_best

# Feature importance (RF)
clf = rf_best.named_steps["clf"]
if hasattr(clf, "feature_importances_"):
    imps = np.asarray(clf.feature_importances_)
    order = np.argsort(imps)[::-1][:20]
    fig, ax = plt.subplots(figsize=(6,8))
    ax.barh(range(len(order)), imps[order][::-1])
    ax.set_yticks(range(len(order))); ax.set_yticklabels(feat_names[order][::-1])
    ax.set_title("Top Feature Importance — Random Forest"); ax.set_xlabel("Importance")
    fig.tight_layout(); fig.savefig(OUT_FIG / "feature_importance_top20.png"); plt.close(fig)
    
# Gradient Boosting (optional, keep if it wins)
try:
    gb = GradientBoostingClassifier(random_state=SEED)
    gb_pipe = Pipeline([("pre", pre), ("clf", gb)])
    gb_cv = GridSearchCV(gb_pipe, {"clf__learning_rate":[0.05,0.1], "clf__n_estimators":[100,200], "clf__max_depth":[3]},
                         scoring="roc_auc", cv=5, n_jobs=-1)
    gb_cv.fit(X_train, y_train)
    gb_best = gb_cv.best_estimator_
    ys = gb_best.predict_proba(X_val)[:,1] if hasattr(gb_best.named_steps["clf"],"predict_proba") else gb_best.decision_function(X_val)
    yp = (ys >= 0.5).astype(int)
    val_rows.append(dict(model="GradientBoosting",
                         roc_auc_val=roc_auc_score(y_val, ys),
                         f1_val=f1_score(y_val, yp),
                         precision_val=precision_score(y_val, yp, zero_division=0),
                         recall_val=recall_score(y_val, yp)))
    fitted["GradientBoosting"] = gb_best
except Exception as e:
    print("GB skipped:", e)

val_df = pd.DataFrame(val_rows).sort_values("roc_auc_val", ascending=False)
val_df.to_csv(OUT_TAB / "validation_metrics.csv", index=False)
val_df


Unnamed: 0,model,roc_auc_val,f1_val,precision_val,recall_val
3,GradientBoosting,0.920791,0.616279,0.691304,0.555944
2,RandomForest,0.91459,0.603696,0.731343,0.513986
1,DecisionTree(max_depth=5),0.904419,0.576613,0.680952,0.5
0,LogisticRegression,0.897592,0.599144,0.506024,0.734266


In [7]:
# MLP + learning curve with bands
mlp = MLPClassifier(hidden_layer_sizes=(64,), activation="relu", alpha=1e-4,
                    early_stopping=True, random_state=SEED, max_iter=200)
mlp_pipe = Pipeline([("pre", pre), ("clf", mlp)])
mlp_pipe, m = fit_eval(mlp_pipe, "MLP(64)")
fitted["MLP"] = mlp_pipe; val_rows.append(m)

val_df = pd.DataFrame(val_rows).sort_values("roc_auc_val", ascending=False)
val_df.to_csv(OUT_TAB / "validation_metrics.csv", index=False)
val_df

# Learning curve (+/– bands)
sizes, train_scores, val_scores = learning_curve(mlp_pipe, X_train, y_train, cv=5, scoring="roc_auc",
                                                 n_jobs=-1, train_sizes=np.linspace(0.2,1.0,5), shuffle=True, random_state=SEED)
tr_m, tr_s = train_scores.mean(axis=1), train_scores.std(axis=1)
va_m, va_s = val_scores.mean(axis=1), val_scores.std(axis=1)
fig, ax = plt.subplots()
ax.plot(sizes, tr_m, marker="o", label="Train"); ax.fill_between(sizes, tr_m-tr_s, tr_m+tr_s, alpha=0.2)
ax.plot(sizes, va_m, marker="o", label="CV");    ax.fill_between(sizes, va_m-va_s, va_m+va_s, alpha=0.2)
ax.set_title("Learning Curve — MLP"); ax.set_xlabel("Train size"); ax.set_ylabel("ROC-AUC"); ax.legend(loc="best")
fig.tight_layout(); fig.savefig(OUT_FIG / "mlp_learning_curve_bands.png"); plt.close(fig)


In [8]:
# Threshold sweep (validation) with chosen marker
val_df = pd.DataFrame(val_rows).sort_values("roc_auc_val", ascending=False)
best_name = val_df.iloc[0]["model"]; best_pipe = fitted[best_name]
y_score_val = best_pipe.predict_proba(X_val)[:,1] if hasattr(best_pipe.named_steps["clf"],"predict_proba") else best_pipe.decision_function(X_val)

ths = np.linspace(0.1, 0.9, 33); rows = []
for t in ths:
    pred = (y_score_val >= t).astype(int)
    rows.append({"threshold": t, "f1": f1_score(y_val,pred),
                 "precision": precision_score(y_val,pred,zero_division=0),
                 "recall": recall_score(y_val,pred)})
thr_df = pd.DataFrame(rows); thr_df.to_csv(OUT_TAB / "threshold_sweep_val.csv", index=False)
best_thr = float(thr_df.sort_values("f1", ascending=False).iloc[0]["threshold"])
(Path(OUT_TAB / "chosen_threshold.json")).write_text(json.dumps({"model": best_name, "threshold": best_thr}, indent=2))

fig, ax = plt.subplots()
ax.plot(thr_df["threshold"], thr_df["precision"], label="Precision")
ax.plot(thr_df["threshold"], thr_df["recall"], label="Recall")
ax.plot(thr_df["threshold"], thr_df["f1"], label="F1")
add_threshold_marker(ax, best_thr, f"chosen t={best_thr:.3f}", ymax=1.02)
ax.set_xlabel("Threshold"); ax.set_ylabel("Score"); ax.set_title(f"Threshold Sweep — {best_name} (Validation)")
ax.legend(loc="best"); fig.tight_layout(); fig.savefig(OUT_FIG / "threshold_sweep_marked.png"); plt.close(fig)

best_name, best_thr

('GradientBoosting', 0.325)

In [9]:
# Test evaluation (ROC, PR w/marker, Confusion %, Calibration/Brier)
best_meta = json.loads((OUT_TAB / "chosen_threshold.json").read_text())
best_name, best_thr = best_meta["model"], best_meta["threshold"]
pipe = fitted[best_name]

y_score_test = pipe.predict_proba(X_test)[:,1] if hasattr(pipe.named_steps["clf"],"predict_proba") else pipe.decision_function(X_test)
y_pred_test  = (y_score_test >= best_thr).astype(int)

pd.DataFrame([{
    "model": best_name, "threshold": best_thr,
    "roc_auc_test": roc_auc_score(y_test, y_score_test),
    "f1_test": f1_score(y_test, y_pred_test),
    "precision_test": precision_score(y_test, y_pred_test, zero_division=0),
    "recall_test": recall_score(y_test, y_pred_test)
}]).to_csv(OUT_TAB / "test_metrics.csv", index=False)

# ROC
fpr, tpr, _ = roc_curve(y_test, y_score_test)
fig, ax = plt.subplots()
ax.plot(fpr, tpr, label=f"AUC={auc(fpr,tpr):.3f}"); ax.plot([0,1],[0,1],"--")
ax.set_title(f"ROC Curve — Test ({best_name})"); ax.set_xlabel("FPR"); ax.set_ylabel("TPR"); ax.legend(loc="lower right")
fig.tight_layout(); fig.savefig(OUT_FIG / "roc_curve_test.png"); plt.close(fig)

# PR with chosen-threshold marker
prec, rec, thr = precision_recall_curve(y_test, y_score_test); ap = average_precision_score(y_test, y_score_test)
thr_idx = np.argmin(np.abs(thr - best_thr)) if len(thr)>0 else 0
rec_at  = rec[thr_idx+1] if len(rec)>thr_idx+1 else rec[-1]
prec_at = prec[thr_idx+1] if len(prec)>thr_idx+1 else prec[-1]
fig, ax = plt.subplots()
ax.plot(rec, prec, label=f"PR (AP={ap:.3f})"); ax.scatter([rec_at], [prec_at])
ax.set_title("Precision–Recall Curve — Test"); ax.set_xlabel("Recall"); ax.set_ylabel("Precision"); ax.legend(loc="lower left")
fig.tight_layout(); fig.savefig(OUT_FIG / "pr_curve_test_marked.png"); plt.close(fig)

# Confusion matrix with % overlay
cm = confusion_matrix(y_test, y_pred_test); cm_norm = cm / cm.sum(axis=1, keepdims=True)
fig, ax = plt.subplots()
ConfusionMatrixDisplay(cm, display_labels=[0,1]).plot(ax=ax, colorbar=False, values_format="d")
ax.set_title("Confusion Matrix — Test")
for (i,j), v in np.ndenumerate(cm_norm):
    ax.text(j, i, f"\n({v*100:.1f}%)", ha="center", va="top")
fig.tight_layout(); fig.savefig(OUT_FIG / "confusion_matrix_test_pretty.png"); plt.close(fig)

# Calibration + Brier
prob_true, prob_pred = calibration_curve(y_test, y_score_test, n_bins=10, strategy="quantile")
fig, ax = plt.subplots()
ax.plot(prob_pred, prob_true, marker="o"); ax.plot([0,1],[0,1],"--")
ax.set_title("Reliability (Calibration) — Test"); ax.set_xlabel("Predicted probability"); ax.set_ylabel("Observed frequency")
brier = brier_score_loss(y_test, y_score_test); ax.text(0.62, 0.08, f"Brier={brier:.3f}", transform=ax.transAxes)
fig.tight_layout(); fig.savefig(OUT_FIG / "calibration_curve_test_brier.png"); plt.close(fig)
(Path(OUT_TAB / "brier_score.txt")).write_text(str(brier))


19

In [10]:
# Interpretability (feature importances/coeffs)
pre_fit = pre.fit(X_train, y_train)
try:  cat_names = list(pre_fit.named_transformers_["cat"].get_feature_names_out(CAT_COLS))
except: cat_names = [f"{c}_{i}" for i,c in enumerate(CAT_COLS)]
feat_names = np.array(list(NUM_COLS) + cat_names)

clf = pipe.named_steps["clf"]
vals = np.asarray(clf.feature_importances_) if hasattr(clf,"feature_importances_") else (np.abs(np.ravel(clf.coef_)) if hasattr(clf,"coef_") else None)
if vals is not None:
    order = np.argsort(vals)[::-1][:25]
    fig, ax = plt.subplots(figsize=(6,8))
    ax.barh(range(len(order)), vals[order][::-1])
    ax.set_yticks(range(len(order))); ax.set_yticklabels(feat_names[order][::-1])
    ax.set_title(f"Top Feature Signals — {best_name}"); ax.set_xlabel("Importance / |Coefficient|")
    fig.tight_layout(); fig.savefig(OUT_FIG / "feature_importance_top25.png"); plt.close(fig)


In [11]:
# Ablations (BehavioralOnly vs Temporal/TechOnly)
BEHAVIORAL = [c for c in ["Administrative","Administrative_Duration","Informational","Informational_Duration",
                          "ProductRelated","ProductRelated_Duration","BounceRates","ExitRates","PageValues","SpecialDay"] if c in X.columns]
TEMPORAL_TECH = [c for c in ["Month","Weekend","OperatingSystems","Browser","Region","TrafficType","VisitorType"] if c in X.columns]

def make_pre(cols_keep):
    num = [c for c in cols_keep if c in NUM_COLS]
    cat = [c for c in cols_keep if c in CAT_COLS]
    return ColumnTransformer([("num", StandardScaler(), num), ("cat", OneHotEncoder(handle_unknown="ignore"), cat)])

def eval_group(cols_keep, label):
    from sklearn.base import clone
    pre_g = make_pre(cols_keep)
    # reuse the best family
    base = fitted[best_name]
    clf = clone(base.named_steps["clf"])
    pipe_g = Pipeline([("pre", pre_g), ("clf", clf)])
    _, m = fit_eval(pipe_g, f"{best_name} | {label}")
    return m

abl_rows = [{"group":"Full","roc_auc_val": float(pd.DataFrame(val_rows).set_index("model").loc[best_name,"roc_auc_val"])}]
if BEHAVIORAL:    abl_rows.append({"group":"BehavioralOnly",   "roc_auc_val": eval_group(BEHAVIORAL, "BehavioralOnly")["roc_auc_val"]})
if TEMPORAL_TECH: abl_rows.append({"group":"TemporalTechOnly","roc_auc_val": eval_group(TEMPORAL_TECH, "TemporalTechOnly")["roc_auc_val"]})

abl_df = pd.DataFrame(abl_rows); abl_df.to_csv(OUT_TAB / "ablations.csv", index=False)
base = float(abl_df.loc[abl_df["group"]=="Full","roc_auc_val"])
abl_df2 = abl_df.copy(); abl_df2["delta_auc"] = abl_df2["roc_auc_val"] - base
ax = abl_df2.set_index("group")["delta_auc"].plot(kind="bar", title="Ablation ΔAUC vs Full")
ax.figure.tight_layout(); ax.figure.savefig(OUT_FIG / "ablation_delta_auc.png"); plt.close(ax.figure)
abl_df


  base = float(abl_df.loc[abl_df["group"]=="Full","roc_auc_val"])


Unnamed: 0,group,roc_auc_val
0,Full,0.920791
1,BehavioralOnly,0.883948
2,TemporalTechOnly,0.703958


# Business visuals: cumulative gains, lift, per-1k; exports

In [13]:
# Export helper funcs
def _save(fig, name):
    p = OUT_FIG / f"{name}.png"
    fig.savefig(p, dpi=300, bbox_inches="tight")
    plt.close(fig)
    return str(p)

def export_all(model, X_test, y_test, feat_names, prefix="purchase_pred", thr=None):
    paths = {}
    # ROC
    y_score = model.predict_proba(X_test)[:,1] if hasattr(model.named_steps["clf"],"predict_proba") else model.decision_function(X_test)
    fpr, tpr, _ = roc_curve(y_test, y_score)
    fig, ax = plt.subplots(); ax.plot(fpr,tpr,label=f"AUC={auc(fpr,tpr):.3f}"); ax.plot([0,1],[0,1],"--")
    ax.set_title("ROC Curve"); ax.set_xlabel("FPR"); ax.set_ylabel("TPR"); ax.legend(loc="lower right")
    paths["roc"] = _save(fig, f"{prefix}_roc_test")
    # PR
    prec, rec, thrv = precision_recall_curve(y_test, y_score); ap = average_precision_score(y_test, y_score)
    fig, ax = plt.subplots(); ax.plot(rec, prec, label=f"AP={ap:.3f}")
    if thr is not None and len(thrv)>0:
        idx = np.argmin(np.abs(thrv - thr)); ax.scatter([rec[idx+1]],[prec[idx+1]])
    ax.set_title("Precision–Recall Curve"); ax.set_xlabel("Recall"); ax.set_ylabel("Precision"); ax.legend(loc="lower left")
    paths["pr"] = _save(fig, f"{prefix}_pr_test")
    # CM
    y_pred = (y_score >= (thr if thr is not None else 0.5)).astype(int)
    cm = confusion_matrix(y_test, y_pred)
    fig, ax = plt.subplots(); ConfusionMatrixDisplay(cm, display_labels=[0,1]).plot(ax=ax, colorbar=False, values_format="d")
    ax.set_title("Confusion Matrix"); paths["cm"] = _save(fig, f"{prefix}_cm_test")
    # Calibration
    prob_true, prob_pred = calibration_curve(y_test, y_score, n_bins=10, strategy="quantile")
    fig, ax = plt.subplots(); ax.plot(prob_pred, prob_true, marker="o"); ax.plot([0,1],[0,1],"--")
    ax.set_title("Reliability (Calibration)"); ax.set_xlabel("Predicted probability"); ax.set_ylabel("Observed frequency")
    ax.text(0.6,0.1,f"Brier={brier_score_loss(y_test, y_score):.3f}", transform=ax.transAxes)
    paths["cal"] = _save(fig, f"{prefix}_calibration_test")
    # Feature importance / coeffs
    clf = model.named_steps["clf"]
    vals = np.asarray(clf.feature_importances_) if hasattr(clf,"feature_importances_") else (np.abs(np.ravel(clf.coef_)) if hasattr(clf,"coef_") else None)
    if vals is not None:
        order = np.argsort(vals)[::-1][:25]
        fig, ax = plt.subplots(figsize=(6,8)); ax.barh(range(len(order)), vals[order][::-1])
        ax.set_yticks(range(len(order))); ax.set_yticklabels(np.array(feat_names)[order][::-1])
        ax.set_title("Top Signals"); ax.set_xlabel("Importance / |Coefficient|")
        paths["feature_importance"] = _save(fig, f"{prefix}_feature_importance")
    (OUT_TAB / f"{prefix}_classification_report.json").write_text(
        json.dumps(classification_report(y_test, y_pred, output_dict=True), indent=2),
        encoding="utf-8"
    )
    return paths

# Names again
pre_fit = pre.fit(X_train, y_train)
try:
    cat_names = list(pre_fit.named_transformers_["cat"].get_feature_names_out(CAT_COLS))
except:
    cat_names = [f"{c}_{i}" for i,c in enumerate(CAT_COLS)]
feat_names = list(NUM_COLS) + cat_names

export_paths = export_all(pipe, X_test, y_test, feat_names, prefix="purchase_pred", thr=best_thr)
(OUT_TAB / "export_paths.json").write_text(
    json.dumps(export_paths, indent=2, ensure_ascii=False),
    encoding="utf-8"
)

# Cumulative gains + lift
order = np.argsort(-y_score_test); y_true_sorted = y_test.to_numpy()[order]
cum_pos = np.cumsum(y_true_sorted); total_pos = y_true_sorted.sum()
percent_samples = np.arange(1, len(y_true_sorted)+1) / len(y_true_sorted)
gain = cum_pos / total_pos
fig, ax = plt.subplots()
ax.plot(percent_samples, gain, label="Model"); ax.plot([0,1],[0,1], "--", label="Random")
ax.set_xlabel("Cumulative share of sessions"); ax.set_ylabel("Cumulative share of buyers captured")
ax.set_title("Cumulative Gains — Test"); ax.legend(loc="lower right")
fig.tight_layout(); fig.savefig(OUT_FIG / "cumulative_gains.png"); plt.close(fig)

decile = max(1, int(0.1*len(y_true_sorted))); buyers_top10 = y_true_sorted[:decile].sum()
lift_top10 = (buyers_top10 / decile) / y_test.mean()
(OUT_TAB / "lift_top10.txt").write_text(
    f"Top-10% lift ≈ {lift_top10:.2f}×",
    encoding="utf-8"
)

# Per-1k sessions text
alerts_rate = (y_pred_test == 1).mean()
tp_rate = ((y_pred_test == 1) & (y_test == 1)).mean()
(Path(OUT_TAB / "per_1k.txt")).write_text(
    f"Per 1,000 sessions → Alerts: {alerts_rate*1000:.0f}, True buyers captured: {tp_rate*1000:.0f}",
    encoding="utf-8"
)


59

In [14]:
# Slice error analysis (where FPs/FNs cluster) 
def slice_error_table(frame, y_true, y_pred, by):
    df_ = pd.DataFrame({"y": y_true, "yhat": y_pred, by: frame[by]})
    g = df_.groupby(by).agg(
        n=("y","size"), pos=("y","sum"),
        fp=("yhat", lambda s: ((s==1) & (df_.loc[s.index,"y"]==0)).sum()),
        fn=("yhat", lambda s: ((s==0) & (df_.loc[s.index,"y"]==1)).sum()),
    )
    g["fp_rate"] = g["fp"] / g["n"]; g["fn_rate"] = g["fn"] / g["n"]
    return g.sort_values("fp_rate", ascending=False)

slices = [c for c in ["TrafficType","Region","Month","VisitorType"] if c in X_test.columns]
for col in slices:
    tab = slice_error_table(X_test, y_test.to_numpy(), y_pred_test, col)
    tab.head(10).to_csv(OUT_TAB / f"slice_errors_{col}.csv")
    top = tab.head(8)
    fig, ax = plt.subplots(); ax.barh(top.index.astype(str), top["fp_rate"]); ax.invert_yaxis()
    ax.set_xlabel("False Positive Rate"); ax.set_title(f"Top FP Slices — {col}")
    fig.tight_layout(); fig.savefig(OUT_FIG / f"slice_fp_{col}.png"); plt.close(fig)


In [15]:
# CV leaderboard with error bars
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=SEED)
rows = []
from sklearn.base import clone
for mname, p in fitted.items():
    scores = cross_val_score(clone(p), X_train, y_train, cv=cv, scoring="roc_auc", n_jobs=-1)
    rows.append({"model": mname, "auc_mean": scores.mean(), "auc_sd": scores.std()})
cv_df = pd.DataFrame(rows).sort_values("auc_mean", ascending=False)
cv_df.to_csv(OUT_TAB / "cv_leaderboard_auc.csv", index=False)
fig, ax = plt.subplots(figsize=(6,4))
ax.barh(cv_df["model"], cv_df["auc_mean"], xerr=cv_df["auc_sd"]); ax.invert_yaxis()
ax.set_xlabel("ROC-AUC (mean ± sd, 5-fold)"); ax.set_title("Validation Leaderboard (with variability)")
fig.tight_layout(); fig.savefig(OUT_FIG / "cv_leaderboard_auc.png"); plt.close(fig)
cv_df


Unnamed: 0,model,auc_mean,auc_sd
3,GradientBoosting,0.93279,0.007533
2,RandomForest,0.930706,0.008746
4,MLP,0.915896,0.007334
1,DecisionTree,0.911799,0.014636
0,LogisticRegression,0.905606,0.010718


In [17]:
# Bundle slide assets (ordered filenames)
REPO_ROOT = NB_DIR if (NB_DIR / "presentations").exists() else (
    NB_DIR.parent if (NB_DIR.parent / "presentations").exists() else NB_DIR
)
ASSET_DIR = REPO_ROOT / "presentations" / "assets"
ASSET_DIR.mkdir(parents=True, exist_ok=True)

ordered = [
    "class_counts.png","cv_leaderboard_auc.png","threshold_sweep_marked.png","roc_curve_test.png",
    "pr_curve_test_marked.png","confusion_matrix_test_pretty.png","calibration_curve_test_brier.png",
    "feature_importance_top25.png","mlp_learning_curve_bands.png","cumulative_gains.png","ablation_delta_auc.png"
]

i = 1
for name in ordered:
    src = OUT_FIG / name
    if src.exists():
        (ASSET_DIR / f"{i:02d}_{name}").write_bytes(src.read_bytes())
        i += 1

print("Assets exported →", ASSET_DIR.resolve())


Assets exported → C:\Users\mttng\Downloads\retail-customer-purchase-prediction\presentations\assets
