In [1]:
# Model Evaluation for Purchase Cluster Prediction
# This notebook evaluates multiple feature sets and models for predicting purchase clusters.

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, StratifiedKFold, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import os


In [2]:
# 1. Load Data

df = pd.read_csv('../featured_customer_segmentation_with_clusters.csv')


In [3]:
# 2. Define Feature Sets (tweak as desired)
feature_sets = {
    'v1': ['Income', 'Age', 'Education', 'Total_Dependents'],
    'v2': ['Income', 'Age', 'Education', 'Teenhome', 'Kidhome'],
    'v3': ['Income', 'Age', 'Education',
           'Marital_Together', 'Marital_Single', 'Marital_Divorced', 'Marital_Widow', 'Marital_Married',
           'Total_Dependents'],
    'v4': ['Income', 'Age', 'Education', 'Kidhome'],
    'v5': ['Income', 'Age', 'Education', 'weighted_total_dependents']
}

# Ensure weighted_total_dependents exists
if 'weighted_total_dependents' not in df.columns:
    df['weighted_total_dependents'] = df['Kidhome'] * 2 + df['Teenhome']

# Target
y = df['PurchaseCluster']


In [4]:
# 3. Model Evaluation Loop
param_grid_rf = {
    'n_estimators': [100, 300],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}
param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

results = []

for variant, feats in feature_sets.items():
    print(f"\n=== Variant: {variant} | Features: {feats} ===")
    X = df[feats]
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    # Logistic Regression
    grid_lr = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)
    grid_lr.fit(X_train_s, y_train)
    best_lr = grid_lr.best_estimator_
    lr_preds = best_lr.predict(X_test_s)
    lr_acc = accuracy_score(y_test, lr_preds) * 100
    lr_cv_acc = cross_val_score(best_lr, scaler.fit_transform(X), y, scoring='accuracy', cv=kf)
    print(f"LogisticRegression CV Accuracy: {lr_cv_acc.mean() * 100:.2f}% ± {lr_cv_acc.std() * 100:.2f}%")
    print(f"LogisticRegression Test Accuracy: {lr_acc:.2f}%")
    print("Best LR Params:", grid_lr.best_params_)
    print("Confusion Matrix (LR):\n", confusion_matrix(y_test, lr_preds))

    # Random Forest
    grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)
    grid_rf.fit(X_train_s, y_train)
    best_rf = grid_rf.best_estimator_
    rf_preds = best_rf.predict(X_test_s)
    rf_acc = accuracy_score(y_test, rf_preds) * 100
    rf_cv_acc = cross_val_score(best_rf, scaler.fit_transform(X), y, scoring='accuracy', cv=kf)
    print(f"RandomForest CV Accuracy: {rf_cv_acc.mean() * 100:.2f}% ± {rf_cv_acc.std() * 100:.2f}%")
    print(f"RandomForest Test Accuracy: {rf_acc:.2f}%")
    print("Best RF Params:", grid_rf.best_params_)
    print("Confusion Matrix (RF):\n", confusion_matrix(y_test, rf_preds))

    results.append({
        'Variant': variant,
        'LR_CV_Acc': lr_cv_acc.mean() * 100,
        'LR_Test_Acc': lr_acc,
        'RF_CV_Acc': rf_cv_acc.mean() * 100,
        'RF_Test_Acc': rf_acc
    })



=== Variant: v1 | Features: ['Income', 'Age', 'Education', 'Total_Dependents'] ===
LogisticRegression CV Accuracy: 77.10% ± 1.46%
LogisticRegression Test Accuracy: 78.35%
Best LR Params: {'C': 10, 'penalty': 'l2', 'solver': 'lbfgs'}
Confusion Matrix (LR):
 [[178  14   7]
 [ 29  60  17]
 [  3  27 113]]
RandomForest CV Accuracy: 77.54% ± 0.54%
RandomForest Test Accuracy: 80.58%
Best RF Params: {'max_depth': None, 'min_samples_leaf': 2, 'min_samples_split': 2, 'n_estimators': 100}
Confusion Matrix (RF):
 [[180  14   5]
 [ 26  60  20]
 [  2  20 121]]

=== Variant: v2 | Features: ['Income', 'Age', 'Education', 'Teenhome', 'Kidhome'] ===
LogisticRegression CV Accuracy: 77.68% ± 1.64%
LogisticRegression Test Accuracy: 79.69%
Best LR Params: {'C': 1, 'penalty': 'l2', 'solver': 'lbfgs'}
Confusion Matrix (LR):
 [[180  12   7]
 [ 25  62  19]
 [  3  25 115]]
RandomForest CV Accuracy: 78.08% ± 1.39%
RandomForest Test Accuracy: 80.36%
Best RF Params: {'max_depth': 10, 'min_samples_leaf': 1, 'min_sa

In [5]:
# 4. Results Summary
results_df = pd.DataFrame(results)
print("\n=== Summary of All Variants ===")
print(results_df)



=== Summary of All Variants ===
  Variant  LR_CV_Acc  LR_Test_Acc  RF_CV_Acc  RF_Test_Acc
0      v1  77.098214    78.348214  77.544643    80.580357
1      v2  77.678571    79.687500  78.080357    80.357143
2      v3  77.633929    78.794643  77.812500    81.696429
3      v4  76.071429    77.232143  77.187500    79.464286
4      v5  76.830357    77.678571  77.544643    80.803571


In [6]:
# 5. Save Best Model (example: best RF from v3)
best_variant = 'v3'
X = df[feature_sets[best_variant]]
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
best_rf = RandomForestClassifier(**grid_rf.best_params_, random_state=42)
best_rf.fit(X_scaled, y)

os.makedirs('model', exist_ok=True)
joblib.dump(best_rf, 'model/random_forest_v3_model.joblib')
joblib.dump(scaler, 'model/scaler_v3.joblib')

print(f"Best model and scaler for {best_variant} saved.")


Best model and scaler for v3 saved.
