In [None]:
import json

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV, KFold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# kfold


In [None]:
# Load the clustered customer table used by every later cell.
df = pd.read_csv('../featured_customer_segmentation_with_clusters.csv')

# Columns whose pairwise Pearson correlation is inspected (target included).
features = [
    'Income', 'Age', 'Total_Dependents', 'Tenure_Days',
    'Teenhome', 'Kidhome', 'Education',
    'Marital_Together', 'Marital_Single', 'Marital_Divorced', 'Marital_Widow',
    'PurchaseCluster'
]
corr = df[features].corr()

# Hide the diagonal and upper triangle: the matrix is symmetric, so only the
# lower triangle carries information.
mask = np.triu(np.ones(corr.shape, dtype=bool))

# Heatmap options gathered in one place; 'vlag' is a diverging palette
# centred on zero ('coolwarm' is a drop-in alternative).
heatmap_options = dict(
    mask=mask,
    annot=True,
    fmt='.2f',
    linewidths=.5,
    square=True,
    cbar_kws={'shrink': .8, 'label': 'Pearson ρ'},
    vmin=-1, vmax=1,
    center=0,
    cmap='vlag',
)

plt.figure(figsize=(14, 12))
sns.heatmap(corr, **heatmap_options)

# Title and tick styling.
plt.title('Feature Correlation Matrix (Lower Triangle)', fontsize=18, pad=20)
plt.xticks(rotation=45, ha='right', fontsize=12)
plt.yticks(rotation=0, fontsize=12)

# Reading guide placed in the bottom-left corner of the figure.
legend_text = (
    "■ Strong positive (> 0.7)\n"
    "■ Strong negative (< -0.7)\n"
    "■ Near zero: weak/no linear relation"
)
plt.figtext(
    0.01, 0.01,
    legend_text,
    fontsize=10,
    bbox=dict(facecolor='white', alpha=0.8),
)

plt.tight_layout()
plt.show()

# Raw numbers behind the figure.
print(corr)


In [None]:




# Baseline comparison: for each candidate feature set, score an untuned
# Logistic Regression and a Random Forest with 5-fold CV, then report
# hold-out accuracy (the Random Forest is grid-searched on the training split).

# -- Compute derived feature --
# Kidhome counts double relative to Teenhome in this combined score.
df['weighted_total_dependents'] = df['Kidhome'] * 2 + df['Teenhome']

# -- Feature variants --
feature_sets = {
    'v1': ['Income', 'Age', 'Education', 'Total_Dependents'],
    'v2': ['Income', 'Age', 'Education', 'Teenhome', 'Kidhome'],
    'v3': ['Income', 'Age', 'Education',
           'Marital_Together', 'Marital_Single', 'Marital_Divorced', 'Marital_Widow',
           'Total_Dependents', 'Teenhome', 'Kidhome'],
    'v4': ['Income', 'Age', 'Education', 'Kidhome'],
    'v5': ['Income', 'Age', 'Education', 'weighted_total_dependents']
}

y = df['PurchaseCluster']

# -- Hyperparameter grid for RF --
param_grid_rf = {
    'n_estimators': [100, 300],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# -- Cross-validation setup --
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# -- Storage for summary results --
summary = []

# -- Explore both Logistic Regression and RF Classifier for each variant --
for variant, feats in feature_sets.items():
    print(f"\n=== Variant: {variant} | Features: {feats} ===")
    X = df[feats]

    # Split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Scale: statistics come from the training split only.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    # -- Logistic Regression --
    lr = LogisticRegression(max_iter=1000)
    # CV Accuracy. The scaler lives inside the pipeline so it is re-fitted on
    # the training part of every fold; scaling all of X up front would leak
    # held-out rows into the mean/std and would also refit `scaler` above.
    lr_cv_acc = cross_val_score(make_pipeline(StandardScaler(), lr), X, y, scoring='accuracy', cv=kf)
    print(f"LogisticRegression CV Accuracy: {lr_cv_acc.mean() * 100:.2f}% ± {lr_cv_acc.std() * 100:.2f}%")
    # Test-set Accuracy (cross_val_score works on clones, so `lr` is still unfitted here)
    lr.fit(X_train_s, y_train)
    lr_preds = lr.predict(X_test_s)
    lr_acc = accuracy_score(y_test, lr_preds) * 100
    print(f"LogisticRegression Test Accuracy: {lr_acc:.2f}%")

    # -- Random Forest Classifier --
    rf = RandomForestClassifier(random_state=42)
    # CV accuracy of the default forest, with per-fold scaling as above.
    rf_cv_acc = cross_val_score(make_pipeline(StandardScaler(), rf), X, y, scoring='accuracy', cv=kf)
    print(f"RandomForest CV Accuracy: {rf_cv_acc.mean() * 100:.2f}% ± {rf_cv_acc.std() * 100:.2f}%")

    # Test accuracy is reported for the grid-searched forest, not the default one.
    grid = GridSearchCV(rf, param_grid_rf, cv=5, scoring='accuracy', n_jobs=-1)
    grid.fit(X_train_s, y_train)
    best_rf = grid.best_estimator_
    rf_preds = best_rf.predict(X_test_s)
    rf_acc = accuracy_score(y_test, rf_preds) * 100
    print(f"RandomForest Test Accuracy: {rf_acc:.2f}%")
    print(f"Best RF Params: {grid.best_params_}")

    # Save summary
    summary.append({
        'Variant': variant,
        'LR_CV_Acc': lr_cv_acc.mean() * 100,
        'LR_Test_Acc': lr_acc,
        'RF_CV_Acc': rf_cv_acc.mean() * 100,
        'RF_Test_Acc': rf_acc
    })

# -- Summary DataFrame --
summary_df = pd.DataFrame(summary)
print("\n=== Summary of All Variants (Percentage Accuracy) ===")
print(summary_df)


In [None]:

# Tuned comparison: same feature variants as before, but with log-transformed
# Income, stratified splits, and grid-searched Logistic Regression and
# Random Forest models.

# -- Weighted dependents feature --
df['weighted_total_dependents'] = df['Kidhome'] * 2 + df['Teenhome']

# -- Log-transform skewed Income --
# The transform goes into a derived frame instead of overwriting df['Income'].
# An in-place overwrite compounds every time it is re-applied to df (re-running
# this cell, or any other cell that logs df['Income']), silently producing
# log1p(log1p(Income)).
df_log = df.assign(Income=np.log1p(df['Income']))

# -- Feature sets (Education is ordinal) --
feature_sets = {
    'v1': ['Income', 'Age', 'Education', 'Total_Dependents'],
    'v2': ['Income', 'Age', 'Education', 'Teenhome', 'Kidhome'],
    'v3': ['Income', 'Age', 'Education',
           'Marital_Together', 'Marital_Single', 'Marital_Divorced', 'Marital_Widow', 'Marital_Married',
           'Total_Dependents'],
    'v4': ['Income', 'Age', 'Education', 'Kidhome'],
    'v5': ['Income', 'Age', 'Education', 'weighted_total_dependents']
}

y = df_log['PurchaseCluster']

# -- Hyperparameter grids --
param_grid_rf = {
    'n_estimators': [100, 300],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

param_grid_lr = {
    'C': [0.01, 0.1, 1, 10],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'liblinear']
}

# -- Stratified Cross-validation --
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

summary = []

for variant, feats in feature_sets.items():
    print(f"\n=== Variant: {variant} | Features: {feats} ===")
    X = df_log[feats]

    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

    # Scaling: statistics come from the training split only.
    scaler = StandardScaler()
    X_train_s = scaler.fit_transform(X_train)
    X_test_s = scaler.transform(X_test)

    # -- Logistic Regression (tuned) --
    grid_lr = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_lr, cv=kf, scoring='accuracy', n_jobs=-1)
    grid_lr.fit(X_train_s, y_train)
    best_lr = grid_lr.best_estimator_
    lr_preds = best_lr.predict(X_test_s)
    lr_acc = accuracy_score(y_test, lr_preds) * 100
    # CV on the full data with the scaler inside the pipeline, so each fold is
    # standardised from its own training part. Scaling all of X up front would
    # leak held-out rows and refit `scaler`. cross_val_score clones the
    # pipeline, so the fitted `best_lr` is left untouched.
    lr_cv_acc = cross_val_score(make_pipeline(StandardScaler(), best_lr), X, y, scoring='accuracy', cv=kf)
    print(f"LogisticRegression CV Accuracy: {lr_cv_acc.mean() * 100:.2f}% ± {lr_cv_acc.std() * 100:.2f}%")
    print(f"LogisticRegression Test Accuracy: {lr_acc:.2f}%")
    print("Best LR Params:", grid_lr.best_params_)

    # Confusion matrix for LR
    print("confusion_matrix for lr:")

    print(confusion_matrix(y_test, lr_preds))

    # -- Random Forest Classifier (tuned) --
    grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)
    grid_rf.fit(X_train_s, y_train)
    best_rf = grid_rf.best_estimator_
    rf_preds = best_rf.predict(X_test_s)
    rf_acc = accuracy_score(y_test, rf_preds) * 100
    # Same per-fold scaling for the forest's CV estimate.
    rf_cv_acc = cross_val_score(make_pipeline(StandardScaler(), best_rf), X, y, scoring='accuracy', cv=kf)
    print(f"RandomForest CV Accuracy: {rf_cv_acc.mean() * 100:.2f}% ± {rf_cv_acc.std() * 100:.2f}%")
    print(f"RandomForest Test Accuracy: {rf_acc:.2f}%")
    print("Best RF Params:", grid_rf.best_params_)

    print("confusion_matrix for rf:")
    print(confusion_matrix(y_test, rf_preds))

    # -- Store results --
    summary.append({
        'Variant': variant,
        'LR_CV_Acc': lr_cv_acc.mean() * 100,
        'LR_Test_Acc': lr_acc,
        'RF_CV_Acc': rf_cv_acc.mean() * 100,
        'RF_Test_Acc': rf_acc
    })

# -- Summary Results --
summary_df = pd.DataFrame(summary)
print("\n=== Summary of All Variants (Final) ===")
print(summary_df)


In [None]:

# Final model: grid-searched Random Forest on the v3 feature set. This cell
# produces `best_rf`, `scaler`, `grid_rf`, `rf_acc`, `rf_cv_acc` and
# `features_v3`.

# -- Reload the raw data --
# Start again from the CSV so that log1p is applied to Income exactly once,
# regardless of whether df['Income'] has already been transformed in place
# (or this cell is simply re-run). Logging an already-logged column would
# train the model on log1p(log1p(Income)).
df = pd.read_csv('../featured_customer_segmentation_with_clusters.csv')

# -- Log-transform skewed Income --
df['Income'] = np.log1p(df['Income'])

# -- Weighted dependents feature --
# Not part of the v3 feature set; recomputed so df keeps the same columns it
# had before the reload.
df['weighted_total_dependents'] = df['Kidhome'] * 2 + df['Teenhome']

# -- v3 Feature set --
features_v3 = [
    'Income', 'Age', 'Education',
    'Marital_Together', 'Marital_Single', 'Marital_Divorced', 'Marital_Widow', 'Marital_Married',
    'Total_Dependents'
]
X = df[features_v3]
y = df['PurchaseCluster']

# -- Hyperparameter grid for Random Forest --
param_grid_rf = {
    'n_estimators': [100, 300],
    'max_depth': [None, 10],
    'min_samples_split': [2, 5],
    'min_samples_leaf': [1, 2]
}

# -- Stratified Cross-validation --
kf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# -- Train/Test Split --
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

# -- Scaling --
# `scaler` is fitted on the training split only and must stay that way: it is
# the scaler `best_rf` was trained with, so it is the one to reuse or persist
# alongside the model.
scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.transform(X_test)

# -- Random Forest Classifier (tuned) --
grid_rf = GridSearchCV(RandomForestClassifier(random_state=42), param_grid_rf, cv=kf, scoring='accuracy', n_jobs=-1)
grid_rf.fit(X_train_s, y_train)
best_rf = grid_rf.best_estimator_
rf_preds = best_rf.predict(X_test_s)
rf_acc = accuracy_score(y_test, rf_preds) * 100

# CV estimate on the full data. A fresh scaler inside the pipeline is fitted
# per fold, which avoids both leaking held-out rows into the scaling and
# refitting `scaler` on all rows (that would leave it inconsistent with
# `best_rf`). cross_val_score clones the pipeline, so `best_rf` is unchanged.
rf_cv_acc = cross_val_score(make_pipeline(StandardScaler(), best_rf), X, y, scoring='accuracy', cv=kf)

print(f"RandomForest CV Accuracy: {rf_cv_acc.mean() * 100:.2f}% ± {rf_cv_acc.std() * 100:.2f}%")
print(f"RandomForest Test Accuracy: {rf_acc:.2f}%")
print("Best RF Params:", grid_rf.best_params_)

print("confusion_matrix for rf:")
print(confusion_matrix(y_test, rf_preds))

# -- Summary Results --
summary = [{
    'Variant': 'v3',
    'RF_CV_Acc': rf_cv_acc.mean() * 100,
    'RF_Test_Acc': rf_acc
}]
summary_df = pd.DataFrame(summary)
print("\n=== Summary for v3 Random Forest ===")
print(summary_df)



In [None]:
import os
import joblib

# Description of the run, written next to the binary artifacts: feature list,
# target column, chosen hyperparameters, and both accuracy figures (percent).
metadata = {
    'features': features_v3,
    'target': 'PurchaseCluster',
    'model_type': 'Random Forest',
    'hyperparameters': grid_rf.best_params_,
    'accuracy': rf_acc,
    'cv_accuracy': rf_cv_acc.mean() * 100
}

# All artifacts go into ./model under the current working directory.
path = f"{os.getcwd()}/model"
os.makedirs(path, exist_ok=True)

# Fitted estimator and the scaler object, one joblib file each.
joblib.dump(best_rf, f"{path}/random_forest_v3_model.joblib")
joblib.dump(scaler, f"{path}/scaler_v3.joblib")

# Human-readable metadata as indented JSON.
with open(f"{path}/model_metadata_v3.json", 'w') as f:
    json.dump(metadata, f, indent=4)
