# Model Training Notebook
## Model Development and Evaluation

In [135]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import shap
import joblib
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

# Single seed, passed as `random_state` to the estimators defined later in this notebook.
RANDOM_STATE = 42
# Also seed NumPy's legacy global RNG for any code that draws from it implicitly.
np.random.seed(RANDOM_STATE)

In [136]:
# Load the pre-split, preprocessed data (the split itself is not done here)
def load_processed_data(data_dir='../data'):
    """Load the preprocessed train/test splits plus the raw dataset and sanity-check them.

    Parameters
    ----------
    data_dir : str, default '../data'
        Directory containing ``X_train_processed.csv``, ``X_test_processed.csv``,
        ``y_train.csv``, ``y_test.csv`` and ``car_purchasing.csv``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, original_df)``. The feature
        matrices and ``original_df`` are DataFrames; both targets are
        flattened to 1-D NumPy arrays.

    Raises
    ------
    FileNotFoundError
        Re-raised, after printing a hint, when any of the files is missing.
    ValueError
        If a feature matrix and its target differ in length, or the train and
        test matrices do not have the same columns in the same order.
    """
    try:
        X_train = pd.read_csv(os.path.join(data_dir, 'X_train_processed.csv'))
        X_test = pd.read_csv(os.path.join(data_dir, 'X_test_processed.csv'))
        y_train = pd.read_csv(os.path.join(data_dir, 'y_train.csv')).values.ravel()
        y_test = pd.read_csv(os.path.join(data_dir, 'y_test.csv')).values.ravel()
        original_df = pd.read_csv(
            os.path.join(data_dir, 'car_purchasing.csv'), encoding='ISO-8859-1'
        )
    except FileNotFoundError as e:
        print(f" Error: {e}")
        print("Run preprocessing notebook first!")
        raise

    # Verify the splits are mutually consistent before anything is trained on them.
    for split_name, features, target in (
        ('train', X_train, y_train),
        ('test', X_test, y_test),
    ):
        if len(features) != len(target):
            raise ValueError(
                f"{split_name} split is inconsistent: "
                f"{len(features)} feature rows vs {len(target)} target values"
            )
    if list(X_train.columns) != list(X_test.columns):
        raise ValueError("Train and test feature matrices have different columns")

    print(" Data loaded successfully")
    print(f"Train shape: {X_train.shape}, Test shape: {X_test.shape}")
    return X_train, X_test, y_train, y_test, original_df

# Bind the loaded splits and the raw frame to the notebook-level names the cells below use.
X_train, X_test, y_train, y_test, original_df = load_processed_data()


 Data loaded successfully
Train shape: (400, 204), Test shape: (100, 204)


In [137]:
def remove_outliers(X, y, threshold=5):
    """Drop rows whose absolute z-score reaches ``threshold`` in any feature.

    Z-scores are computed per column from the column mean and sample standard
    deviation of ``X``. Columns with zero (or undefined) standard deviation are
    left out of the check: their z-scores are NaN, and since ``NaN < threshold``
    is False they would otherwise mark every row as an outlier.

    Parameters
    ----------
    X : pandas.DataFrame
        Numeric feature matrix.
    y : array-like
        Targets aligned with the rows of ``X`` by position.
    threshold : float, default 5
        Rows are kept only when every checked column has ``|z| < threshold``.
        The default is deliberately more lenient than the common value of 3.

    Returns
    -------
    tuple
        ``(filtered_X, filtered_y)``. If the filter would remove every row, a
        warning is printed and the original ``(X, y)`` is returned instead.
    """
    spread = X.std()
    # Only columns that actually vary can produce a meaningful z-score.
    informative = spread[spread > 0].index
    centered = X[informative] - X[informative].mean()
    z_scores = (centered / spread[informative]).abs()

    # Positional boolean array, so `y` is filtered by position rather than by index label.
    keep = (z_scores < threshold).all(axis=1).to_numpy()
    filtered_X = X[keep]
    filtered_y = y[keep]

    if len(filtered_X) == 0:
        print(" Warning: All data points were considered outliers. Using original data.")
        return X, y
    return filtered_X, filtered_y

In [138]:
# Candidate regressors keyed by display name. Insertion order matters: the
# cells below iterate this dict, so it fixes the order of fitting and reporting.
models = {}

# NOTE(review): the recorded outputs show this pipeline with a hugely negative
# R²; presumably the wide design matrix is ill-conditioned for plain least
# squares — confirm and consider a regularized linear model.
models['Linear Regression'] = make_pipeline(StandardScaler(), LinearRegression())

models['Decision Tree'] = DecisionTreeRegressor(max_depth=5, random_state=RANDOM_STATE)

models['Random Forest'] = RandomForestRegressor(
    n_estimators=100, random_state=RANDOM_STATE, n_jobs=-1
)

models['XGBoost'] = XGBRegressor(n_estimators=100, random_state=RANDOM_STATE)

# Features are standardized before the MLP; the target is passed through unscaled.
models['Neural Network'] = make_pipeline(
    StandardScaler(),
    MLPRegressor(
        hidden_layer_sizes=(100, 50),
        max_iter=1000,
        early_stopping=True,
        random_state=RANDOM_STATE,
    ),
)


In [139]:
# Build the outlier-filtered training split in this cell. It was previously only
# assigned in a later cell, so on a fresh kernel every fit below raised NameError
# (swallowed by the except) and the final sort failed on an empty frame.
X_train_clean, y_train_clean = remove_outliers(X_train, y_train)

results = []
print("\n Training Models:")
for name, model in models.items():
    try:
        # Training on the filtered split
        model.fit(X_train_clean, y_train_clean)

        # Evaluation on the untouched hold-out split
        y_pred = model.predict(X_test)
        metrics = {
            'Model': name,
            'R²': r2_score(y_test, y_pred),
            'MAE': mean_absolute_error(y_test, y_pred),
            'RMSE': np.sqrt(mean_squared_error(y_test, y_pred))
        }
        results.append(metrics)
        print(f" {name:15} | R²: {metrics['R²']:.3f}")

    except Exception as e:
        # Best-effort: report the failing model and keep going with the rest.
        print(f" {name:15} | Error: {str(e)}")

# Without any successful fit there is no 'R²' column to sort on; fail with a clear message.
if not results:
    raise RuntimeError("No model could be trained; see the errors printed above.")

results_df = pd.DataFrame(results)
print("\n📊 Final Performance:")
print(results_df.sort_values('R²', ascending=False))


 Training Models:
 Linear Regression | R²: -777330423364793.125
 Decision Tree   | R²: 0.729
 Random Forest   | R²: 0.948
 XGBoost         | R²: 0.955
 Neural Network  | R²: -0.613

📊 Final Performance:
               Model            R²           MAE          RMSE
3            XGBoost  9.547906e-01  1.609642e+03  2.209389e+03
2      Random Forest  9.479975e-01  1.643472e+03  2.369573e+03
1      Decision Tree  7.286662e-01  4.386714e+03  5.412649e+03
4     Neural Network -6.131107e-01  7.869651e+03  1.319745e+04
0  Linear Regression -7.773304e+14  1.327610e+11  2.897082e+11




In [140]:
# 5-fold cross-validated R² for every candidate on the filtered training split.
print("\n Cross-Validation Results:")
cv_results = []
for name, model in models.items():
    try:
        fold_scores = cross_val_score(
            model,
            X_train_clean,
            y_train_clean,
            cv=5,
            scoring='r2',
            n_jobs=-1,
        )
        mean_r2 = fold_scores.mean()
        std_r2 = fold_scores.std()
        cv_results.append(
            {'Model': name, 'CV R² Mean': mean_r2, 'CV R² Std': std_r2}
        )
        print(f"{name:15} | CV R²: {mean_r2:.3f} ± {std_r2:.3f}")
    except Exception as e:
        # A failing model is reported and skipped so the others are still scored.
        print(f" {name:15} | CV Error: {str(e)}")

# One row per model that was scored successfully.
cv_df = pd.DataFrame(cv_results)



 Cross-Validation Results:
Linear Regression | CV R²: -22770434587662450688.000 ± 17934942749649610752.000
Decision Tree   | CV R²: 0.703 ± 0.047
Random Forest   | CV R²: 0.923 ± 0.021
XGBoost         | CV R²: 0.928 ± 0.013
Neural Network  | CV R²: -0.165 ± 0.409


In [141]:
# Bar chart of hold-out R² per model, written to disk and closed (not shown inline).
# savefig does not create missing directories, so make sure the target exists.
os.makedirs('../reports/figures', exist_ok=True)

fig, ax = plt.subplots(figsize=(12, 6))
sns.barplot(x='Model', y='R²', data=results_df, ax=ax)
ax.set_title('Model Comparison: R² Scores')
ax.tick_params(axis='x', labelrotation=45)
# The axis is fixed to [0, 1]: any model with a negative R² shows no visible bar.
ax.set_ylim(0, 1)
fig.tight_layout()
fig.savefig('../reports/figures/model_r2_scores.png', dpi=300)
plt.close(fig)


In [142]:
# Apply outlier removal
X_train_clean, y_train_clean = remove_outliers(X_train, y_train)

# K-means clustering on the two financial columns, standardized so both weigh equally
print("\n👥 Customer Segmentation...")
segment_cols = ['annual Salary', 'net worth']
segment_scaler = StandardScaler()
scaled_features = segment_scaler.fit_transform(X_train_clean[segment_cols])
kmeans = KMeans(n_clusters=4, random_state=RANDOM_STATE)
clusters = kmeans.fit_predict(scaled_features)

# The centroids live in standardized space, while the scatter below uses the
# columns' own units. Map them back so the markers land on their clusters.
cluster_centers = segment_scaler.inverse_transform(kmeans.cluster_centers_)

# Visualization (saved to disk; savefig does not create missing directories)
os.makedirs('../reports/figures', exist_ok=True)
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(
    x=X_train_clean['annual Salary'],
    y=X_train_clean['net worth'],
    hue=clusters,
    palette='viridis',
    alpha=0.7,
    ax=ax
)
ax.scatter(
    cluster_centers[:, 0],
    cluster_centers[:, 1],
    s=300, c='red', marker='X'
)
ax.set_title('Customer Segmentation by Financial Features')
ax.set_xlabel('Annual Salary')
ax.set_ylabel('Net Worth')
fig.savefig('../reports/figures/customer_segmentation.png', dpi=300)
plt.close(fig)



👥 Customer Segmentation...


In [143]:
# SHAP values for best model
print("\n SHAP Analysis for Best Model...")
# Hard-coded choice: XGBoost had the top R² in the recorded results of this notebook.
best_model = models['XGBoost']  # Based on previous results
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('../reports/figures', exist_ok=True)

plt.figure(figsize=(12, 6))
# show=False leaves the figure open so it can be titled and saved below.
shap.summary_plot(shap_values, X_test, plot_type='bar', show=False)
plt.title('Feature Importance (SHAP Values)')
plt.tight_layout()
plt.savefig('../reports/figures/shap_feature_importance.png', dpi=300)
plt.close()


 SHAP Analysis for Best Model...


  shap.summary_plot(shap_values, X_test, plot_type='bar', show=False)


In [144]:

# [Geographic Analysis]
print("\n Geographic Purchase Patterns...")
# NOTE(review): this fits a *new* StandardScaler on original_df instead of
# reusing the scaling the KMeans model was fitted with, so the inputs and the
# centroids are standardized with different statistics — confirm this is
# intended. Whether the raw columns share units with the processed training
# columns cannot be told from this notebook — TODO confirm.
# The 'Cluster' column is stored on original_df but is not used by the chart below.
original_df['Cluster'] = kmeans.predict(
    StandardScaler().fit_transform(original_df[['annual Salary', 'net worth']])
)

# savefig does not create missing directories, so make sure the target exists.
os.makedirs('../reports/figures', exist_ok=True)

fig, ax = plt.subplots(figsize=(16, 8))
sns.barplot(
    x='country',
    y='car purchase amount',
    data=original_df,
    estimator=np.median,
    errorbar=None,
    hue='country',
    legend=False,
    ax=ax
)
ax.tick_params(axis='x', labelrotation=90)
ax.set_title('Median Car Purchase Amount by Country')
fig.tight_layout()
fig.savefig('../reports/figures/geo_purchase_patterns.png', dpi=300)
plt.close(fig)



 Geographic Purchase Patterns...


In [145]:
# Save Best Model
print("\n Saving Best Model...")
# joblib.dump writes to the given path as-is; create the folder first so a
# fresh checkout without a models/ directory does not fail here.
os.makedirs('../models', exist_ok=True)
joblib.dump(best_model, '../models/best_xgboost_model.pkl')
print(" Model saved to models/best_xgboost_model.pkl")





 Saving Best Model...
 Model saved to models/best_xgboost_model.pkl


In [146]:
# %% [Final Report]
# Closing summary of the artifact locations reported by this notebook.
print(
    "\n Training Complete!",
    " Generated Files:",
    " - Model performance charts: reports/figures/",
    " - Saved model: models/best_xgboost_model.pkl",
    sep="\n",
)


 Training Complete!
 Generated Files:
 - Model performance charts: reports/figures/
 - Saved model: models/best_xgboost_model.pkl
