# # Model Training Notebook
# ## Model Development and Evaluation

# %%

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.cluster import KMeans
import shap
import joblib

# Load the processed training data from the project's data directory
processed_data_path = '../data/processed_train.csv'
processed_data = pd.read_csv(processed_data_path)

# %%
# Separate the prediction target from the feature columns
y = processed_data['car purchase amount']
X = processed_data.drop(columns='car purchase amount')

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42, test_size=0.2)

# Initialize models
# Candidate regressors, keyed by the display name used when reporting results.
# Every estimator with a stochastic component is seeded (with the same value
# used for the train/test split) so that metrics are reproducible between
# runs; LinearRegression is deterministic and takes no seed.
RANDOM_STATE = 42
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(max_depth=5, random_state=RANDOM_STATE),
    'Random Forest': RandomForestRegressor(n_estimators=100, random_state=RANDOM_STATE),
    'XGBoost': XGBRegressor(random_state=RANDOM_STATE),
    # NOTE(review): MLPRegressor is sensitive to feature/target scale and uses
    # the default iteration budget here -- confirm the processed data is scaled.
    'Neural Network': MLPRegressor(hidden_layer_sizes=(100, 50), random_state=RANDOM_STATE),
}

# %%
# Fit each candidate on the training split and score it on the held-out split.
# One metrics row (R², MAE, RMSE) is collected per model.
results = []
for name, model in models.items():
    # scikit-learn style estimators return themselves from fit(), so the
    # prediction can be chained directly onto the fitted model.
    y_pred = model.fit(X_train, y_train).predict(X_test)
    mse = mean_squared_error(y_test, y_pred)
    results.append({
        'Model': name,
        'R²': r2_score(y_test, y_pred),
        'MAE': mean_absolute_error(y_test, y_pred),
        'RMSE': np.sqrt(mse),
    })

# Tabulate the per-model metrics, one row per model
results_df = pd.DataFrame(results)

# %%
# Mean 5-fold cross-validated R² for every candidate, computed on the full
# feature matrix and target.
cv_scores = {
    name: cross_val_score(model, X, y, cv=5, scoring='r2').mean()
    for name, model in models.items()
}

# %%
# Customer segmentation
# Group customers into three clusters using salary and net worth only.
# random_state makes the cluster labels reproducible between runs, and n_init
# is pinned explicitly because its default differs between scikit-learn
# releases.
# NOTE(review): k-means is distance-based -- confirm that both columns were
# put on a comparable scale during preprocessing.
segmentation_features = X[['annual Salary', 'net worth']]
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42)
clusters = kmeans.fit_predict(segmentation_features)



# %%
# Build a per-customer table for inspecting the clusters: the two
# segmentation features, the assigned cluster label and the purchase amount.
cluster_analysis = (
    X[['annual Salary', 'net worth']]
    .rename(columns={'annual Salary': 'Annual Salary', 'net worth': 'Net Worth'})
    .assign(Cluster=clusters)
)
cluster_analysis['Purchase Amount'] = y



# %%
# SHAP analysis (for best model)
# NOTE(review): the "best" model is hard-coded to a random forest rather than
# selected from the evaluation results -- confirm this is still the intended
# choice.
# The forest is seeded so that the SHAP values, and the model that is later
# persisted, are reproducible between runs.
best_model = RandomForestRegressor(n_estimators=100, random_state=42)
best_model.fit(X_train, y_train)

# Explain the held-out predictions and plot a per-feature summary
explainer = shap.TreeExplainer(best_model)
shap_values = explainer.shap_values(X_test)
shap.summary_plot(shap_values, X_test)

# %%
# Save best model
from pathlib import Path

# joblib.dump does not create missing directories, so make sure the target
# folder exists before writing the serialized model.
model_path = Path('../models/best_model.pkl')
model_path.parent.mkdir(parents=True, exist_ok=True)
joblib.dump(best_model, model_path)