# 02_Modeling — Model development, evaluation, and screenshots

This notebook trains multiple models, evaluates them on the test set, saves pipeline artifacts, and generates the evaluation figures (screenshots) required by the project specification.

In [None]:
from pathlib import Path

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import RidgeCV
from sklearn.metrics import mean_absolute_error, mean_squared_error

from src.features import build_preprocessing
from src.models.linear_models import linear_regression, ridge_regression, lasso_regression
from src.models.tree_models import decision_tree, random_forest
from src.models.boosting_models import xgboost_model
from src.models.advanced_models import svr_model, mlp_model
from src.evaluate import plot_pred_vs_true, plot_residuals_hist, plot_feature_importance

# --- Configuration: filesystem layout and dataset schema -------------------
DATA_DIR = Path('data')            # train.csv / test.csv are read from here
ARTIFACTS = Path('artifacts')      # fitted pipelines are dumped here
FIG_DIR = Path('reports/figures')  # evaluation figures are saved here

# Model inputs and regression target, declared once so the train and test
# selections below cannot drift apart.
FEATURE_COLS = ['trip_duration_days', 'miles_traveled', 'total_receipts_amount']
TARGET_COL = 'reimbursement_amount'

# Create the output directories up front so later cells can write freely.
for out_dir in (FIG_DIR, ARTIFACTS):
    out_dir.mkdir(parents=True, exist_ok=True)

# --- Load the pre-split data ------------------------------------------------
train = pd.read_csv(DATA_DIR / 'train.csv')
test = pd.read_csv(DATA_DIR / 'test.csv')

# Fail early, naming the offending split and columns, instead of surfacing a
# bare pandas KeyError from the column selection below.
for split_name, frame in (('train', train), ('test', test)):
    missing = [col for col in FEATURE_COLS + [TARGET_COL] if col not in frame.columns]
    if missing:
        raise KeyError(f"{split_name}.csv is missing required columns: {missing}")

X_train = train[FEATURE_COLS]
y_train = train[TARGET_COL]
X_test = test[FEATURE_COLS]
y_test = test[TARGET_COL]


## 1. Preprocessing

In [None]:
# Value passed as `poly_degree` to the project's preprocessing builder; kept
# as a named constant so it is easy to find and tune.
POLY_DEGREE = 2

# Fit the preprocessing on the training split only, then apply the already
# fitted transformer to the test split.
preproc = build_preprocessing(poly_degree=POLY_DEGREE)
X_train_t = preproc.fit_transform(X_train)
X_test_t = preproc.transform(X_test)

# Sanity checks: the transform must keep rows aligned with the targets and
# must produce the same number of features for both splits.
assert X_train_t.shape[0] == len(y_train), "preprocessing changed the number of training rows"
assert X_test_t.shape[0] == len(y_test), "preprocessing changed the number of test rows"
assert X_train_t.shape[1] == X_test_t.shape[1], "train/test feature counts differ"

print('Preprocessing complete. Transformed feature shape:', X_train_t.shape)

## 2. Train models (examples)
At least four model families are trained as required by the specification.

In [None]:
# Candidate regressors, keyed by the short name that is reused for the saved
# artifact file (`<name>_pipeline.joblib`) and in the printed metrics.
models = {
    'linear': linear_regression(),
    'ridge': ridge_regression(alpha=1.0),
    'rf': random_forest(n_estimators=100),
    'xgb': xgboost_model(n_estimators=100),
    'svr': svr_model(C=1.0),
    'mlp': mlp_model(hidden_layer_sizes=(50,50), max_iter=500)
}

# name -> {'mae': ..., 'rmse': ...}, both measured on the held-out test split.
results = {}
for name, model in models.items():
    print(f"Training {name}...")
    model.fit(X_train_t, y_train)
    preds = model.predict(X_test_t)
    mae = mean_absolute_error(y_test, preds)
    # RMSE is computed as sqrt(MSE): the `squared=False` keyword of
    # mean_squared_error was deprecated in scikit-learn 1.4 and removed in
    # 1.6, whereas this form works on every version.
    rmse = np.sqrt(mean_squared_error(y_test, preds))
    results[name] = {'mae': mae, 'rmse': rmse}
    # Save pipeline artifact (preproc + model) so later cells can reload it.
    joblib.dump({'preproc': preproc, 'model': model}, ARTIFACTS / f"{name}_pipeline.joblib")
    print(f"{name}: MAE={mae:.4f}, RMSE={rmse:.4f}")

# Show the metrics as a table, best MAE first; `results` itself stays a dict.
pd.DataFrame.from_dict(results, orient='index').sort_values('mae')

## 3. Generate evaluation figures (screenshots)
For each saved pipeline we create a predicted-vs-true scatter plot, a residual histogram, and a feature-importance chart (or a placeholder image when the model exposes neither `feature_importances_` nor `coef_`).

In [None]:
# Suffix shared by every artifact file name written by the training cell.
PIPELINE_SUFFIX = "_pipeline"


def _feature_names_for(fitted_preproc, base_features, n_features):
    """Return labels for the transformed feature columns.

    Uses the names reported by the preprocessing step called 'poly' when that
    step exists and its names line up with the transformed matrix; otherwise
    falls back to generic labels ``f0 .. f{n_features - 1}``.

    Args:
        fitted_preproc: fitted preprocessing object loaded from an artifact.
        base_features: raw input column names, in the order they were fed in.
        n_features: number of columns in the transformed feature matrix.

    Returns:
        A sequence of ``n_features`` feature labels.
    """
    generic = [f"f{i}" for i in range(n_features)]
    try:
        poly = fitted_preproc.named_steps.get('poly')
        names = None if poly is None else poly.get_feature_names_out(base_features)
    except Exception as exc:
        # Deliberately best-effort: a labelling problem must not stop figure
        # generation, but say why the generic names were used.
        print(f"  could not derive feature names ({exc!r}); using generic names")
        names = None
    # Guard against labels that do not match the transformed column count
    # (e.g. a later preprocessing step adds or drops columns).
    if names is None or len(names) != n_features:
        return generic
    return names


# Sorted so figures are produced in a stable order. The glob picks up every
# matching artifact in the directory, including ones left by earlier runs.
for pipeline_file in sorted(ARTIFACTS.glob(f"*{PIPELINE_SUFFIX}.joblib")):
    # Strip only the trailing suffix; str.replace would also remove any
    # "_pipeline" occurring inside the model name itself.
    name = pipeline_file.stem[:-len(PIPELINE_SUFFIX)]
    print('Generating figures for', name)

    # NOTE: joblib.load unpickles arbitrary objects; only load artifacts that
    # this project wrote itself.
    saved = joblib.load(pipeline_file)
    # Loop-local names so the notebook-level `preproc`, `model` and `preds`
    # from earlier cells are not silently overwritten.
    saved_preproc = saved['preproc']
    saved_model = saved['model']

    X_t = saved_preproc.transform(X_test)
    saved_preds = saved_model.predict(X_t)

    # Pred vs True
    plot_pred_vs_true(y_test, saved_preds, name)
    # Residuals
    plot_residuals_hist(y_test, saved_preds, name)

    # Feature labels, derived from the raw columns actually fed to the pipeline.
    feat_names = _feature_names_for(saved_preproc, list(X_test.columns), X_t.shape[1])

    # Feature importance or coefficients, whichever the model exposes.
    if hasattr(saved_model, 'feature_importances_'):
        plot_feature_importance(saved_model.feature_importances_, feat_names, name)
    elif hasattr(saved_model, 'coef_'):
        plot_feature_importance(saved_model.coef_.ravel(), feat_names, name)
    else:
        # Placeholder image so every model still gets a feature-importance file.
        fig, ax = plt.subplots(figsize=(6, 2))
        ax.text(0.5, 0.5, 'No feature importance available', ha='center', va='center')
        ax.axis('off')
        fig.tight_layout()
        fig.savefig(FIG_DIR / f"{name}_feature_importance.png", dpi=150)
        plt.close(fig)
print('All figures generated in reports/figures/')

## 4. Ensemble (optional)
Example stacking ensemble using RandomForest and XGBoost (or fallback) as base learners, with `RidgeCV` as the final estimator.

In [None]:
# StackingRegressor and RidgeCV come from the notebook's top import cell.

# Base learners for the stack, built with the same factory calls and settings
# as the individually trained 'rf' and 'xgb' models.
estimators = [
    ('rf', random_forest(n_estimators=100)),
    ('xgb', xgboost_model(n_estimators=100))
]
stack = StackingRegressor(estimators=estimators, final_estimator=RidgeCV())
stack.fit(X_train_t, y_train)
preds = stack.predict(X_test_t)

# Report the same two test-set metrics as the single models. RMSE is computed
# as sqrt(MSE) because the `squared=False` keyword of mean_squared_error was
# removed in scikit-learn 1.6.
ensemble_metrics = {
    'mae': mean_absolute_error(y_test, preds),
    'rmse': np.sqrt(mean_squared_error(y_test, preds)),
}
print('Ensemble MAE =', ensemble_metrics['mae'])
print('Ensemble RMSE =', ensemble_metrics['rmse'])

# Persist the ensemble in the same {'preproc', 'model'} layout as the others.
joblib.dump({'preproc': preproc, 'model': stack}, ARTIFACTS / 'ensemble_pipeline.joblib')
plot_pred_vs_true(y_test, preds, 'ensemble')
plot_residuals_hist(y_test, preds, 'ensemble')


## 5. Next steps
- Hyperparameter tuning (GridSearchCV / RandomizedSearchCV)
- Interpretability (SHAP/LIME) and business-rule extraction
- Productionize the best pipeline and validate performance under time constraints