In [1]:
# Auto-prepare data and ensure project root on sys.path (robust)
import sys
import subprocess
from pathlib import Path

def find_project_root(start: Path = None) -> Path:
    """Return the nearest ancestor of ``start`` that contains a ``src`` directory.

    Args:
        start: Directory to begin the search from. Defaults to the current
            working directory.

    Returns:
        The first directory, walking from the resolved ``start`` up to the
        filesystem root, that has a ``src`` subdirectory. When no such
        directory exists, ``start`` is returned as given (not resolved).
    """
    if start is None:
        start = Path.cwd()
    resolved = start.resolve()
    # Walk the directory and its ancestors via ``parents`` rather than comparing
    # against ``Path(resolved.root)``: on Windows ``root`` omits the drive
    # ("\\" vs "C:\\"), so that comparison never matched and, because the parent
    # of a drive root is itself, the upward walk never terminated.
    for candidate in (resolved, *resolved.parents):
        if (candidate / 'src').is_dir():
            return candidate
    return start

# Resolve the project root once and put it at the front of sys.path so that
# `src.*` imports work regardless of where the kernel was started.
PROJECT_ROOT = find_project_root()
if not any(entry == str(PROJECT_ROOT) for entry in sys.path):
    sys.path.insert(0, str(PROJECT_ROOT))

# Locations of the prepared train/test splits.
DATA_DIR = PROJECT_ROOT.joinpath('data')
train_path = DATA_DIR.joinpath('train.csv')
test_path = DATA_DIR.joinpath('test.csv')

def prepare_data_via_import(json_path: Path):
    """Build the datasets in-process through ``src.data_loader.prepare_datasets``.

    Args:
        json_path: Path to the source JSON file; handed to the loader as a string.

    Returns:
        ``(True, None)`` when the loader ran without raising, otherwise
        ``(False, exc)`` with the exception that was caught (including a
        failure to import ``src.data_loader`` itself).
    """
    from importlib import import_module
    try:
        loader = import_module('src.data_loader')
        loader.prepare_datasets(str(json_path))
    except Exception as exc:
        # Best-effort: report the failure to the caller instead of raising so
        # that an alternative preparation route can be attempted.
        return False, exc
    else:
        return True, None

def prepare_data_via_subprocess(json_path: Path, cwd: Path = None):
    """Build the datasets by running ``python -m src.data_loader <json_path>``.

    Args:
        json_path: Path to the source JSON file passed to the loader module.
        cwd: Directory the child interpreter is started in. Defaults to the
            notebook's ``PROJECT_ROOT``.

    Returns:
        ``(True, None)`` when the command exits with status 0, otherwise
        ``(False, error)`` where ``error`` is the exception that was raised.
    """
    if cwd is None:
        # ``-m src.data_loader`` only resolves when the child starts in the
        # directory that contains ``src``. The kernel's working directory may be
        # a subfolder (which is why the root is searched for upwards), so run
        # the child from the detected project root instead of inheriting cwd.
        cwd = PROJECT_ROOT
    cmd = [sys.executable, '-m', 'src.data_loader', str(json_path)]
    try:
        subprocess.check_call(cmd, cwd=str(cwd))
        return True, None
    except (subprocess.CalledProcessError, OSError) as e:
        # CalledProcessError: the loader ran and exited non-zero.
        # OSError: the child could not be started at all (e.g. missing cwd).
        # Both are reported through the same (ok, err) contract.
        return False, e

# First-run bootstrap: build data/train.csv and data/test.csv from
# data/public_cases.json when either CSV is missing.
if not train_path.exists() or not test_path.exists():
    print('data/train.csv or data/test.csv not found. Attempting to create them...')
    json_path = DATA_DIR / 'public_cases.json'
    if not json_path.exists():
        raise FileNotFoundError(f'Expected JSON not found at {json_path}. Please place public_cases.json in data/')
    # Try the in-process loader first; fall back to a child interpreter.
    ok, err = prepare_data_via_import(json_path)
    if not ok:
        print('Import approach failed, trying subprocess approach...')
        ok2, err2 = prepare_data_via_subprocess(json_path)
        if not ok2:
            # Chain the last failure so its traceback is not lost.
            raise FileNotFoundError(
                'Could not prepare data/train.csv and data/test.csv automatically. '
                f'Import error: {err}; Subprocess error: {err2}.\nRun `python -m src.data_loader data/public_cases.json` from the project root.'
            ) from err2
    # A loader that returns cleanly is not proof that the CSVs landed where this
    # notebook reads them from; check before announcing success so the failure
    # is reported here rather than as an opaque read error further down.
    missing_outputs = [str(p) for p in (train_path, test_path) if not p.exists()]
    if missing_outputs:
        raise FileNotFoundError(
            'Data preparation finished but these files are still missing: '
            + ', '.join(missing_outputs)
        )
    print('Data preparation completed.')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import joblib

# Output locations for saved pipelines and generated figures; created up front
# so later cells can write to them unconditionally.
ARTIFACTS = PROJECT_ROOT.joinpath('artifacts')
FIG_DIR = PROJECT_ROOT.joinpath('reports', 'figures')
FIG_DIR.mkdir(parents=True, exist_ok=True)
ARTIFACTS.mkdir(parents=True, exist_ok=True)

# Read both splits, then separate the three input columns from the target.
train, test = pd.read_csv(train_path), pd.read_csv(test_path)
X_train, y_train = (
    train[['trip_duration_days','miles_traveled','total_receipts_amount']],
    train['reimbursement_amount'],
)
X_test, y_test = (
    test[['trip_duration_days','miles_traveled','total_receipts_amount']],
    test['reimbursement_amount'],
)

print('Data loaded. Train shape:', X_train.shape, 'Test shape:', X_test.shape)

Data loaded. Train shape: (750, 3) Test shape: (250, 3)


# 02_Modeling — Model development, evaluation, and screenshots

This notebook trains multiple models, evaluates them on the test set, saves pipeline artifacts, and generates the evaluation figures (screenshots) required by the project specification.

In [2]:
from src.features import build_preprocessing
from src.models.linear_models import linear_regression, ridge_regression, lasso_regression
from src.models.tree_models import decision_tree, random_forest
from src.models.boosting_models import xgboost_model
from src.models.advanced_models import svr_model, mlp_model
from src.evaluate import plot_pred_vs_true, plot_residuals_hist, plot_feature_importance
from sklearn.metrics import mean_absolute_error, mean_squared_error
import numpy as np

# Build the preprocessing object from src.features (poly_degree=2 is passed
# through to the builder).
preproc = build_preprocessing(poly_degree=2)
# Fit on the training features only, then apply that same fitted transform to
# the test features so nothing is learned from the test split.
X_train_t = preproc.fit_transform(X_train)
X_test_t = preproc.transform(X_test)
# Report the shape of the transformed training matrix.
print('Preprocessing complete. Transformed feature shape:', X_train_t.shape)

Preprocessing complete. Transformed feature shape: (750, 20)


In [3]:
# Candidate regressors, keyed by the short name used in the log lines, the
# results table and the artifact file names.
# NOTE(review): no seed is passed to the factories here; whether they fix one
# internally is not visible from this notebook — confirm for reproducibility.
models = dict(
    linear=linear_regression(),
    ridge=ridge_regression(alpha=1.0),
    rf=random_forest(n_estimators=100),
    xgb=xgboost_model(n_estimators=100),
    svr=svr_model(C=1.0),
    mlp=mlp_model(hidden_layer_sizes=(50,50), max_iter=500),
)

# Fit each model on the transformed training data, score it on the transformed
# test data, and persist it together with the fitted preprocessing object.
# A failure in any step is recorded for that model and the loop moves on.
results = {}
for name, model in models.items():
    print(f'Training {name}...')
    try:
        model.fit(X_train_t, y_train)
        preds = model.predict(X_test_t)
        mae = mean_absolute_error(y_test, preds)
        mse = mean_squared_error(y_test, preds)
        rmse = float(np.sqrt(mse))
        results[name] = dict(mae=mae, rmse=rmse)
        artifact_path = ARTIFACTS/f"{name}_pipeline.joblib"
        joblib.dump(dict(preproc=preproc, model=model), artifact_path)
        print(f"{name}: MAE={mae:.4f}, RMSE={rmse:.4f}")
    except Exception as exc:
        # Best-effort: one failing model must not abort the comparison.
        print(f'Model {name} failed: {exc}')
        results[name] = dict(error=str(exc))

results

Training linear...
linear: MAE=117.8921, RMSE=170.5083
Training ridge...
ridge: MAE=118.0830, RMSE=169.9742
Training rf...


rf: MAE=80.3960, RMSE=138.5883
Training xgb...
xgb: MAE=87.8625, RMSE=147.4448
Training svr...


svr: MAE=307.3183, RMSE=406.8359
Training mlp...


mlp: MAE=100.2183, RMSE=154.7588




{'linear': {'mae': 117.89206460704962, 'rmse': 170.50833741254093},
 'ridge': {'mae': 118.0830156473711, 'rmse': 169.97423723808},
 'rf': {'mae': 80.3960492, 'rmse': 138.58827349731325},
 'xgb': {'mae': 87.86254806396485, 'rmse': 147.44480892379417},
 'svr': {'mae': 307.3183143146941, 'rmse': 406.8358724488405},
 'mlp': {'mae': 100.2182808513316, 'rmse': 154.75876065554516}}

In [4]:
# Regenerate the evaluation figures from every pipeline saved in ARTIFACTS.
base_features = ['trip_duration_days','miles_traveled','total_receipts_amount']
pipeline_suffix = '_pipeline'
# sorted(): Path.glob yields entries in a filesystem-dependent order, which
# would make the log and the figure generation order vary between machines.
for pipeline_file in sorted(ARTIFACTS.glob(f"*{pipeline_suffix}.joblib")):
    # Strip only the trailing suffix; str.replace would also remove a
    # "_pipeline" that happens to occur inside the model name itself.
    name = pipeline_file.stem[:-len(pipeline_suffix)]
    print('Generating figures for', name)
    # NOTE: joblib.load unpickles arbitrary objects; only load artifacts that
    # were written by this project.
    data = joblib.load(pipeline_file)
    preproc = data['preproc']
    model = data['model']
    X_t = preproc.transform(X_test)
    preds = model.predict(X_t)
    plot_pred_vs_true(y_test, preds, name)
    plot_residuals_hist(y_test, preds, name)

    # Feature labels: prefer the names reported by the 'poly' step, otherwise
    # fall back to positional placeholders.
    generic_names = [f'f{i}' for i in range(X_t.shape[1])]
    try:
        poly = preproc.named_steps.get('poly')
        feat_names = poly.get_feature_names_out(base_features)
    except Exception:
        # Best-effort: the step may be absent or reject these input names.
        feat_names = generic_names
    # The model consumes the columns of X_t, so the labels must line up with
    # them one-to-one; names taken from a single pipeline step may not.
    if len(feat_names) != X_t.shape[1]:
        feat_names = generic_names

    if hasattr(model, 'feature_importances_'):
        plot_feature_importance(model.feature_importances_, feat_names, name)
    elif hasattr(model, 'coef_'):
        coef = model.coef_.ravel()
        plot_feature_importance(coef, feat_names, name)
    else:
        # No importances or coefficients on this model: write a placeholder
        # image under the same file name (plt is imported in the first cell).
        plt.figure(figsize=(6,2))
        plt.text(0.5, 0.5, 'No feature importance available', ha='center', va='center')
        plt.axis('off')
        plt.tight_layout()
        plt.savefig(FIG_DIR / f"{name}_feature_importance.png", dpi=150)
        plt.close()
print('All figures generated in reports/figures/')

Generating figures for linear


Generating figures for mlp


Generating figures for rf


Generating figures for ridge


Generating figures for svr


Generating figures for xgb


All figures generated in reports/figures/
