# Baseline Models

Train Linear Regression, Ridge, RandomForest, and XGBoost baselines on the saved feature matrix and evaluate each on the train and test splits.

In [None]:
# ---- Project path setup (DO NOT REMOVE) ----
import sys
from pathlib import Path

# The project root is the parent of the current working directory.
PROJECT_ROOT = Path().resolve().parents[0]

# Put the root first on sys.path, but only once, so re-running this cell
# does not pile up duplicate entries.
_project_root_str = str(PROJECT_ROOT)
if _project_root_str not in sys.path:
    sys.path.insert(0, _project_root_str)

print("Project root:", PROJECT_ROOT)


In [None]:
import numpy as np
import pandas as pd
import xgboost as xgb

from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor


In [None]:
# Load the train/test arrays stored as .npy files under
# <PROJECT_ROOT>/data/features.
features_dir = PROJECT_ROOT.joinpath("data", "features")

X_train = np.load(features_dir.joinpath("X_train.npy"))
X_test = np.load(features_dir.joinpath("X_test.npy"))
y_train = np.load(features_dir.joinpath("y_train.npy"))
y_test = np.load(features_dir.joinpath("y_test.npy"))

# Display the shapes of both feature matrices as the cell output.
(X_train.shape, X_test.shape)


In [None]:
def _rmse(y_true, y_pred):
    """Return the root mean squared error between *y_true* and *y_pred*.

    The ``squared=False`` flag of ``mean_squared_error`` was deprecated in
    scikit-learn 1.4 and removed in 1.6, so the root is taken explicitly.
    The square root is applied per output column before averaging, which
    reproduces what ``squared=False`` computed.
    """
    per_output_mse = mean_squared_error(
        y_true, y_pred, multioutput="raw_values"
    )
    return np.mean(np.sqrt(per_output_mse))


def evaluate_regression(model, X_train, X_test, y_train, y_test):
    """Score an already-fitted regressor on the train and test splits.

    Args:
        model: Fitted estimator; only its ``predict`` method is used.
        X_train: Features passed to ``model.predict`` for the train split.
        X_test: Features passed to ``model.predict`` for the test split.
        y_train: True targets for the train split.
        y_test: True targets for the test split.

    Returns:
        A dict with the keys ``train_MAE``, ``test_MAE``, ``train_RMSE``,
        ``test_RMSE``, ``train_R2`` and ``test_R2``.
    """
    y_pred_train = model.predict(X_train)
    y_pred_test = model.predict(X_test)

    metrics = {
        "train_MAE": mean_absolute_error(y_train, y_pred_train),
        "test_MAE": mean_absolute_error(y_test, y_pred_test),
        "train_RMSE": _rmse(y_train, y_pred_train),
        "test_RMSE": _rmse(y_test, y_pred_test),
        "train_R2": r2_score(y_train, y_pred_train),
        "test_R2": r2_score(y_test, y_pred_test),
    }
    return metrics


In [None]:
# Plain linear regression baseline: fit on the training split, then score
# both splits with the shared evaluation helper.
linreg = LinearRegression().fit(X_train, y_train)

metrics_lr = evaluate_regression(
    model=linreg,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Show the metrics dict as the cell output.
metrics_lr


In [None]:
# Ridge regression baseline with regularisation strength alpha=1.0.
ridge = Ridge(alpha=1.0).fit(X_train, y_train)

metrics_ridge = evaluate_regression(
    model=ridge,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Show the metrics dict as the cell output.
metrics_ridge


In [None]:
# Random forest baseline: 300 trees, seeded so repeated runs use the same
# random state.
rf = RandomForestRegressor(n_estimators=300, random_state=42).fit(
    X_train, y_train
)

metrics_rf = evaluate_regression(
    model=rf,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Show the metrics dict as the cell output.
metrics_rf


In [None]:
# Gradient-boosted trees baseline with a fixed seed.
xgb_model = xgb.XGBRegressor(
    random_state=42,
    n_estimators=500,
    learning_rate=0.05,
    max_depth=4,
    subsample=0.8,
    colsample_bytree=0.8,
)
xgb_model.fit(X_train, y_train)

metrics_xgb = evaluate_regression(
    model=xgb_model,
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
)

# Show the metrics dict as the cell output.
metrics_xgb


In [None]:
# Gather the per-model metric dicts; each one becomes a row of the summary
# table, with the metric names as columns.
model_metrics = {
    "Linear": metrics_lr,
    "Ridge": metrics_ridge,
    "RandomForest": metrics_rf,
}

# The XGBoost cell may not have been run. Only add its row when the metrics
# exist: a None placeholder would make DataFrame.from_dict(orient="index")
# fail, because it expects every value to be a dict.
if "metrics_xgb" in locals():
    model_metrics["XGBoost"] = metrics_xgb

results = pd.DataFrame.from_dict(model_metrics, orient="index")

# Show the comparison table as the cell output.
results


In [None]:
import matplotlib.pyplot as plt

# Per-feature importances from the fitted random forest, drawn as a bar
# chart with one bar per feature position.
importances = rf.feature_importances_

plt.figure(figsize=(6, 4))
plt.bar(x=range(len(importances)), height=importances)
plt.title("Random Forest feature importance")
plt.xlabel("Feature index")
plt.ylabel("Importance")
plt.show()
