In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

# Load the preprocessed student-spending table from disk.
df = pd.read_csv("data/preprocessed_student_spending.csv")

# ---------------------
# SELECT FEATURES
# ---------------------

# Columns handled as categorical (one-hot encoded by the preprocessing step).
categorical = [
    "gender",
    "year_in_school",
    "major",
    "preferred_payment_method",
]

# Columns handled as numerical (passed through to the model unchanged).
numerical = [
    "age",
    "monthly_income",
    "financial_aid",
    "tuition",
    "housing",
    "food",
    "transportation",
    "books_supplies",
    "entertainment",
    "personal_care",
    "technology",
    "health_wellness",
    "miscellaneous",
]

# Feature matrix: categorical columns first, then numerical ones.
X = df[[*categorical, *numerical]]

# Regression target.
# NOTE(review): if "overspending" was derived from the income/expense columns
# used as features above, the reported scores will be optimistic -- confirm
# how the preprocessing step computes it.
y = df["overspending"]

# ---------------------
# PREPROCESSING
# ---------------------

# One-hot encode the categorical columns (categories not seen during fit are
# ignored at transform time) and forward the numerical columns untouched.
preprocess_reg = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), categorical),
        ("num", "passthrough", numerical),
    ]
)

# ---------------------
# MODEL
# ---------------------

# Preprocessing and the random-forest regressor are chained in one pipeline so
# the encoder is fitted only on whatever data is passed to `fit`.
overspending_model = Pipeline(
    steps=[
        ("prep", preprocess_reg),
        (
            "rf",
            RandomForestRegressor(
                n_estimators=935,
                max_depth=18,
                max_features="sqrt",
                min_samples_split=6,
                min_samples_leaf=2,
                random_state=42,  # fixed seed for reproducible forests
            ),
        ),
    ]
)

# ---------------------
# TRAIN/TEST SPLIT
# ---------------------

# Hold out 20% of the rows for evaluation; the seed keeps the split stable
# between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# ---------------------
# TRAIN
# ---------------------

# Fits the preprocessing step and the forest on the training portion only.
overspending_model.fit(X_train, y_train)

# ---------------------
# PREDICT
# ---------------------

# Predictions for the held-out rows, scored in the metrics section.
y_pred = overspending_model.predict(X_test)

# ---------------------
# METRICS
# ---------------------

# Score the held-out predictions against the true targets.
mae = mean_absolute_error(y_test, y_pred)
# RMSE is computed as the square root of the mean squared error.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

# Report each metric rounded to three decimals.
print("\n=== OVERSPENDING REGRESSION MODEL PERFORMANCE ===")
print(f"MAE:  {mae:.3f}")
print(f"RMSE: {rmse:.3f}")
print(f"R²:   {r2:.3f}")




=== OVERSPEEDING REGRESSION MODEL PERFORMANCE ===
MAE:  3.353
RMSE: 28.892
R²:   0.952
