In [2]:
# === Student Performance: All Features -> Predict G3 (CV Comparison) ===
import pandas as pd
import numpy as np

from sklearn.model_selection import KFold, cross_validate
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import make_scorer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

from sklearn.linear_model import LinearRegression, Ridge
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor

# XGBoost is an optional extra: record whether it could be imported so the
# rest of the script can add or skip the XGBoost model accordingly.
try:
    from xgboost import XGBRegressor
except Exception:
    # Any failure while importing simply disables the XGBoost model.
    HAS_XGB = False
else:
    HAS_XGB = True

# -----------------------------
# 1) Load data  (Portuguese set)
# -----------------------------
# If you're using the merged/cleaned file, change the path accordingly.
# The file is parsed as semicolon-separated text.
df = pd.read_csv("student-por.csv", sep=";")

# Target and features
target = "G3"
if target not in df.columns:
    # Fail with an actionable message rather than a bare KeyError on the
    # column name (a wrong separator or file typically ends up here).
    raise KeyError(
        f"Target column {target!r} not found; columns read: {list(df.columns)}. "
        "Check the file path and separator."
    )
y = df[target]
X = df.drop(columns=[target])

# Column types
# Numeric columns are selected positively and every remaining column is
# treated as categorical. Splitting on `object` dtype alone would route any
# other non-numeric column (categorical, boolean, dedicated string dtype)
# to the numeric scaler.
num_cols = X.select_dtypes(include=["number"]).columns.tolist()
cat_cols = X.columns[~X.columns.isin(num_cols)].tolist()

# -----------------------------
# 2) Preprocessing
#    - categorical columns -> one-hot encoded
#    - numeric columns     -> scaled, but not centred
# -----------------------------
# handle_unknown="ignore": do not raise on categories that were not seen
# when the encoder was fitted.
categorical_tf = OneHotEncoder(handle_unknown="ignore")

# with_mean=False skips centring so the scaler also works with a sparse design.
numeric_tf = Pipeline([("scaler", StandardScaler(with_mean=False))])

# Route each column group to its transformer; columns in neither group are
# dropped (remainder="drop").
preprocess = ColumnTransformer(
    [
        ("num", numeric_tf, num_cols),
        ("cat", categorical_tf, cat_cols),
    ],
    remainder="drop",
)

# -----------------------------
# 3) Models to compare
# -----------------------------
# Insertion order is kept: linear baseline first, then the tree-based models.
models = {}
models["Linear Regression"] = LinearRegression()
# The decision tree is grown to full depth (no max_depth is set).
models["Decision Tree"] = DecisionTreeRegressor(random_state=42)
models["Random Forest"] = RandomForestRegressor(
    n_estimators=400,
    n_jobs=-1,
    random_state=42,
)
models["Gradient Boosting"] = GradientBoostingRegressor(random_state=42)

# XGBoost joins the comparison only when its import succeeded.
if HAS_XGB:
    models.update({
        "XGBoost": XGBRegressor(
            n_estimators=500,
            max_depth=6,
            learning_rate=0.05,
            subsample=0.8,
            colsample_bytree=0.8,
            reg_lambda=1.0,
            n_jobs=-1,
            random_state=42,
        ),
    })

# -----------------------------
# 4) Cross-validation setup
# -----------------------------
# Metric label -> built-in scorer name. The error metrics are requested in
# their negated form, so their sign has to be flipped back when reporting.
scoring = dict(
    r2="r2",
    rmse="neg_root_mean_squared_error",
    mae="neg_mean_absolute_error",
)

# Five shuffled folds with a fixed seed, shared by every model.
cv = KFold(random_state=42, shuffle=True, n_splits=5)

# -----------------------------
# 5) Run CV for each model
# -----------------------------
# One summary row per model, holding the fold-averaged value of each metric.
rows = []
for name, model in models.items():
    # Preprocessing is part of the pipeline that is cross-validated.
    pipe = Pipeline([("prep", preprocess), ("model", model)])
    cv_results = cross_validate(
        pipe,
        X,
        y,
        scoring=scoring,
        cv=cv,
        return_train_score=False,
        n_jobs=-1,
    )

    # Average every requested metric over the folds.
    fold_means = {key: cv_results[f"test_{key}"].mean() for key in scoring}

    rows.append({
        "Model": name,
        "R2 CV": fold_means["r2"],
        "RMSE CV": -fold_means["rmse"],  # negated scorer: flip sign back
        "MAE CV": -fold_means["mae"],    # negated scorer: flip sign back
    })

# Rank models from highest to lowest mean R2.
results = pd.DataFrame(rows)
results = results.sort_values("R2 CV", ascending=False)
results = results.reset_index(drop=True)

print("=== Model Performance Comparison (5-fold CV) ===")
print(results.to_string(index=False))


=== Model Performance Comparison (5-fold CV) ===
            Model    R2 CV  RMSE CV   MAE CV
    Random Forest 0.843226 1.267536 0.823688
Linear Regression 0.836264 1.296538 0.843733
          XGBoost 0.834640 1.296593 0.833690
Gradient Boosting 0.831899 1.309955 0.838692
    Decision Tree 0.691374 1.773346 1.018187
