In [16]:
import numpy as np
import pandas as pd
import math
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from catboost import CatBoostClassifier, CatBoostRegressor, Pool
from xgboost import XGBClassifier, XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [2]:
df=pd.read_csv('data/stud.csv')

In [3]:
df.head()

Unnamed: 0,gender,race_ethnicity,parental_level_of_education,lunch,test_preparation_course,math_score,reading_score,writing_score
0,female,group B,bachelor's degree,standard,none,72,72,74
1,female,group C,some college,standard,completed,69,90,88
2,female,group B,master's degree,standard,none,90,95,93
3,male,group A,associate's degree,free/reduced,none,47,57,44
4,male,group C,some college,standard,none,76,78,75


In [5]:
X=df.drop(columns=['math_score'],axis=1)
y=df['math_score']

In [6]:
numeric_features=[feature for feature in X.columns if X[feature].dtype != 'O']
categorical_features=[feature for feature in X.columns if X[feature].dtype == 'O']

In [11]:
numeric_transformer = StandardScaler()
oh_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    [
        ("OneHotEncoder", oh_transformer, categorical_features),
        ("StandardScaler", numeric_transformer, numeric_features),
    ]
)

In [12]:
X=preprocessor.fit_transform(X)

In [13]:
X

array([[ 1.        ,  0.        ,  0.        , ...,  1.        ,
         0.19399858,  0.39149181],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         1.42747598,  1.31326868],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.77010859,  1.64247471],
       ...,
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.12547206, -0.20107904],
       [ 1.        ,  0.        ,  0.        , ...,  0.        ,
         0.60515772,  0.58901542],
       [ 1.        ,  0.        ,  0.        , ...,  1.        ,
         1.15336989,  1.18158627]], shape=(1000, 19))

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X,y ,random_state=42, test_size=0.2)

In [17]:
def evaluate_model(y_true, y_pred):
    mae = mean_absolute_error(y_true, y_pred)
    mse = mean_squared_error(y_true, y_pred)
    rmse = np.sqrt(mse)
    r2_square = r2_score(y_true, y_pred)

    return {
        "MAE": mae,
        "MSE": mse,
        "RMSE": rmse,
        "r2_square": r2_square
    }

In [19]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor

# Gradient boosting libraries
from xgboost import XGBRegressor
from catboost import CatBoostRegressor

# (Optional) metrics for evaluation
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

In [20]:
models = {
    "Linear Regression": LinearRegression(),
    "Lasso": Lasso(max_iter=10000),
    "Ridge": Ridge(),
    "KNN Regressor": KNeighborsRegressor(n_neighbors=5),
    "Decision Tree": DecisionTreeRegressor(random_state=42),
    "Random Forest": RandomForestRegressor(n_estimators=300, random_state=42, n_jobs=-1),
    "XGBoost Regressor": XGBRegressor(
        n_estimators=300, learning_rate=0.1, max_depth=6,
        subsample=0.8, colsample_bytree=0.8, n_jobs=-1, random_state=42,
        tree_method="hist"  # good default on modern CPUs
    ),
    "CatBoost Regressor": CatBoostRegressor(
        iterations=300, depth=6, learning_rate=0.1,
        loss_function="RMSE", random_seed=42, verbose=0
    ),
    "AdaBoost Regressor": AdaBoostRegressor(
        n_estimators=300, learning_rate=0.05, random_state=42
    ),
}

# ---- train & evaluate ----
results = []
fitted_models = {}

for name, model in models.items():
    model.fit(X_train, y_train)                 # fit on TRAIN
    y_pred = model.predict(X_test)              # evaluate on TEST

    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    rmse = np.sqrt(mse)
    r2 = r2_score(y_test, y_pred)

    print(f"\n{name}")
    print(f"  MAE : {mae:.4f}")
    print(f"  MSE : {mse:.4f}")
    print(f"  RMSE: {rmse:.4f}")
    print(f"  R²  : {r2:.4f}")

    results.append({"model": name, "MAE": mae, "MSE": mse, "RMSE": rmse, "R2": r2})
    fitted_models[name] = model

# ---- summary table (sorted by RMSE asc) ----
summary = pd.DataFrame(results).sort_values("RMSE").reset_index(drop=True)
print("\n=== Summary (sorted by RMSE) ===")
print(summary)


Linear Regression
  MAE : 4.2148
  MSE : 29.0952
  RMSE: 5.3940
  R²  : 0.8804

Lasso
  MAE : 5.1579
  MSE : 42.5064
  RMSE: 6.5197
  R²  : 0.8253

Ridge
  MAE : 4.2111
  MSE : 29.0563
  RMSE: 5.3904
  R²  : 0.8806

KNN Regressor
  MAE : 5.6280
  MSE : 52.6388
  RMSE: 7.2553
  R²  : 0.7837

Decision Tree
  MAE : 6.1950
  MSE : 59.5150
  RMSE: 7.7146
  R²  : 0.7554

Random Forest
  MAE : 4.6293
  MSE : 35.9224
  RMSE: 5.9935
  R²  : 0.8524

XGBoost Regressor
  MAE : 4.9947
  MSE : 41.6802
  RMSE: 6.4560
  R²  : 0.8287

CatBoost Regressor
  MAE : 4.5546
  MSE : 35.2279
  RMSE: 5.9353
  R²  : 0.8552

AdaBoost Regressor
  MAE : 4.8359
  MSE : 39.0878
  RMSE: 6.2520
  R²  : 0.8394

=== Summary (sorted by RMSE) ===
                model       MAE        MSE      RMSE        R2
0               Ridge  4.211101  29.056272  5.390387  0.880593
1   Linear Regression  4.214763  29.095170  5.393994  0.880433
2  CatBoost Regressor  4.554586  35.227912  5.935311  0.855231
3       Random Forest  4.629