## 1. Cross-validation and model evaluation

In [1]:
import pandas as pd

data = pd.read_csv('data/StudentPerformanceFactors.csv')

X = data.drop("Exam_Score", axis=1)
Y = data['Exam_Score']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75)

num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

from src.linear_regression.models import LinearRegressionClosedForm, LinearRegressionGradientDescent
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR


RF = RandomForestRegressor(random_state=42)
SVR = SVR()
sklearn_LR = LinearRegression()
closed_form_LR = LinearRegressionClosedForm()
gradient_descent_LR = LinearRegressionGradientDescent()

random_forest_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RF)
])

svr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR)
])

sklearn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', sklearn_LR)
])

closed_form_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', closed_form_LR)
])

gradient_descent_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', gradient_descent_LR)
])

#### Cross-validating models

In [2]:
from sklearn.model_selection import cross_validate, KFold

kfold = KFold(n_splits=3, shuffle=True, random_state=42)

RF_cross_val = cross_validate(random_forest_pipeline, X_train, Y_train, cv=kfold,
                              scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'),
                              return_train_score=True)

SVR_cross_val = cross_validate(svr_pipeline, X_train, Y_train, cv=kfold,
                              scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'),
                              return_train_score=True)

sklearn_LR_cross_val = cross_validate(sklearn_pipeline, X_train, Y_train, cv=kfold,
                              scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'),
                              return_train_score=True)

closed_form_LR_cross_val = cross_validate(closed_form_pipeline, X_train, Y_train, cv=kfold,
                              scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'),
                              return_train_score=True)

gradient_descent_LR_cross_val = cross_validate(gradient_descent_pipeline, X_train, Y_train, cv=kfold,
                              scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'),
                              return_train_score=True)

#### Results:

In [3]:
import numpy as np

model_names = ['Random Forest', 'Support Vector Regression', 'Linear Regression (Sklearn)', 'Linear Regression Closed Form', 'Linear Regression Gradient Descent']
cross_vals = [RF_cross_val, SVR_cross_val, sklearn_LR_cross_val, closed_form_LR_cross_val, gradient_descent_LR_cross_val]
metrics = ['R² (test)', 'R² (train)', 'R² difference', 'MSE (test)', 'MAE (test)']

results = {
    name: [
        np.mean(cross_val['test_r2']),
        np.mean(cross_val['train_r2']),
        np.mean(cross_val['train_r2']) - np.mean(cross_val['test_r2']),
        -np.mean(cross_val['test_neg_mean_squared_error']),
        -np.mean(cross_val['test_neg_mean_absolute_error'])
    ]
    for (name, cross_val) in zip(model_names, cross_vals) 
}

df_cross_val = pd.DataFrame(results, index=metrics).T.round(6)

display(df_cross_val)

Unnamed: 0,R² (test),R² (train),R² difference,MSE (test),MAE (test)
Random Forest,0.632661,0.948535,0.315874,5.484364,1.16817
Support Vector Regression,0.729829,0.741449,0.01162,4.050171,0.550935
Linear Regression (Sklearn),0.742899,0.74566,0.00276,3.855203,0.482376
Linear Regression Closed Form,0.742899,0.74566,0.00276,3.855203,0.482376
Linear Regression Gradient Descent,0.74242,0.745396,0.002976,3.862634,0.478878


#### Fitting models

In [4]:
random_forest_pipeline.fit(X_train, Y_train)
svr_pipeline.fit(X_train, Y_train)
sklearn_pipeline.fit(X_train, Y_train)
closed_form_pipeline.fit(X_train, Y_train)
gradient_descent_pipeline.fit(X_train, Y_train)

y_pred_rf = random_forest_pipeline.predict(X_test)
y_pred_svr = svr_pipeline.predict(X_test)
y_pred_skl = sklearn_pipeline.predict(X_test)
y_pred_cf = closed_form_pipeline.predict(X_test)
y_pred_gd = gradient_descent_pipeline.predict(X_test)

#### Evaluating:

In [5]:
from src.linear_regression.model_evaluation import metrics_table

df_metrics = metrics_table(Y_test,
                           [y_pred_rf, y_pred_svr, y_pred_skl, y_pred_cf, y_pred_gd],
                           ['Random forest',
                            'Support vector regression',
                            'Linear Regression (Sklearn)',
                            'Linear Regression Closed Form',
                            'Linear Regression Gradient Descent'])

display(df_metrics)

Unnamed: 0,R²,MSE,RMSE,MAE
Random forest,0.606992,6.299151,2.509811,1.158602
Support vector regression,0.674633,5.214994,2.283636,0.572675
Linear Regression (Sklearn),0.678624,5.15103,2.269588,0.524922
Linear Regression Closed Form,0.678624,5.15103,2.269588,0.524922
Linear Regression Gradient Descent,0.677995,5.161105,2.271807,0.50803
