## All models compared to each other

In [84]:
import pandas as pd

data = pd.read_csv('data/StudentPerformanceFactors.csv')

X = data.drop("Exam_Score", axis=1)
Y = data['Exam_Score']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75)

num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

### Models
#### Linear Regression

In [85]:
from sklearn.linear_model import LinearRegression

sklearn_LR_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression())
])

sklearn_LR_pipeline.fit(X_train, Y_train)

y_pred_lr_skl = sklearn_LR_pipeline.predict(X_test)

names = ['Linear Regression (sklearn)']
y_predictions = [y_pred_lr_skl]

#### Linear Regression less features (12)

In [86]:
from sklearn.feature_selection import SelectKBest, f_regression

small_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('selector', SelectKBest(score_func=f_regression, k=12)),
    ('regressor', LinearRegression())
])

small_pipeline.fit(X_train, Y_train)

y_pred_small = small_pipeline.predict(X_test)

names.append('Linear Regression SelectKBest')
y_predictions.append(y_pred_small)

#### Linear Regression Polynomial Features

In [87]:
from sklearn.preprocessing import PolynomialFeatures

poly_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('poly', PolynomialFeatures(degree=2)),
    ('regressor', LinearRegression())
])

poly_pipeline.fit(X_train, Y_train)

y_pred_poly = poly_pipeline.predict(X_test)

names.append('Polynomial Features')
y_predictions.append(y_pred_poly)

#### Lasso

In [88]:
from sklearn.linear_model import Lasso

L1_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Lasso())
])

from sklearn.model_selection import GridSearchCV
import numpy as np

L1_param_grid = {
    'regressor__alpha': np.logspace(-5, -1, 9)
}

l1_grid = GridSearchCV(L1_pipeline, L1_param_grid, cv=5, scoring='r2', n_jobs=-1)
l1_grid.fit(X_train, Y_train)

print("Lasso Best Params:", l1_grid.best_params_)

y_pred_l1 = l1_grid.predict(X_test)

names.append('Lasso')
y_predictions.append(y_pred_l1)

Lasso Best Params: {'regressor__alpha': np.float64(0.0031622776601683794)}


#### Ridge

In [89]:
from sklearn.linear_model import Ridge

L2_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', Ridge())
])

L2_param_grid = {
    'regressor__alpha': np.linspace(1, 10, 10)
}

l2_grid = GridSearchCV(L2_pipeline, L2_param_grid, cv=5, scoring='r2', n_jobs=-1)
l2_grid.fit(X_train, Y_train)

print("Ridge Best Params:", l2_grid.best_params_)

y_pred_l2 = l2_grid.predict(X_test)

names.append('Ridge')
y_predictions.append(y_pred_l2)

Ridge Best Params: {'regressor__alpha': np.float64(10.0)}


#### Linear Regression closed form solution

In [90]:
from src.linear_regression.models import LinearRegressionClosedForm

closed_form_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegressionClosedForm())
])

closed_form_pipeline.fit(X_train, Y_train)

y_pred_lr_cf = closed_form_pipeline.predict(X_test)

names.append('Closed Form Solution')
y_predictions.append(y_pred_lr_cf)

#### Linear Regression closed form solution WITH L2 regularization

In [91]:
cf_LR_L2_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegressionClosedForm(regularization=True, alpha=0.1))
])

cf_LR_L2_pipeline.fit(X_train, Y_train)

y_pred_cf_l1 = cf_LR_L2_pipeline.predict(X_test)

names.append('Closed Form Solution with L2')
y_predictions.append(y_pred_cf_l1)

#### Linear Regression gradient descent

In [92]:
from src.linear_regression.models import LinearRegressionGradientDescent

gradient_descent_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegressionGradientDescent())
])

gradient_descent_pipeline.fit(X_train, Y_train)

y_pred_lr_gd = gradient_descent_pipeline.predict(X_test)

names.append('Gradient Descent')
y_predictions.append(y_pred_lr_gd)

#### Linear Regression gradient descent WITH L1 regularization

In [93]:
gd_l1_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegressionGradientDescent(regularization='l1', alpha=0.005))
])

gd_l1_pipeline.fit(X_train, Y_train)

y_pred_gd_l1 = gd_l1_pipeline.predict(X_test)

names.append('Gradient Descent L1')
y_predictions.append(y_pred_gd_l1)

#### Linear Regression gradient descent WITH L2 regularization

In [94]:
gd_l2_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegressionGradientDescent(regularization='l2', alpha=0.0005))
])

gd_l2_pipeline.fit(X_train, Y_train)

y_pred_gd_l2 = gd_l2_pipeline.predict(X_test)

names.append('Gradient Descent L2')
y_predictions.append(y_pred_gd_l2)

#### Linear Regression gradient descent WITH both L1 and L2

In [95]:
gd_en_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegressionGradientDescent(regularization='elasticnet', alpha=0.001))
])

gd_en_pipeline.fit(X_train, Y_train)

y_pred_gd_en = gd_en_pipeline.predict(X_test)

names.append('Gradient Descent L1 and L2')
y_predictions.append(y_pred_gd_en)

#### Support Vector Regression (grid search)

In [96]:
from sklearn.svm import SVR

SVR_grid_search = SVR()

svr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR_grid_search)
])

svr_param_grid = {
    'regressor__C': [0.1, 1, 5, 10, 20],
    'regressor__epsilon': [0.05, 0.1, 0.5],
    'regressor__gamma': ['scale', 0.01, 0.1]
}

svr_grid = GridSearchCV(svr_pipeline, svr_param_grid, cv=5, scoring='r2', n_jobs=-1)
svr_grid.fit(X_train, Y_train)

y_pred_svr_grid = svr_grid.predict(X_test)

In [97]:
print(svr_grid.best_params_)

names.append('Support Vector Regression')
y_predictions.append(y_pred_svr_grid)

{'regressor__C': 5, 'regressor__epsilon': 0.5, 'regressor__gamma': 0.01}


#### Random Forest Regression (grid search)

In [98]:
from sklearn.ensemble import RandomForestRegressor

RFR_grid_search = RandomForestRegressor(random_state=42)

rfr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RFR_grid_search)
])

rf_param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [15, 20, 25],
    'regressor__min_samples_split': [7, 10, 18],
    'regressor__max_features': ['sqrt', 0.5, 1]
}

rf_grid = GridSearchCV(rfr_pipeline, rf_param_grid, cv=5, scoring='r2', n_jobs=-1)
rf_grid.fit(X_train, Y_train)

y_pred_rf_grid = rf_grid.predict(X_test)

In [99]:
names.append('Random Forest Regression')
y_predictions.append(y_pred_rf_grid)

print(rf_grid.best_params_)

{'regressor__max_depth': 20, 'regressor__max_features': 0.5, 'regressor__min_samples_split': 7, 'regressor__n_estimators': 300}


### Evaluation

In [100]:
from src.linear_regression.model_evaluation import metrics_table

df_metrics = metrics_table(Y_test, y_predictions, names)
df_sorted = df_metrics.sort_values(by='R²', ascending=False)

display(df_sorted)

Unnamed: 0,R²,MSE,RMSE,MAE
Support Vector Regression,0.770509,3.376619,1.837558,0.426594
Closed Form Solution with L2,0.769846,3.386375,1.840211,0.462682
Closed Form Solution,0.769843,3.386426,1.840224,0.462822
Linear Regression (sklearn),0.769843,3.386426,1.840224,0.462822
Ridge,0.76983,3.386616,1.840276,0.463101
Lasso,0.769824,3.386705,1.8403,0.463921
Gradient Descent,0.769726,3.388145,1.840691,0.455645
Gradient Descent L1 and L2,0.769675,3.388901,1.840897,0.459215
Gradient Descent L1,0.769459,3.392073,1.841758,0.482052
Gradient Descent L2,0.768425,3.407292,1.845885,0.495435


#### Cross-validation

In [101]:
from sklearn.model_selection import cross_validate, KFold

kfold = KFold(n_splits=3, shuffle=True, random_state=42)

estimators = [sklearn_LR_pipeline,
              small_pipeline,
              poly_pipeline,
              l1_grid.best_estimator_,
              l2_grid.best_estimator_,
              closed_form_pipeline,
              cf_LR_L2_pipeline,
              gradient_descent_pipeline,
              gd_l1_pipeline,
              gd_l2_pipeline,
              gd_en_pipeline,
              svr_grid.best_estimator_,
              rf_grid.best_estimator_]

cross_vals = []

for estimator in estimators:
    cross_val = cross_validate(estimator, X_train, Y_train, cv=kfold,
                              scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'),
                              return_train_score=True)
    cross_vals.append(cross_val)

In [102]:
metrics = ['R² (test)', 'R² (train)', 'R² difference', 'MSE (test)', 'MAE (test)']

results = {
    name: [
        np.mean(cross_val['test_r2']),
        np.mean(cross_val['train_r2']),
        np.mean(cross_val['train_r2']) - np.mean(cross_val['test_r2']),
        -np.mean(cross_val['test_neg_mean_squared_error']),
        -np.mean(cross_val['test_neg_mean_absolute_error'])
    ]
    for (name, cross_val) in zip(names, cross_vals) 
}

df_cross_val = pd.DataFrame(results, index=metrics).T.round(6)
df_sorted_cross_val = df_cross_val.sort_values(by='R² (test)', ascending=False)

display(df_sorted_cross_val)

Unnamed: 0,R² (test),R² (train),R² difference,MSE (test),MAE (test)
Lasso,0.713839,0.714915,0.001077,4.435688,0.517689
Ridge,0.713824,0.714966,0.001142,4.435911,0.516887
Linear Regression (sklearn),0.713811,0.714995,0.001184,4.436119,0.516727
Closed Form Solution with L2,0.713811,0.714995,0.001184,4.436116,0.516537
Closed Form Solution,0.713811,0.714995,0.001184,4.436119,0.516727
Gradient Descent L1 and L2,0.713696,0.714846,0.00115,4.437544,0.531191
Gradient Descent L2,0.713674,0.714886,0.001212,4.437942,0.524966
Gradient Descent L1,0.713612,0.714709,0.001096,4.439915,0.499481
Gradient Descent,0.712859,0.71446,0.001601,4.451293,0.51151
Support Vector Regression,0.712739,0.71248,-0.00026,4.452913,0.469717
