## 3. L1 and L2 regularization
This notebook compares:
- scikit-learn linear regression with Lasso (L1) and Ridge (L2) regularization
- Linear regression using the closed-form solution, with and without regularization
- Linear regression using gradient descent, with and without regularization

In [1]:
# Dependencies of this cell, grouped up front so they are visible at a glance.
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

data = pd.read_csv('data/StudentPerformanceFactors.csv')

# Features are every column except the target, `Exam_Score`.
X = data.drop("Exam_Score", axis=1)
Y = data['Exam_Score']

# 75% train / 25% test. The seed is fixed so that every run (and every model
# compared below) is evaluated on the same rows; without it the reported
# metrics change on each kernel restart.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.75, random_state=42
)

# Column groups are selected by dtype: numeric columns vs. string/categorical.
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

# Numeric columns: fill missing values with the median, then standardize.
num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode. Categories unseen during fit are ignored at transform
# time instead of raising.
cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Single preprocessing step that routes each column group to its pipeline.
preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

#### Lasso and Ridge

In [2]:
from sklearn.linear_model import Lasso, LinearRegression, Ridge

# Plain least-squares baseline and the two regularized estimators. All three
# are created with their default arguments.
sklearn_LR = LinearRegression()
L1 = Lasso()
L2 = Ridge()

# Each pipeline runs the same `preprocessor` object and then its own regressor.
sklearn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', sklearn_LR),
])

L1_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', L1),
])

L2_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', L2),
])

In [3]:
from sklearn.model_selection import GridSearchCV
import numpy as np

# Candidate regularization strengths for the `regressor` step of each pipeline:
# 9 log-spaced values in [1e-5, 1e-1] for Lasso, 10 evenly spaced values in
# [1, 10] for Ridge.
L1_param_grid = {'regressor__alpha': np.logspace(-5, -1, 9)}
L2_param_grid = {'regressor__alpha': np.linspace(1, 10, 10)}

# 5-fold grid searches scored by R², run on all available cores.
l1_grid = GridSearchCV(
    estimator=L1_pipeline,
    param_grid=L1_param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)
l2_grid = GridSearchCV(
    estimator=L2_pipeline,
    param_grid=L2_param_grid,
    cv=5,
    scoring='r2',
    n_jobs=-1,
)

# Fit on the training split only: Lasso search, Ridge search, then the
# unregularized baseline pipeline.
l1_grid.fit(X_train, Y_train)
l2_grid.fit(X_train, Y_train)
sklearn_pipeline.fit(X_train, Y_train)

# Test-set predictions, reused by the metrics table further down.
y_pred_l1 = l1_grid.predict(X_test)
y_pred_l2 = l2_grid.predict(X_test)
y_pred_skl = sklearn_pipeline.predict(X_test)

print("Lasso Best Params:", l1_grid.best_params_)
print("Ridge Best Params:", l2_grid.best_params_)

KeyboardInterrupt: 

#### Evaluating the models

In [None]:
model_names = ['Lasso (L1)', 'Ridge (L2)', 'Linear Regression']
models = [l1_grid, l2_grid, sklearn_pipeline]
metrics = ['R² (train)', 'R² (test)']

# Map each model label to its [train score, test score] pair, where the score
# is whatever the fitted object's `.score` returns on that split.
results = dict(
    (name, [model.score(X_train, Y_train), model.score(X_test, Y_test)])
    for name, model in zip(model_names, models)
)

# One row per model, one column per metric, rounded to 6 decimals.
df_compare = pd.DataFrame(results, index=metrics).T.round(6)

display(df_compare)

Unnamed: 0,R² (train),R² (test)
Lasso (L1),0.736555,0.698352
Ridge (L2),0.736553,0.698365
Linear Regression,0.736563,0.698338


In [73]:
from src.linear_regression.model_evaluation import metrics_table

# Test-set predictions of the three scikit-learn models, keyed by the row
# label each one gets in the metrics table.
sklearn_test_predictions = {
    'Lasso (L1)': y_pred_l1,
    'Ridge (L2)': y_pred_l2,
    'Linear Regression': y_pred_skl,
}

# `metrics_table` is a project helper; it is called with the true targets, a
# list of prediction arrays and a list of matching labels.
df_metrics = metrics_table(
    Y_test,
    list(sklearn_test_predictions.values()),
    list(sklearn_test_predictions.keys()),
)

display(df_metrics)

Unnamed: 0,R²,MSE,RMSE,MAE
Lasso (L1),0.698352,4.665759,2.160037,0.510651
Ridge (L2),0.698365,4.665566,2.159992,0.510828
Linear Regression,0.698338,4.665985,2.160089,0.510609


#### Comparing weights

In [74]:
from src.linear_regression.model_evaluation import weights_table

# Regressor steps of the best pipelines found by the two grid searches.
best_lasso = l1_grid.best_estimator_.named_steps['regressor']
best_ridge = l2_grid.best_estimator_.named_steps['regressor']

# `weights_table` is a project helper; it is called with the full data frame,
# the feature frame, a list of regressors and a list of column labels.
weights_df = weights_table(
    data,
    X,
    [best_lasso, best_ridge, sklearn_LR],
    ['Lasso', 'Ridge', 'Linear Regression'],
)

display(weights_df)

Unnamed: 0,Feature,Lasso,Ridge,Linear Regression
0,Hours_Studied,1.77208,1.769784,1.7732
1,Attendance,2.278392,2.275209,2.279449
2,Sleep_Hours,-0.001198,-0.002314,-0.002066
3,Previous_Scores,0.686444,0.686243,0.687438
4,Tutoring_Sessions,0.628561,0.628423,0.629571
5,Physical_Activity,0.181699,0.182305,0.183075
6,Parental_Involvement - Low,1.074642,1.015001,1.022632
7,Parental_Involvement - Medium,-0.905978,-0.958564,-0.966809
8,Parental_Involvement - High,0.0,-0.056436,-0.055823
9,Access_to_Resources - High,1.038871,1.04406,1.051774


### Adding regularization to our own models

In [25]:
from src.linear_regression.models import LinearRegressionClosedForm, LinearRegressionGradientDescent

# Each project regressor is paired with a pipeline that runs the shared
# `preprocessor` first. Regressors are created in the same order as before.

# --- Closed-form solver: default settings vs. regularized (alpha=10) ---
# The regularized variant is the one labelled "L2" throughout this notebook.
closed_form_LR = LinearRegressionClosedForm()
closed_form_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', closed_form_LR),
])

closed_form_LR_L2 = LinearRegressionClosedForm(regularization=True, alpha=10)
cf_LR_L2_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', closed_form_LR_L2),
])

# --- Gradient descent: default settings ---
gradient_descent_LR = LinearRegressionGradientDescent()
gradient_descent_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gradient_descent_LR),
])

# --- Gradient descent with a penalty: 400 epochs each ---
gradient_descent_LR_L1 = LinearRegressionGradientDescent(n_epochs=400, regularization='l1', alpha=0.1)
gd_LR_L1_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gradient_descent_LR_L1),
])

gradient_descent_LR_L2 = LinearRegressionGradientDescent(n_epochs=400, regularization='l2', alpha=0.05)
gd_LR_L2_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gradient_descent_LR_L2),
])

gradient_descent_LR_en = LinearRegressionGradientDescent(n_epochs=400, regularization='elasticnet', alpha=0.1)
gd_LR_en_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', gradient_descent_LR_en),
])

In [26]:
# Fit every project pipeline on the training split, closed-form models first,
# then the gradient-descent variants.
for fitted_pipeline in (
    closed_form_pipeline,
    cf_LR_L2_pipeline,
    gradient_descent_pipeline,
    gd_LR_L1_pipeline,
    gd_LR_L2_pipeline,
    gd_LR_en_pipeline,
):
    fitted_pipeline.fit(X_train, Y_train)

# Test-set predictions, kept under one name per model for the tables below.
y_pred_cf    = closed_form_pipeline.predict(X_test)
y_pred_cf_L2 = cf_LR_L2_pipeline.predict(X_test)

y_pred_gd    = gradient_descent_pipeline.predict(X_test)
y_pred_gd_l1 = gd_LR_L1_pipeline.predict(X_test)
y_pred_gd_l2 = gd_LR_L2_pipeline.predict(X_test)
y_pred_gd_en = gd_LR_en_pipeline.predict(X_test)

#### Evaluation

In [27]:
from src.linear_regression.model_evaluation import metrics_table, weights_table

# Test-set metrics for the project's own regressors. The predictions and the
# labels are positional pairs, so the two lists must stay in the same order.
# "with both" is the regressor created with regularization='elasticnet'; the
# 0.5 ratio in its label is not set anywhere in this notebook -- presumably the
# class default, TODO confirm against LinearRegressionGradientDescent.
df_metrics_reg = metrics_table(Y_test,
                           [y_pred_cf_L2, y_pred_cf, y_pred_gd_l1, y_pred_gd_l2, y_pred_gd_en, y_pred_gd],
                           ['Closed Form with L2',
                            'Closed Form without regularization',  # label typo fixed (was 'Closed for ...')
                            'Gradient Descent with L1',
                            'Gradient Descent with L2',
                            'Gradient Descent with both (0.5 ratio)',
                            'Gradient Descent without regularization'])

display(df_metrics_reg)

Unnamed: 0,R²,MSE,RMSE,MAE
Closed Form with L2,0.801613,2.845814,1.686954,0.439471
Closed for without regularization,0.80157,2.846444,1.687141,0.439952
Gradient Descent with L1,0.790589,3.003955,1.733192,0.560361
Gradient Descent with L2,0.795986,2.926537,1.710713,0.511292
Gradient Descent with both (0.5 ratio),0.786651,3.060441,1.749412,0.610154
Gradient Descent without regularization,0.801099,2.853191,1.689139,0.457864


#### Gradient descent weights

In [28]:
# (column label, fitted gradient-descent regressor) pairs for the weights table.
gd_weight_columns = [
    ('GD with L1', gradient_descent_LR_L1),
    ('GD with L2', gradient_descent_LR_L2),
    ('GD with both', gradient_descent_LR_en),
    ('GD without regularization', gradient_descent_LR),
]
models = [regressor for _, regressor in gd_weight_columns]
model_names = [label for label, _ in gd_weight_columns]

# `weights_table` is a project helper; it is called with the full data frame,
# the feature frame, the regressors and their column labels.
df_weights_reg = weights_table(data, X, models, model_names)

display(df_weights_reg)

Unnamed: 0,Feature,GD with L1,GD with L2,GD with both,GD without regularization
0,Hours_Studied,1.690812,1.660833,1.64186,1.741937
1,Attendance,2.251869,2.159591,2.143726,2.284531
2,Sleep_Hours,0.020631,0.029896,-0.014428,0.022109
3,Previous_Scores,0.618138,0.655169,0.614051,0.706349
4,Tutoring_Sessions,0.58872,0.58733,0.592465,0.645728
5,Physical_Activity,0.136259,0.163873,0.151584,0.20467
6,Parental_Involvement - Low,1.5981,0.884845,0.825872,4.61008
7,Parental_Involvement - Medium,-0.011944,-0.763781,-0.638031,2.603804
8,Parental_Involvement - High,0.581124,-0.03937,0.008636,3.519718
9,Access_to_Resources - High,1.660191,0.859224,0.732205,4.596141


#### Closed form weights

In [20]:
# (column label, fitted closed-form regressor) pairs for the weights table.
cf_weight_columns = [
    ('CF with L2', closed_form_LR_L2),
    ('CF without regularization', closed_form_LR),
]
models = [regressor for _, regressor in cf_weight_columns]
model_names = [label for label, _ in cf_weight_columns]

# Note: this reuses the name `df_weights_reg` and therefore replaces the
# gradient-descent weights table bound to it earlier in the notebook.
df_weights_reg = weights_table(data, X, models, model_names)

display(df_weights_reg)

Unnamed: 0,Feature,CF with L2,CF without regularization
0,Hours_Studied,1.747641,1.751455
1,Attendance,2.275048,2.27983
2,Sleep_Hours,0.022701,0.022917
3,Previous_Scores,0.67902,0.680556
4,Tutoring_Sessions,0.633763,0.634958
5,Physical_Activity,0.194235,0.195158
6,Parental_Involvement - Low,1.024121,4.60858
7,Parental_Involvement - Medium,-0.965633,2.600975
8,Parental_Involvement - High,-0.058488,3.5182
9,Access_to_Resources - High,1.007306,4.592214


#### Cross-validation

In [72]:
from sklearn.model_selection import cross_validate, KFold

# Shuffled 3-fold split with a fixed seed, shared by every model below.
kfold = KFold(n_splits=3, shuffle=True, random_state=42)

# Keyword arguments common to all cross-validation runs: same folds, same
# three scorers, and train-fold scores kept alongside the test-fold scores.
cross_val_options = dict(
    cv=kfold,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'),
    return_train_score=True,
)

# Gradient-descent variants.
GD_L1_cross_val = cross_validate(gd_LR_L1_pipeline, X_train, Y_train, **cross_val_options)
GD_L2_cross_val = cross_validate(gd_LR_L2_pipeline, X_train, Y_train, **cross_val_options)
GD_en_cross_val = cross_validate(gd_LR_en_pipeline, X_train, Y_train, **cross_val_options)
GD_cross_val = cross_validate(gradient_descent_pipeline, X_train, Y_train, **cross_val_options)

# Closed-form variants.
CF_L2_cross_val = cross_validate(cf_LR_L2_pipeline, X_train, Y_train, **cross_val_options)
CF_cross_val = cross_validate(closed_form_pipeline, X_train, Y_train, **cross_val_options)

In [73]:
import numpy as np

# Row labels and cross-validation results are positional pairs; keep both
# lists in the same order.
model_names = [ 'Closed Form with L2',
                'Closed Form without regularization',  # label typo fixed (was 'Closed for ...')
                'Gradient Descent with L1',
                'Gradient Descent with L2',
                'Gradient Descent with both (0.5 ratio)',
                'Gradient Descent without regularization']
cross_vals = [CF_L2_cross_val, CF_cross_val, GD_L1_cross_val, GD_L2_cross_val, GD_en_cross_val, GD_cross_val]
metrics = ['R² (test)', 'R² (train)', 'R² difference', 'MSE (test)', 'MAE (test)']

# Average every score over the folds. 'R² difference' is train minus test.
# The error scorers are the negated ('neg_*') ones, so their means are negated
# again to report positive MSE / MAE.
results = {
    name: [
        np.mean(cross_val['test_r2']),
        np.mean(cross_val['train_r2']),
        np.mean(cross_val['train_r2']) - np.mean(cross_val['test_r2']),
        -np.mean(cross_val['test_neg_mean_squared_error']),
        -np.mean(cross_val['test_neg_mean_absolute_error'])
    ]
    for (name, cross_val) in zip(model_names, cross_vals)
}

# One row per model, one column per metric, rounded to 6 decimals.
df_cross_val = pd.DataFrame(results, index=metrics).T.round(6)

display(df_cross_val)

Unnamed: 0,R² (test),R² (train),R² difference,MSE (test),MAE (test)
Closed Form with L2,0.708449,0.711722,0.003273,4.496935,0.495147
Closed for without regularization,0.7088,0.712169,0.00337,4.491366,0.509652
Gradient Descent with L1,0.708554,0.711767,0.003213,4.495075,0.493478
Gradient Descent with L2,0.708254,0.711763,0.003509,4.499794,0.524779
Gradient Descent with both (0.5 ratio),0.708442,0.711679,0.003237,4.49704,0.520927
Gradient Descent without regularization,0.708564,0.711941,0.003378,4.494927,0.519217
