### Training and testing machine learning models.
- Linear regression
- Random forest regressor
- Support vector regression
- Dummy regressor (for comparison)

#### Importing the dataset.

In [1]:
import pandas as pd

# Read the student-performance CSV (path is relative to the notebook's
# working directory) into a DataFrame.
csv_path = 'data/StudentPerformanceFactors.csv'
data = pd.read_csv(csv_path)

# Print the column dtypes and non-null counts; the summary is what reveals
# which columns contain missing values and which are categorical (object).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

#### Separating features from target.

In [2]:
# Name of the column the models are trained to predict.
target_column = "Exam_Score"

# X holds every column except the target; Y is the target column on its own.
X = data.drop(columns=target_column)
Y = data[target_column]

#### Splitting the data into a training set and a test set.

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. The shuffle is seeded so that the
# split -- and therefore every metric reported further down -- is identical
# after each Restart & Run All.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.75, random_state=42
)

#### Separating numerical columns and categorical columns.

In [4]:
# Dtypes that decide which preprocessing branch a column is routed to.
numeric_dtypes = ['int64', 'float64']
categorical_dtypes = ['object', 'category']

# Column labels (pandas Index objects) for each group, taken from the full
# feature frame X.
num_cols = X.select_dtypes(include=numeric_dtypes).columns
cat_cols = X.select_dtypes(include=categorical_dtypes).columns

#### Defining the preprocessing pipeline for numerical columns.

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numerical branch: fill missing values with the column median, then
# standardize with StandardScaler.
numeric_steps = [
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
]
num_pipeline = Pipeline(steps=numeric_steps)

#### Defining the preprocessing pipeline for categorical columns.

In [6]:
from sklearn.preprocessing import OneHotEncoder

# Categorical branch: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' keeps transform() from raising
# on a category that was not present when the encoder was fitted.
categorical_steps = [
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
]
cat_pipeline = Pipeline(steps=categorical_steps)

#### Merging into one column transformer.

In [7]:
from sklearn.compose import ColumnTransformer

# Route each column group through its own branch: `num_cols` through
# `num_pipeline`, `cat_cols` through `cat_pipeline`.
column_branches = [
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols),
]
preprocessor = ColumnTransformer(transformers=column_branches)

#### Finally, defining the full pipeline.

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR

# One estimator per model being compared.
LR = LinearRegression()
# Random forests are stochastic; a fixed seed makes the scores repeatable.
RF = RandomForestRegressor(random_state=42)
# Bound to its own name on purpose: writing `SVR = SVR()` rebinds the imported
# class to an instance, so running this cell a second time raises
# "TypeError: 'SVR' object is not callable".
svr_model = SVR()
# Baseline for comparison with the real models.
DR = DummyRegressor()


def build_pipeline(regressor):
    """Return a two-step Pipeline: the shared `preprocessor`, then `regressor`.

    The step names ('preprocessor', 'regressor') are the same for every model.
    Note that the `preprocessor` object itself is shared, not copied, between
    the pipelines built here.
    """
    return Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])


linear_regression_pipeline = build_pipeline(LR)
random_forest_pipeline = build_pipeline(RF)
svr_pipeline = build_pipeline(svr_model)
dummy_pipeline = build_pipeline(DR)

# Fit every pipeline on the training split only.
for fitted_pipeline in (linear_regression_pipeline,
                        random_forest_pipeline,
                        svr_pipeline,
                        dummy_pipeline):
    fitted_pipeline.fit(X_train, Y_train)

# Predictions on the held-out test split, one array per model.
y_pred_lr = linear_regression_pipeline.predict(X_test)
y_pred_rf = random_forest_pipeline.predict(X_test)
y_pred_svr = svr_pipeline.predict(X_test)
y_pred_dr = dummy_pipeline.predict(X_test)

#### Evaluating models.

In [9]:
from src.linear_regression.model_evaluation import metrics_table

# Test-set predictions and their display labels, in matching order.
test_predictions = [y_pred_lr, y_pred_rf, y_pred_svr, y_pred_dr]
model_labels = ['Linear Regression', 'Random forest', 'Support vector regression', 'Dummy regressor']

# metrics_table is a project helper; the table displayed below has one row
# per label with R², MSE, RMSE and MAE columns.
df_metrics = metrics_table(Y_test, test_predictions, model_labels)

display(df_metrics)

Unnamed: 0,R²,MSE,RMSE,MAE
Linear Regression,0.705193,4.527672,2.127833,0.494409
Random forest,0.625452,5.75234,2.398404,1.113432
Support vector regression,0.696654,4.658815,2.158429,0.542062
Dummy regressor,-0.001726,15.384595,3.92232,2.874064


#### Cross-validating models.

In [10]:
from sklearn.model_selection import cross_validate

# Settings shared by every cross-validation run: 5 folds, three scorers, and
# training scores kept so train and test performance can be compared later.
cv_settings = dict(
    cv=5,
    scoring=('r2', 'neg_mean_squared_error', 'neg_mean_absolute_error'),
    return_train_score=True,
)

# Cross-validate on the training split only; the test split stays untouched.
LR_cross_val = cross_validate(linear_regression_pipeline, X_train, Y_train, **cv_settings)

RF_cross_val = cross_validate(random_forest_pipeline, X_train, Y_train, **cv_settings)

SVR_cross_val = cross_validate(svr_pipeline, X_train, Y_train, **cv_settings)

In [11]:
import numpy as np

models = ['Linear Regression', 'Random Forest', 'Support Vector Regression']
metrics = ['R² (test)', 'R² (train)', 'R² difference', 'MSE (test)', 'MAE (test)']


def summarize_cv(cv_scores):
    """Collapse one cross_validate result into the values listed in `metrics`.

    Returns, in order: mean test R², mean train R², their difference
    (train minus test), mean test MSE and mean test MAE. The two error
    scores are stored negated under their 'neg_' keys, so the sign is
    flipped back here.
    """
    mean_test_r2 = np.mean(cv_scores['test_r2'])
    mean_train_r2 = np.mean(cv_scores['train_r2'])
    return [
        mean_test_r2,
        mean_train_r2,
        mean_train_r2 - mean_test_r2,
        -np.mean(cv_scores['test_neg_mean_squared_error']),
        -np.mean(cv_scores['test_neg_mean_absolute_error']),
    ]


# Pair each model label with its cross-validation scores (same order as `models`).
cv_runs = (LR_cross_val, RF_cross_val, SVR_cross_val)
results = {label: summarize_cv(scores) for label, scores in zip(models, cv_runs)}

# Transpose so that each model is a row and each metric a column.
df_cross_val = pd.DataFrame(results, index=metrics).T.round(6)

display(df_cross_val)

Unnamed: 0,R² (test),R² (train),R² difference,MSE (test),MAE (test)
Linear Regression,0.73845,0.734962,-0.003488,4.049083,0.496413
Random Forest,0.638883,0.948386,0.309503,5.522657,1.151502
Support Vector Regression,0.728154,0.735017,0.006863,4.206289,0.54637
