### Training and testing machine learning models.
- Linear regression
- Random forest regressor
- Support vector regression
- Dummy regressor (for comparison)

#### Importing the dataset.

In [20]:
import pandas as pd

# Read the raw student-performance table from disk, then print its schema
# summary (column names, non-null counts, dtypes) as a quick sanity check.
data = pd.read_csv(filepath_or_buffer='data/StudentPerformanceFactors.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

#### Separating features from target.

In [21]:
# Features: every column except the target. Target: the exam score column.
X = data.drop(columns="Exam_Score")
Y = data.loc[:, 'Exam_Score']

#### Splitting the data into a training set and a test set.

In [22]:
from sklearn.model_selection import train_test_split

# Hold out 25% of the rows for testing. The random_state is pinned so that the
# split -- and therefore every metric computed from it -- is identical after a
# kernel restart; without it each run shuffles the rows differently.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, train_size=0.75, random_state=42
)

#### Separating numerical columns and categorical columns.

In [23]:
# Column labels grouped by dtype, so each group can be routed to its own
# preprocessing pipeline.
num_cols = (
    X
    .select_dtypes(include=['int64', 'float64'])
    .columns
)
cat_cols = (
    X
    .select_dtypes(include=['object', 'category'])
    .columns
)

#### Defining numerical columns preprocessing pipeline.

In [24]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

# Numerical features: fill missing values with the column median, then
# standardize with StandardScaler.
num_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
    ]
)

#### Defining categorical columns preprocessing pipeline.

In [25]:
from sklearn.preprocessing import OneHotEncoder

# Categorical features: fill missing values with the most frequent category,
# then one-hot encode. handle_unknown='ignore' means a category that was not
# seen during fit does not raise at transform time.
cat_pipeline = Pipeline(
    steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('onehot', OneHotEncoder(handle_unknown='ignore')),
    ]
)

#### Merging into one column transformer.

In [26]:
from sklearn.compose import ColumnTransformer

# Route each column group through its matching pipeline; the transformed
# outputs are combined into a single feature matrix.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', num_pipeline, num_cols),
        ('cat', cat_pipeline, cat_cols),
    ]
)

#### Finally, defining the full pipelines, fitting them, and predicting on the test set.

In [27]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR

# One estimator per model under comparison.
# - The SVR instance gets its own name: binding it to `SVR` would shadow the
#   imported class and make this cell fail with "'SVR' object is not callable"
#   the second time it is run.
# - The random forest is seeded so its score is reproducible between runs.
LR = LinearRegression()
RF = RandomForestRegressor(random_state=42)
SVR_MODEL = SVR()
DR = DummyRegressor()


def make_model_pipeline(regressor):
    """Chain the shared preprocessing step with ``regressor``.

    Parameters
    ----------
    regressor : estimator
        Unfitted scikit-learn regressor used as the final pipeline step.

    Returns
    -------
    Pipeline
        Two-step pipeline: ``'preprocessor'`` followed by ``'regressor'``.
    """
    # NOTE: every pipeline built here references the same `preprocessor`
    # object. That is fine as long as all pipelines are fit on the same
    # training data, as they are below.
    return Pipeline([
        ('preprocessor', preprocessor),
        ('regressor', regressor)
    ])


linear_regression_pipeline = make_model_pipeline(LR)
random_forest_pipeline = make_model_pipeline(RF)
svr_pipeline = make_model_pipeline(SVR_MODEL)
dummy_pipeline = make_model_pipeline(DR)

# Fit every pipeline on the training split before any prediction is made.
for model_pipeline in (
    linear_regression_pipeline,
    random_forest_pipeline,
    svr_pipeline,
    dummy_pipeline,
):
    model_pipeline.fit(X_train, Y_train)

# Predictions on the held-out test split, one array per model.
y_pred_lr = linear_regression_pipeline.predict(X_test)
y_pred_rf = random_forest_pipeline.predict(X_test)
y_pred_svr = svr_pipeline.predict(X_test)
y_pred_dr = dummy_pipeline.predict(X_test)

#### Evaluating models.

In [28]:
from src.linear_regression.model_evaluation import metrics_table

# Pair each display label with its prediction array so the two lists handed to
# metrics_table cannot drift out of order (dicts preserve insertion order).
predictions_by_model = {
    'Linear Regression': y_pred_lr,
    'Random forest': y_pred_rf,
    'Support vector regression': y_pred_svr,
    'Dummy regressor': y_pred_dr,
}

df_metrics = metrics_table(
    Y_test,
    list(predictions_by_model.values()),
    list(predictions_by_model.keys()),
)

print("Evaluating models:")
display(df_metrics)

Evaluating models:


Unnamed: 0,R2,RMSE,MAE
Linear Regression,0.682049,2.164912,0.504253
Random forest,0.571858,2.512198,1.217137
Support vector regression,0.672634,2.196731,0.556853
Dummy regressor,-0.000427,3.84019,2.819673
