### Training and testing machine learning models.
- Linear regression
- Random forest regressor
- Support vector regression
- Gradient boosting regressor

#### Importing the dataset.

In [1]:
import pandas as pd

data = pd.read_csv('data/StudentPerformanceFactors.csv')
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6607 entries, 0 to 6606
Data columns (total 20 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   Hours_Studied               6607 non-null   int64 
 1   Attendance                  6607 non-null   int64 
 2   Parental_Involvement        6607 non-null   object
 3   Access_to_Resources         6607 non-null   object
 4   Extracurricular_Activities  6607 non-null   object
 5   Sleep_Hours                 6607 non-null   int64 
 6   Previous_Scores             6607 non-null   int64 
 7   Motivation_Level            6607 non-null   object
 8   Internet_Access             6607 non-null   object
 9   Tutoring_Sessions           6607 non-null   int64 
 10  Family_Income               6607 non-null   object
 11  Teacher_Quality             6529 non-null   object
 12  School_Type                 6607 non-null   object
 13  Peer_Influence              6607 non-null   obje

#### Separating features from target.

In [2]:
X = data.drop("Exam_Score", axis=1)
Y = data['Exam_Score']

#### Splitting the data to train set and test set.

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75)

#### Separating numerical columns and categorical columns.

In [4]:
num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

#### Defining numerical columns preprocessing pipeline.

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

#### Defining categorical columns preprocessing pipeline.

In [6]:
from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

#### Merging into one column transformer.

In [7]:
from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

#### Finally, defining the full pipeline.

In [8]:
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.dummy import DummyRegressor
from sklearn.svm import SVR

LR = LinearRegression()
RF = RandomForestRegressor()
SVR = SVR()
GB = GradientBoostingRegressor()
DR = DummyRegressor()


linear_regression_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LR)
])

random_forest_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RF)
])

svr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR)
])

gradient_boosting_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', GB)
])

dummy_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', DR)
])


linear_regression_pipeline.fit(X_train, Y_train)
random_forest_pipeline.fit(X_train, Y_train)
svr_pipeline.fit(X_train, Y_train)
gradient_boosting_pipeline.fit(X_train, Y_train)
dummy_pipeline.fit(X_train, Y_train)

#### Evaluating models.

In [None]:
from sklearn.metrics import root_mean_squared_error, r2_score, mean_absolute_error


y_pred_lr = linear_regression_pipeline.predict(X_test)
y_pred_rf = random_forest_pipeline.predict(X_test)
y_pred_svr = svr_pipeline.predict(X_test)
y_pred_gb = gradient_boosting_pipeline.predict(X_test)
y_pred_dr = dummy_pipeline.predict(X_test)

# Linear regression
print('\t', '\033[1m', '\033[96m', 'Linear Regression:', '\033[0m', '\n')
print('Coefficients:', LR.coef_)
print('Intercept:', LR.intercept_)
print()

print("R2:", r2_score(Y_test, y_pred_lr))
print("RMSE:", root_mean_squared_error(Y_test, y_pred_lr))
print("MAE:", mean_absolute_error(Y_test, y_pred_lr))

print('\n\n')

# Random forest
print('\t', '\033[1m', '\033[96m', 'Random forest:', '\033[0m', '\n')

print("R2:", r2_score(Y_test, y_pred_rf))
print("RMSE:", root_mean_squared_error(Y_test, y_pred_rf))
print("MAE:", mean_absolute_error(Y_test, y_pred_rf))

print('\n\n')

# SVR
print('\t', '\033[1m', '\033[96m', 'Support vector regression:', '\033[0m', '\n')

print("R2:", r2_score(Y_test, y_pred_svr))
print("RMSE:", root_mean_squared_error(Y_test, y_pred_svr))
print("MAE:", mean_absolute_error(Y_test, y_pred_svr))

print('\n\n')

# Gradient boosting
print('\t', '\033[1m', '\033[96m', 'Gradient boosting:', '\033[0m', '\n')

print("R2:", r2_score(Y_test, y_pred_gb))  # R²
print("RMSE:", root_mean_squared_error(Y_test, y_pred_gb))
print("MAE:", mean_absolute_error(Y_test, y_pred_gb))

print('\n\n')

# Dummy Regressor
print('\t', '\033[1m', '\033[96m', 'Dummy :P :', '\033[0m', '\n')

print("R2:", r2_score(Y_test, y_pred_dr))  # R²
print("RMSE:", root_mean_squared_error(Y_test, y_pred_dr))
print("MAE:", mean_absolute_error(Y_test, y_pred_dr))

	 [1m [96m Linear Regression: [0m 

Coefficients: [ 1.77395171e+00  2.29267469e+00  6.43564240e-03  7.17054866e-01
  6.34668111e-01  2.12975312e-01  9.88744569e-01 -9.48403852e-01
 -4.03407168e-02  9.91150243e-01 -1.01028263e+00  1.91323838e-02
 -2.79197067e-01  2.79197067e-01  5.38491018e-01 -5.40490412e-01
  1.99939417e-03 -4.88750452e-01  4.88750452e-01  5.96095221e-01
 -5.47147409e-01 -4.89478124e-02  5.15742369e-01 -4.75608309e-01
 -4.01340595e-02 -2.95017407e-02  2.95017407e-02 -5.06825430e-01
  3.39160772e-02  4.72909353e-01  4.80951958e-01 -4.80951958e-01
  1.06908687e-02 -5.02594602e-01  4.91903733e-01 -4.32524049e-01
 -3.51256138e-02  4.67649663e-01  2.88799387e-02 -2.88799387e-02]
Intercept: 66.15535404862901

R2: 0.6796227467844165
RMSE: 2.2585208285071503
MAE: 0.5016386541606497



	 [1m [96m Random forest: [0m 

R2: 0.6009941674619561
RMSE: 2.5204778506548404
MAE: 1.1108474576271188



	 [1m [96m Support vector regression: [0m 

R2: 0.6742233077741366
RMSE: 2.277