## 5. Hyperparameter optimization
For ```Random Forest Regressor``` and ```Support Vector Regression```

In [43]:
import pandas as pd

data = pd.read_csv('data/StudentPerformanceFactors.csv')

X = data.drop("Exam_Score", axis=1)
Y = data['Exam_Score']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75)

num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

#### Random Forest
- ```n_estimators```
- ```max_depth```
- ```min_samples_split```

In [33]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV

RFR = RandomForestRegressor(random_state=42)
RFR_grid_search = RandomForestRegressor(random_state=42)

rfr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RFR)
])

rfr_grid_search_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RFR_grid_search)
])

rfr_pipeline.fit(X_train, Y_train)

rf_param_grid = {
    'regressor__n_estimators': [100, 200, 300],
    'regressor__max_depth': [15, 20, 25],
    'regressor__min_samples_split': [7, 10, 18],
    'regressor__max_features': ['sqrt', 0.5, 1]
}

rf_grid = GridSearchCV(rfr_grid_search_pipeline, rf_param_grid, cv=5, scoring='r2', n_jobs=-1)
rf_grid.fit(X_train, Y_train)

#### Best parameters

In [35]:
print("Random Forest Best Params:")

for key, val in rf_grid.best_params_.items():
    print(f'{key}: {val}')

Random Forest Best Params:
regressor__max_depth: 25
regressor__max_features: 0.5
regressor__min_samples_split: 7
regressor__n_estimators: 300


In [37]:
y_pred_rf = rfr_pipeline.predict(X_test)
y_pred_rf_grid = rf_grid.predict(X_test)

from src.linear_regression.model_evaluation import metrics_table

df_metrics_rf = metrics_table(Y_test,
                              [y_pred_rf, y_pred_rf_grid],
                              ['Random Forest default parameters', 'Random Forest best parameters'])
df_metrics_rf_train = metrics_table(Y_train,
                              [rfr_pipeline.predict(X_train), rf_grid.predict(X_train)],
                              ['Random Forest default parameters', 'Random Forest best parameters'])

print('Train set:')
display(df_metrics_rf_train)
print('Test set:')
display(df_metrics_rf)

Train set:


Unnamed: 0,R²,MSE,RMSE,MAE
Random Forest default parameters,0.94897,0.793763,0.890934,0.434365
Random Forest best parameters,0.867421,2.062237,1.436049,0.59128


Test set:


Unnamed: 0,R²,MSE,RMSE,MAE
Random Forest default parameters,0.637604,5.024619,2.241566,1.123172
Random Forest best parameters,0.664624,4.64998,2.156381,1.042572


#### Support Vector Regression
- ```C```
- ```epsilon```

In [44]:
from sklearn.svm import SVR

SVR_def = SVR()
SVR_grid_search = SVR()

svr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR_def)
])

svr_grid_search_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', SVR_grid_search)
])

svr_pipeline.fit(X_train, Y_train)

svr_param_grid = {
    'regressor__C': [0.1, 1, 5, 10, 20],
    'regressor__epsilon': [0.05, 0.1, 0.5],
    'regressor__gamma': ['scale', 0.01, 0.1]
}

svr_grid = GridSearchCV(svr_grid_search_pipeline, svr_param_grid, cv=5, scoring='r2', n_jobs=-1)
svr_grid.fit(X_train, Y_train)

SVR Best Params: {'regressor__C': 5, 'regressor__epsilon': 0.5, 'regressor__gamma': 0.01}


In [46]:
print("Support Vector Regression Best Params:")

for key, val in svr_grid.best_params_.items():
    print(f'{key}: {val}')

Support Vector Regression Best Params:
regressor__C: 5
regressor__epsilon: 0.5
regressor__gamma: 0.01


In [47]:
y_pred_svr = svr_pipeline.predict(X_test)
y_pred_svr_grid = svr_grid.predict(X_test)

df_metrics_rf = metrics_table(Y_test,
                              [y_pred_svr, y_pred_svr_grid],
                              ['SVR default parameters', 'SVR best parameters'])
df_metrics_rf_train = metrics_table(Y_train,
                              [svr_pipeline.predict(X_train), svr_grid.predict(X_train)],
                              ['SVR default parameters', 'SVR best parameters'])

print('Train set:')
display(df_metrics_rf_train)
print('Test set:')
display(df_metrics_rf)

Train set:


Unnamed: 0,R²,MSE,RMSE,MAE
SVR default parameters,0.719588,4.239073,2.058901,0.413231
SVR best parameters,0.717809,4.265973,2.065423,0.455562


Test set:


Unnamed: 0,R²,MSE,RMSE,MAE
SVR default parameters,0.739594,3.953254,1.988279,0.525853
SVR best parameters,0.747004,3.840773,1.959789,0.441451
