## 5. Hyperparameter optimization

In [3]:
import pandas as pd

data = pd.read_csv('data/StudentPerformanceFactors.csv')

X = data.drop("Exam_Score", axis=1)
Y = data['Exam_Score']

from sklearn.model_selection import train_test_split

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=0.75)

num_cols = X.select_dtypes(include=['int64', 'float64']).columns
cat_cols = X.select_dtypes(include=['object', 'category']).columns

from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler

num_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())
])

from sklearn.preprocessing import OneHotEncoder

cat_pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

from sklearn.compose import ColumnTransformer

preprocessor = ColumnTransformer([
    ('num', num_pipeline, num_cols),
    ('cat', cat_pipeline, cat_cols)
])

In [20]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import GridSearchCV

RFR = RandomForestRegressor(random_state=42)
RFR_grid_search = RandomForestRegressor(random_state=42)
KNN = KNeighborsRegressor()

rfr_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RFR)
])

rfr_grid_search_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', RFR_grid_search)
])

rfr_pipeline.fit(X_train, Y_train)

rf_param_grid = {
    'regressor__n_estimators': [170, 200, 220],
    'regressor__max_depth': [12, 15, 18],
    'regressor__min_samples_split': [7, 10, 18],
}

rf_grid = GridSearchCV(rfr_grid_search_pipeline, rf_param_grid, cv=5, scoring='r2', n_jobs=-1)
rf_grid.fit(X_train, Y_train)

print("RandomForest Best Params:", rf_grid.best_params_)

RandomForest Best Params: {'regressor__max_depth': 15, 'regressor__min_samples_split': 10, 'regressor__n_estimators': 220}


In [21]:
y_pred_rf = rfr_pipeline.predict(X_test)
y_pred_rf_grid = rf_grid.predict(X_test)

from sklearn.metrics import r2_score

print(r2_score(Y_train, rfr_pipeline.predict(X_train)))
print(r2_score(Y_test, y_pred_rf))

print(r2_score(Y_train, rf_grid.predict(X_train)))
print(r2_score(Y_test, y_pred_rf_grid))

0.9476122343079263
0.6652971590099943
0.8362169336877522
0.6766308445627984


In [None]:
from sklearn.neighbors import KNeighborsRegressor

KNN = KNeighborsRegressor()
KNN_grid_search = KNeighborsRegressor()

knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', KNN)
])

knn_grid_search_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', KNN_grid_search)
])

knn_pipeline.fit(X_train, Y_train)

knn_param_grid = {
    'regressor__n_neighbors': [9, 11, 13, 15, 17],
    'regressor__weights': ['uniform', 'distance'],
    'regressor__p': [1, 2]
}

knn_grid = GridSearchCV(knn_grid_search_pipeline, knn_param_grid, cv=5, scoring='r2', n_jobs=-1)
knn_grid.fit(X_train, Y_train)

print("KNN Best Params:", knn_grid.best_params_)

KNN Best Params: {'regressor__n_neighbors': 13, 'regressor__p': 2, 'regressor__weights': 'distance'}


In [27]:
y_pred_knn = knn_pipeline.predict(X_test)
y_pred_knn_grid = knn_grid.predict(X_test)

print(r2_score(Y_train, knn_pipeline.predict(X_train)))
print(r2_score(Y_test, y_pred_knn))

print(r2_score(Y_train, knn_grid.predict(X_train)))
print(r2_score(Y_test, y_pred_knn_grid))

0.6704570445804028
0.5468836693771812
0.9999999999999978
0.6002838077171371
