# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [2]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from time import sleep

from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.max_colwidth', None) 

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [3]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
df_dayofweek = pd.read_csv('../data/dayofweek.csv', usecols=['dayofweek'])

In [4]:
df['dayofweek'] = df_dayofweek
df.head()

Unnamed: 0,numTrials,hour,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,uid_user_15,uid_user_16,uid_user_17,uid_user_18,uid_user_19,uid_user_2,uid_user_20,uid_user_21,uid_user_22,uid_user_23,uid_user_24,uid_user_25,uid_user_26,uid_user_27,uid_user_28,uid_user_29,uid_user_3,uid_user_30,uid_user_31,uid_user_4,uid_user_6,uid_user_7,uid_user_8,labname_code_rvw,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1,dayofweek
0,1,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
1,2,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
2,3,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
3,4,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4
4,5,5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,4


In [5]:
X = df.drop(columns=['dayofweek'])
y = df['dayofweek']

In [6]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [7]:
parameters = {'kernel': ('linear', 'rbf', 'sigmoid'), 'C': (0.01, 0.1, 1, 1.5, 5, 10), 'gamma': ('scale', 'auto'), 'class_weight': ('balanced', None)}

In [8]:
svc = SVC(random_state=21, probability=True)
clf = GridSearchCV(svc, parameters)

In [None]:
clf.fit(X_train, y_train)

In [None]:
clf.best_estimator_

In [None]:
def df_results(model):
    results = pd.DataFrame(model.cv_results_)
    sorted_results = results.sort_values(by='rank_test_score')
    return sorted_results[['params', 'mean_test_score', 'rank_test_score']]

In [None]:
df_results(clf).head()

Unnamed: 0,params,mean_test_score,rank_test_score
70,"{'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}",0.876109,1
64,"{'C': 10, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf'}",0.8635,2
58,"{'C': 5, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}",0.816018,3
52,"{'C': 5, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf'}",0.808608,4
63,"{'C': 10, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'linear'}",0.721052,5


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [None]:
parameters_tree = {'max_depth': range(1, 50), 'class_weight': ('balanced', None), 'criterion': ('entropy', 'gini')}

In [None]:
tree = DecisionTreeClassifier(random_state=21)
clf_tree = GridSearchCV(tree, parameters_tree)

In [None]:
clf_tree.fit(X_train, y_train)

In [None]:
clf_tree.best_estimator_

In [None]:
df_params_tree = df_results(clf_tree)
df_params_tree.head()

Unnamed: 0,params,mean_test_score,rank_test_score
70,"{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 22}",0.873121,1
69,"{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21}",0.873121,2
97,"{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 49}",0.873116,3
95,"{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 47}",0.873116,3
94,"{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 46}",0.873116,3


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [None]:
parameters_rf = {'n_estimators': (5, 10, 50, 100), 'max_depth': range(1, 50), 'class_weight': ('balanced', None), 'criterion': ('entropy', 'gini')}

In [None]:
rf = RandomForestClassifier(random_state=21)
clf_rf = GridSearchCV(rf, parameters_rf)

In [None]:
clf_rf.fit(X_train, y_train)

In [None]:
clf_rf.best_estimator_

In [None]:
df_params_forest = df_results(clf_rf)
df_params_forest.head()

Unnamed: 0,params,mean_test_score,rank_test_score
698,"{'class_weight': None, 'criterion': 'gini', 'max_depth': 28, 'n_estimators': 50}",0.90429,1
711,"{'class_weight': None, 'criterion': 'gini', 'max_depth': 31, 'n_estimators': 100}",0.903547,2
314,"{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 30, 'n_estimators': 50}",0.902817,3
330,"{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 34, 'n_estimators': 50}",0.902809,4
702,"{'class_weight': None, 'criterion': 'gini', 'max_depth': 29, 'n_estimators': 50}",0.902806,5


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [None]:
forest = RandomForestClassifier(class_weight=None, criterion='gini', max_depth=28, n_estimators=50, random_state=21)
score = cross_val_score(forest, X_train, y_train, cv=5, n_jobs=-1)

In [None]:
results = []

for params in tqdm(df_params_forest['params']):    
    forest = RandomForestClassifier(**params, random_state=21)
    score = cross_val_score(forest, X_train, y_train, cv=5, n_jobs=-1)
    result = {
        'params': params,
        'mean_accuracy': np.mean(score),
        'std_accuracy': np.std(score)
    }
    results.append(result)
        

  0%|          | 0/784 [00:00<?, ?it/s]

In [None]:
results_df = pd.DataFrame(results)
sorted_results_df = results_df.sort_values(by='mean_accuracy', ascending=False)
sorted_results_df.head()

Unnamed: 0,params,mean_accuracy,std_accuracy
0,"{'class_weight': None, 'criterion': 'gini', 'max_depth': 28, 'n_estimators': 50}",0.90429,0.010961
1,"{'class_weight': None, 'criterion': 'gini', 'max_depth': 31, 'n_estimators': 100}",0.903547,0.01438
2,"{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 30, 'n_estimators': 50}",0.902817,0.013554
3,"{'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 34, 'n_estimators': 50}",0.902809,0.01301
11,"{'class_weight': None, 'criterion': 'gini', 'max_depth': 42, 'n_estimators': 100}",0.902806,0.01046


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [None]:
best_model = RandomForestClassifier(class_weight=None, criterion='gini', max_depth=28, n_estimators=50, random_state=21)
best_model.fit(X_train, y_train)
best_model.score(X_test, y_test)

0.9289940828402367