# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [1]:
import pandas as pd
import numpy as np
from tqdm.notebook import tqdm
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import ParameterGrid, GridSearchCV
from sklearn.model_selection import train_test_split, cross_val_score

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1LjhbMTPnZsZpa1Uj75bCOaFpcfORXF3m/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore).
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/dayofweek-not-scaled.csv')
df

Unnamed: 0,numTrials,hour,dayofweek,uid_user_0,uid_user_1,uid_user_10,uid_user_11,uid_user_12,uid_user_13,uid_user_14,...,labname_lab02,labname_lab03,labname_lab03s,labname_lab05s,labname_laba04,labname_laba04s,labname_laba05,labname_laba06,labname_laba06s,labname_project1
0,1,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
1,2,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
2,3,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,4,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
4,5,5,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1681,9,20,3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1682,6,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1683,7,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1684,8,20,3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [3]:
X = df.drop('dayofweek', axis=1)
y = df['dayofweek']

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=21, stratify=y)

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [5]:
svc = SVC(probability=True)
param_grid = {'C': [0.01, 0.1, 1, 1.5, 5, 10],
              'kernel': ['linear', 'rbf', 'sigmoid'],
              'gamma': ['scale', 'auto'],
              'class_weight': ['balanced', None],
              'random_state': [21]}
gs = GridSearchCV(svc, param_grid, scoring='accuracy', n_jobs=-1)
gs.fit(X_train, y_train)

GridSearchCV(estimator=SVC(probability=True), n_jobs=-1,
             param_grid={'C': [0.01, 0.1, 1, 1.5, 5, 10],
                         'class_weight': ['balanced', None],
                         'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'rbf', 'sigmoid'],
                         'random_state': [21]},
             scoring='accuracy')

In [6]:
gs.best_params_

{'C': 10,
 'class_weight': None,
 'gamma': 'auto',
 'kernel': 'rbf',
 'random_state': 21}

In [7]:
gs.best_score_

0.8761090458488228

In [8]:
results = pd.DataFrame(gs.cv_results_)
results = results.sort_values('rank_test_score', ascending=False)
results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_C,param_class_weight,param_gamma,param_kernel,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
29,2.252575,0.117662,0.057424,0.004936,1,balanced,auto,sigmoid,21,"{'C': 1, 'class_weight': 'balanced', 'gamma': ...",0.066667,0.070370,0.044444,0.063197,0.055762,0.060088,0.009188,72
17,3.185392,0.145737,0.081751,0.022340,0.1,balanced,auto,sigmoid,21,"{'C': 0.1, 'class_weight': 'balanced', 'gamma'...",0.062963,0.066667,0.062963,0.059480,0.059480,0.062310,0.002678,71
41,1.753924,0.035643,0.051054,0.005890,1.5,balanced,auto,sigmoid,21,"{'C': 1.5, 'class_weight': 'balanced', 'gamma'...",0.066667,0.085185,0.081481,0.078067,0.085502,0.079380,0.006913,70
65,1.432025,0.061603,0.045452,0.002763,10,balanced,auto,sigmoid,21,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.122222,0.140741,0.129630,0.100372,0.085502,0.115693,0.020052,69
53,1.614948,0.085890,0.045209,0.001640,5,balanced,auto,sigmoid,21,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.144444,0.148148,0.137037,0.126394,0.092937,0.129792,0.019869,68
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
63,141.181369,20.307216,0.022167,0.000571,10,balanced,auto,linear,21,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.729630,0.700000,0.755556,0.754647,0.665428,0.721052,0.034438,5
52,1.451135,0.046662,0.127372,0.005122,5,balanced,auto,rbf,21,"{'C': 5, 'class_weight': 'balanced', 'gamma': ...",0.844444,0.785185,0.792593,0.817844,0.802974,0.808608,0.021007,4
58,1.391673,0.039465,0.130286,0.003474,5,,auto,rbf,21,"{'C': 5, 'class_weight': None, 'gamma': 'auto'...",0.825926,0.811111,0.818519,0.821561,0.802974,0.816018,0.008116,3
64,1.549455,0.007265,0.129132,0.007299,10,balanced,auto,rbf,21,"{'C': 10, 'class_weight': 'balanced', 'gamma':...",0.877778,0.851852,0.862963,0.873606,0.851301,0.863500,0.010870,2


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [9]:
dtc = DecisionTreeClassifier()
param_grid = {'criterion': ['gini','entropy'],
              'max_depth': np.arange(1, 50),
              'class_weight': ['balanced', None],
              'random_state': [21]}
gs = GridSearchCV(dtc, param_grid, scoring='accuracy', n_jobs=-1)
gs.fit(X_train, y_train)

GridSearchCV(estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                         'random_state': [21]},
             scoring='accuracy')

In [10]:
gs.best_params_

{'class_weight': 'balanced',
 'criterion': 'gini',
 'max_depth': 21,
 'random_state': 21}

In [11]:
gs.best_score_

0.873864794162192

In [12]:
results2 = pd.DataFrame(gs.cv_results_)
results2 = results2.sort_values('rank_test_score', ascending=False)
results2

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.008123,0.000863,0.004408,0.000435,balanced,gini,1,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.262963,0.318519,0.266667,0.323420,0.260223,0.286358,0.028376,195
49,0.007525,0.000518,0.004238,0.000922,balanced,entropy,1,21,"{'class_weight': 'balanced', 'criterion': 'ent...",0.262963,0.318519,0.266667,0.323420,0.260223,0.286358,0.028376,195
147,0.007038,0.000729,0.003991,0.000108,,entropy,1,21,"{'class_weight': None, 'criterion': 'entropy',...",0.370370,0.351852,0.359259,0.353160,0.342007,0.355330,0.009338,193
98,0.010231,0.004596,0.004609,0.000588,,gini,1,21,"{'class_weight': None, 'criterion': 'gini', 'm...",0.370370,0.351852,0.359259,0.353160,0.342007,0.355330,0.009338,193
2,0.009897,0.001469,0.004637,0.001573,balanced,gini,3,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.388889,0.303704,0.403704,0.427509,0.345725,0.373906,0.044064,192
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
44,0.013803,0.000847,0.004344,0.000764,balanced,gini,45,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.828996,0.872372,0.025179,4
42,0.013322,0.001442,0.004052,0.000183,balanced,gini,43,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.866667,0.903704,0.873606,0.828996,0.872372,0.025179,4
21,0.012268,0.001065,0.003878,0.000191,balanced,gini,22,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.885185,0.862963,0.903704,0.881041,0.828996,0.872378,0.025263,3
24,0.012969,0.001639,0.004339,0.000443,balanced,gini,25,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.888889,0.874074,0.903704,0.873606,0.828996,0.873854,0.025018,2


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [13]:
rfc = RandomForestClassifier()
param_grid = {'n_estimators': [5, 10, 50, 100],
              'criterion': ['gini','entropy'],
              'max_depth': np.arange(1, 50),
              'class_weight': ['balanced', None],
              'random_state': [21]}
gs = GridSearchCV(rfc, param_grid, scoring='accuracy', n_jobs=-1)
gs.fit(X_train, y_train)

GridSearchCV(estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['gini', 'entropy'],
                         'max_depth': array([ 1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17,
       18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
       35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49]),
                         'n_estimators': [5, 10, 50, 100],
                         'random_state': [21]},
             scoring='accuracy')

In [14]:
gs.best_params_

{'class_weight': 'balanced',
 'criterion': 'entropy',
 'max_depth': 24,
 'n_estimators': 100,
 'random_state': 21}

In [15]:
gs.best_score_

0.9042929918766351

In [17]:
results3 = pd.DataFrame(gs.cv_results_)
results3 = results3.sort_values('rank_test_score', ascending=False)
results3

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_class_weight,param_criterion,param_max_depth,param_n_estimators,param_random_state,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
196,0.023456,0.002470,0.005830,0.000187,balanced,entropy,1,5,21,"{'class_weight': 'balanced', 'criterion': 'ent...",0.262963,0.292593,0.225926,0.282528,0.289963,0.270794,0.024718,784
0,0.026667,0.002013,0.006630,0.001066,balanced,gini,1,5,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.262963,0.292593,0.285185,0.282528,0.293680,0.283390,0.011062,783
4,0.025066,0.002272,0.005824,0.000133,balanced,gini,2,5,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.311111,0.377778,0.377778,0.353160,0.312268,0.346419,0.029749,782
200,0.024603,0.001849,0.007032,0.001208,balanced,entropy,2,5,21,"{'class_weight': 'balanced', 'criterion': 'ent...",0.318519,0.366667,0.381481,0.353160,0.345725,0.353110,0.021165,781
588,0.022044,0.001703,0.006074,0.000240,,entropy,1,5,21,"{'class_weight': None, 'criterion': 'entropy',...",0.355556,0.366667,0.374074,0.345725,0.327138,0.353832,0.016467,780
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
515,0.473379,0.010166,0.042480,0.007533,,gini,31,100,21,"{'class_weight': None, 'criterion': 'gini', 'm...",0.918519,0.911111,0.900000,0.910781,0.877323,0.903547,0.014380,4
311,0.733262,0.015503,0.035407,0.001862,balanced,entropy,29,100,21,"{'class_weight': 'balanced', 'criterion': 'ent...",0.922222,0.900000,0.907407,0.907063,0.881041,0.903547,0.013380,4
118,0.232871,0.006826,0.020267,0.000984,balanced,gini,30,50,21,"{'class_weight': 'balanced', 'criterion': 'gin...",0.922222,0.903704,0.900000,0.907063,0.884758,0.903549,0.012056,3
502,0.229991,0.005221,0.018700,0.000681,,gini,28,50,21,"{'class_weight': None, 'criterion': 'gini', 'm...",0.922222,0.900000,0.907407,0.903346,0.888476,0.904290,0.010961,2


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [18]:
grid = list(ParameterGrid(param_grid))

In [19]:
data = []

for params in tqdm(grid):
    row = params
    estimator = RandomForestClassifier(**params)
    scores = cross_val_score(estimator, X_train, y_train, cv=5, n_jobs=-1)
    row['mean_accuracy'] = np.mean(scores)
    row['std_accuracy'] = np.std(scores)
    data.append(row)

  0%|          | 0/784 [00:00<?, ?it/s]

In [20]:
results4 = pd.DataFrame(data)
results4 = results4.sort_values('mean_accuracy', ascending=False)
results4

Unnamed: 0,class_weight,criterion,max_depth,n_estimators,random_state,mean_accuracy,std_accuracy
291,balanced,entropy,24,100,21,0.904293,0.012361
502,,gini,28,50,21,0.904290,0.010961
118,balanced,gini,30,50,21,0.903549,0.012056
515,,gini,31,100,21,0.903547,0.014380
311,balanced,entropy,29,100,21,0.903547,0.013380
...,...,...,...,...,...,...,...
588,,entropy,1,5,21,0.353832,0.016467
200,balanced,entropy,2,5,21,0.353110,0.021165
4,balanced,gini,2,5,21,0.346419,0.029749
0,balanced,gini,1,5,21,0.283390,0.011062


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [22]:
rfc = RandomForestClassifier(n_estimators=100, max_depth=24, criterion='entropy', class_weight='balanced', random_state=21)
rfc.fit(X_train, y_train)
y_pred = rfc.predict(X_test)

In [24]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9260355029585798