# Day 09. Exercise 01
# Gridsearch

## 0. Imports

In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score

from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score

from tqdm.autonotebook import tqdm

## 1. Preprocessing

1. Read the file [`day-of-week-not-scaled.csv`](https://drive.google.com/file/d/1AlGvsJDSzPT_70caausx8bFuupIEZkfh/view?usp=sharing). It is similar to the one from the previous exercise, but this time we did not scale continuous features (we are not going to use logreg anymore). Don't forget to enrich the table with the 'dayofweek' column from the previous day's .csv-file.
2. Using `train_test_split` with parameters `test_size=0.2`, `random_state=21` get `X_train`, `y_train`, `X_test`, `y_test`. Use the additional parameter `stratify`.

In [2]:
df = pd.read_csv('../data/day-of-week-not-scaled.csv')
enrichment = pd.read_csv('../data/dayofweek.csv')
df['dayofweek'] = enrichment['dayofweek']
df.shape

(1686, 44)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('dayofweek', axis=1), df['dayofweek'], test_size=0.2, random_state=21, stratify=df['dayofweek'])

## 2. SVM gridsearch

1. Using `GridSearchCV` try different parameters of kernel (`linear`, `rbf`, `sigmoid`), C (`0.01`, `0.1`, `1`, `1.5`, `5`, `10`), gamma (`scale`, `auto`), class_weight (`balanced`, `None`) use `random_state=21` and `probability=True` and get the best combination of them in terms of accuracy.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`. Check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [4]:
from sklearn.model_selection import GridSearchCV

def grid_search(model, param_grid, X_train, y_train, X_test, y_test, **kwargs) -> object:
    if kwargs:
        model.set_params(**kwargs)

    grid_search = GridSearchCV(model, param_grid, cv=5, scoring='accuracy', return_train_score=True, n_jobs=-1)
    grid_search.fit(X_train, y_train)

    print("Best parameters:")
    for param, value in grid_search.best_params_.items():
        print(f"{param}: {value}")

    print(f"\nBest cross-validation accuracy: {grid_search.best_score_:.4f}")

    best_model = grid_search.best_estimator_
    test_score = best_model.score(X_test, y_test)
    print(f"Test set accuracy: {test_score:.4f}")

    return grid_search

In [33]:
def find_best_model(grid_result, important_columns) -> object:
    results_df = pd.DataFrame(grid_result.cv_results_)
    if 'params' not in important_columns:
        display_columns = important_columns + ['params']
    else:
        display_columns = important_columns.copy()

    sorted_results = results_df[display_columns].sort_values('rank_test_score', ascending=True)
    top_score = sorted_results.iloc[0]['mean_test_score']
    second_score = sorted_results.iloc[1]['mean_test_score']
    score_diff = top_score - second_score
    print(f"\nTop model params: {sorted_results.iloc[0]['params']}")
    print(f"\nSecond top model params: {sorted_results.iloc[1]['params']}")
    print(f"\nDifference between 2 top models: {score_diff:.4f}")
    return sorted_results

In [None]:
param_grid_SVC = {
    'kernel': ['linear', 'rbf', 'sigmoid'],
    'C': [0.01, 0.1, 1, 1.5, 5, 10],
    'gamma': ['scale', 'auto'],
    'class_weight': ['balanced', None],
    'random_state': [21],
    'probability': [True]
}
grid_result_SVC = grid_search(SVC(), param_grid_SVC, X_train, y_train, X_test, y_test, random_state=21, probability=True)

Best parameters:
C: 10
class_weight: None
gamma: auto
kernel: rbf

Best cross-validation accuracy: 0.8761
Test set accuracy: 0.8876


In [34]:
important_columns_SVC = [
    'rank_test_score',
    'mean_test_score', 
    'std_test_score',
    'param_kernel',
    'param_C',
    'param_gamma',
    'param_class_weight'
    ]

results_SVC = find_best_model(grid_result_SVC, important_columns_SVC)


Top model params: {'C': 10, 'class_weight': None, 'gamma': 'auto', 'kernel': 'rbf'}

Second top model params: {'C': 10, 'class_weight': 'balanced', 'gamma': 'auto', 'kernel': 'rbf'}

Difference between 2 top models: 0.0126


In [35]:
results_SVC.head()

Unnamed: 0,rank_test_score,mean_test_score,std_test_score,param_kernel,param_C,param_gamma,param_class_weight,params
70,1,0.876109,0.018419,rbf,10,auto,,"{'C': 10, 'class_weight': None, 'gamma': 'auto..."
64,2,0.8635,0.01087,rbf,10,auto,balanced,"{'C': 10, 'class_weight': 'balanced', 'gamma':..."
58,3,0.816018,0.008116,rbf,5,auto,,"{'C': 5, 'class_weight': None, 'gamma': 'auto'..."
52,4,0.808608,0.021007,rbf,5,auto,balanced,"{'C': 5, 'class_weight': 'balanced', 'gamma': ..."
63,5,0.721052,0.034438,linear,10,auto,balanced,"{'C': 10, 'class_weight': 'balanced', 'gamma':..."


## 3. Decision tree

1. Using `GridSearchCV` try different parameters of `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use `random_state=21`.
2. Create a dataframe from the results of the gridsearch and sort it ascendingly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [9]:
param_grid_tree = {
    'max_depth': range(1, 50),
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', None]
}

In [10]:
grid_results_tree = grid_search(DecisionTreeClassifier(), param_grid_tree, X_train, y_train, X_test, y_test, random_state=21)

Best parameters:
class_weight: balanced
criterion: gini
max_depth: 21

Best cross-validation accuracy: 0.8739
Test set accuracy: 0.8846


In [37]:
important_columns_tree = [
    'rank_test_score',
    'mean_test_score', 
    'std_test_score',
    'param_max_depth',
    'param_criterion',
    'param_class_weight'
]


results_tree = find_best_model(grid_results_tree, important_columns_tree)


Top model params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 21}

Second top model params: {'class_weight': 'balanced', 'criterion': 'gini', 'max_depth': 25}

Difference between 2 top models: 0.0000


In [38]:
results_tree.head()

Unnamed: 0,rank_test_score,mean_test_score,std_test_score,param_max_depth,param_criterion,param_class_weight,params
20,1,0.873865,0.025066,21,gini,balanced,"{'class_weight': 'balanced', 'criterion': 'gin..."
24,2,0.873854,0.025018,25,gini,balanced,"{'class_weight': 'balanced', 'criterion': 'gin..."
21,3,0.872378,0.025263,22,gini,balanced,"{'class_weight': 'balanced', 'criterion': 'gin..."
30,4,0.872372,0.025179,31,gini,balanced,"{'class_weight': 'balanced', 'criterion': 'gin..."
28,4,0.872372,0.025179,29,gini,balanced,"{'class_weight': 'balanced', 'criterion': 'gin..."


## 4. Random forest

1. Using `GridSearchCV` try different parameters of `n_estimators` (`5`, `10`, `50`, `100`), `max_depth` (from `1` to `49`), `class_weight` (`balanced`, `None`) and `criterion` (`entropy` and `gini`) and get the best combination of them in terms of accuracy. Use random_state=21.
2. Create a dataframe from the results of the gridsearch and sort it ascendengly by the `rank_test_score`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [13]:
param_grid_forest = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': range(1, 50),
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', None]
}

In [14]:
grid_results_forest = grid_search(RandomForestClassifier(), param_grid_forest, X_train, y_train, X_test, y_test, random_state=21)

Best parameters:
class_weight: balanced
criterion: entropy
max_depth: 24
n_estimators: 100

Best cross-validation accuracy: 0.9043
Test set accuracy: 0.9260


In [39]:
important_columns_forest = [
    'rank_test_score',
    'mean_test_score', 
    'std_test_score',
    'param_n_estimators',
    'param_max_depth',
    'param_criterion',
    'param_class_weight'
]

results_forest = find_best_model(grid_results_forest, important_columns_forest)


Top model params: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 24, 'n_estimators': 100}

Second top model params: {'class_weight': 'balanced', 'criterion': 'entropy', 'max_depth': 29, 'n_estimators': 100}

Difference between 2 top models: 0.0000


In [40]:
results_forest.head()

Unnamed: 0,rank_test_score,mean_test_score,std_test_score,param_n_estimators,param_max_depth,param_criterion,param_class_weight,params
291,1,0.904293,0.012361,100,24,entropy,balanced,"{'class_weight': 'balanced', 'criterion': 'ent..."
311,2,0.90429,0.012156,100,29,entropy,balanced,"{'class_weight': 'balanced', 'criterion': 'ent..."
502,2,0.90429,0.010961,50,28,gini,,"{'class_weight': None, 'criterion': 'gini', 'm..."
118,4,0.903549,0.012056,50,30,gini,balanced,"{'class_weight': 'balanced', 'criterion': 'gin..."
515,5,0.903547,0.01438,100,31,gini,,"{'class_weight': None, 'criterion': 'gini', 'm..."


## 5. Progress bar

Gridsearch can be a quite long process and you may find yourself wondering when it will end.
1. Create a manual gridsearch for the same parameters values of random forest iterating through the list of the possible values and calculating `cross_val_score` for each combination. Try to increase `n_jobs`. The value `cv` for `cross_val_score` is 5.
2. Track the progress using the library `tqdm.notebook`.
3. Create a dataframe from the results of the gridsearch with the columns corresponding to the names of the parameters and `mean_accuracy` and `std_accuracy`.
4. Sort it descendingly by the `mean_accuracy`, check if there is a huge difference between different combinations (sometimes a simpler model may give a comparable result).

In [17]:
param_grid_forest = {
    'n_estimators': [5, 10, 50, 100],
    'max_depth': range(1, 50),
    'criterion': ['gini', 'entropy'],
    'class_weight': ['balanced', None]
}

In [18]:
from itertools import product
all_params = list(product(*param_grid_forest.values()))
param_names = list(param_grid_forest.keys())

In [19]:
tqdm.pandas()  
pbar = tqdm(all_params, desc="Grid Search Progress")

Grid Search Progress:   0%|          | 0/784 [00:00<?, ?it/s]

In [20]:
results = []
for params in pbar:
    param_dict = dict(zip(param_names, params))
    
    rf = RandomForestClassifier(**param_dict, random_state=21)
    
    scores = cross_val_score(rf, X_train, y_train, cv=5, n_jobs=-1)

    results.append({
        **param_dict,
        'mean_accuracy': np.mean(scores),
        'std_accuracy': np.std(scores)
    })

Grid Search Progress:   0%|          | 1/784 [00:00<01:58,  6.61it/s]

Grid Search Progress: 100%|██████████| 784/784 [04:33<00:00,  2.86it/s]


In [21]:
manual_grid_results = pd.DataFrame(results)
manual_grid_results

Unnamed: 0,n_estimators,max_depth,criterion,class_weight,mean_accuracy,std_accuracy
0,5,1,gini,balanced,0.283390,0.011062
1,5,1,gini,,0.364219,0.021651
2,5,1,entropy,balanced,0.270794,0.024718
3,5,1,entropy,,0.353832,0.016467
4,5,2,gini,balanced,0.346419,0.029749
...,...,...,...,...,...,...
779,100,48,entropy,,0.898362,0.014986
780,100,49,gini,balanced,0.894652,0.015726
781,100,49,gini,,0.902806,0.010460
782,100,49,entropy,balanced,0.898359,0.013872


In [22]:
manual_grid_results = manual_grid_results.sort_values('mean_accuracy', ascending=False)
manual_grid_results

Unnamed: 0,n_estimators,max_depth,criterion,class_weight,mean_accuracy,std_accuracy
682,100,24,entropy,balanced,0.904293,0.012361
702,100,29,entropy,balanced,0.904290,0.012156
501,50,28,gini,,0.904290,0.010961
508,50,30,gini,balanced,0.903549,0.012056
709,100,31,gini,,0.903547,0.014380
...,...,...,...,...,...,...
3,5,1,entropy,,0.353832,0.016467
6,5,2,entropy,balanced,0.353110,0.021165
4,5,2,gini,balanced,0.346419,0.029749
0,5,1,gini,balanced,0.283390,0.011062


In [23]:
top_acc = manual_grid_results['mean_accuracy'].iloc[0]
second_top_acc = manual_grid_results['mean_accuracy'].iloc[1]
top_acc - second_top_acc

2.7536830512486787e-06

In [24]:
n_est = 100
max_d = 49
top_acc = manual_grid_results['mean_accuracy'].max()
top_1_percent = manual_grid_results.nlargest(int(len(manual_grid_results)*0.01), 'mean_accuracy')


# ищем среди 1% топ моделей наиболее простую
simplest_top_models = top_1_percent.sort_values(
    ['n_estimators', 'max_depth'], 
    ascending=[True, True]
)

print(f"Simplest models among top 1% performers (accuracy ≥ {top_acc:.4f}):")
display(simplest_top_models.head(10))

good_models = manual_grid_results[
    (manual_grid_results['mean_accuracy'] >= top_acc - 0.01) &  
    (manual_grid_results['n_estimators'] <= n_est) &        
    (manual_grid_results['max_depth'] <= max_d) 
].sort_values('mean_accuracy', ascending=False)

print("\nMost accurate models within complexity limits:")
display(good_models.head(10))

Simplest models among top 1% performers (accuracy ≥ 0.9043):


Unnamed: 0,n_estimators,max_depth,criterion,class_weight,mean_accuracy,std_accuracy
501,50,28,gini,,0.90429,0.010961
508,50,30,gini,balanced,0.903549,0.012056
520,50,33,gini,balanced,0.902809,0.013628
682,100,24,entropy,balanced,0.904293,0.012361
686,100,25,entropy,balanced,0.902809,0.013639
702,100,29,entropy,balanced,0.90429,0.012156
709,100,31,gini,,0.903547,0.01438



Most accurate models within complexity limits:


Unnamed: 0,n_estimators,max_depth,criterion,class_weight,mean_accuracy,std_accuracy
682,100,24,entropy,balanced,0.904293,0.012361
501,50,28,gini,,0.90429,0.010961
702,100,29,entropy,balanced,0.90429,0.012156
508,50,30,gini,balanced,0.903549,0.012056
709,100,31,gini,,0.903547,0.01438
686,100,25,entropy,balanced,0.902809,0.013639
520,50,33,gini,balanced,0.902809,0.013628
781,100,49,gini,,0.902806,0.01046
769,100,46,gini,,0.902806,0.01046
753,100,42,gini,,0.902806,0.01046


## 6. Predictions

1. Choose the best model and use it to make predictions for the test dataset.
2. Calculate the final accuracy.

In [25]:
best_model = RandomForestClassifier(n_estimators=50, max_depth=28, criterion='gini', class_weight=None)

In [26]:
best_model.fit(X_train, y_train)

RandomForestClassifier(max_depth=28, n_estimators=50)

In [27]:
y_pred = best_model.predict(X_test)

In [28]:
final_acc = accuracy_score(y_test, y_pred)

In [29]:
final_acc

0.9289940828402367