# Tuning with Grid Search - Random Forest (150 samples)

## 01- Import and Prepare Data

In [12]:
# Load libraries
import numpy as np
from numpy import arange
import pickle
import pandas as pd
#
import matplotlib.pyplot as plt

#
import pandas as pd
from pandas import read_csv

from sklearn.metrics import confusion_matrix,  classification_report, f1_score, accuracy_score

from sklearn.model_selection import train_test_split, KFold,StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler

import tensorflow
from sklearn.ensemble import RandomForestClassifier

import warnings
import seaborn as sbs
import sys

warnings.filterwarnings('ignore')

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tensorflow.random.set_seed(RANDOM_SEED)
np.set_printoptions(threshold=sys.maxsize)

with open('data/X2.pkl', 'rb') as f:
    X = pickle.load(f)

with open('data/y2.pkl', 'rb') as f:
    y = pickle.load(f)
    
labels = 5
samples = 150
X = X[:labels*samples]
y = y[:labels*samples]

classes = np.unique(y).tolist()
for i in range(len(classes)):
    y = np.where(y==classes[i], i, y)
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_SEED)

y = np.array(y)

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X.shape)

(750, 32)


## 02 - Prepare hyperparameters

Prendiamo in considerazione solamente gli iperparametri **'n_estimators'** ovvero il numero di alberi utilizzati nell'eseguire le predizioni e **'max_features'** ovvero il numero massimo di feature utilizzate per effettuare le predizioni.

In [13]:
# Load the model parameters to be test 
model_params = {
    'n_estimators': np.arange(100, 300, 50),
    'max_features':  np.arange(1, 33, 4),
    'criterion': ['gini', 'entropy'], 
    'max_depth': [None, 2, 4],
    'min_samples_split': np.arange(2, 6, 2),
    'min_samples_leaf': np.arange(1, 4, 1),
}
num_folds = 5

## 03 - Grid Search SKlearn

In [14]:
rf_model = RandomForestClassifier(random_state=RANDOM_SEED)

clf = GridSearchCV(rf_model, model_params, cv=num_folds)
model = clf.fit(X_train, y_train)

In [15]:
print(f'Best params: {model.best_params_} with a score of {model.best_score_}')

Best params: {'criterion': 'entropy', 'max_depth': None, 'max_features': 9, 'min_samples_leaf': 1, 'min_samples_split': 4, 'n_estimators': 250} with a score of 0.6016666666666667


In [16]:
pred = model.predict(X_test)

print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.57      0.61      0.59        28
           1       0.57      0.68      0.62        25
           2       0.47      0.70      0.56        20
           3       0.67      0.65      0.66        31
           4       0.93      0.61      0.74        46

    accuracy                           0.64       150
   macro avg       0.64      0.65      0.63       150
weighted avg       0.69      0.64      0.65       150



## 04 - Grid-Search reiplemented

In [5]:
def gridSearch(params, X_train, X_test, y_train, y_test):
    names = list(params.keys())
    values = list(params.values())
    std_dev = 0
    best_score, best_values = 0, [0 for item in values]
    global_results = []
    for estimators in values[0]:
        for features in values[1]:
            for crit in values[2]:
                for depth in values[3]:
                    for min_split in values[4]:
                        for min_leaf in values[5]:
                            model = RandomForestClassifier(random_state=RANDOM_SEED, n_estimators=estimators, 
                                                           max_features=features, criterion=crit, max_depth=depth,
                                                           min_samples_split=min_split, min_samples_leaf=min_leaf
                                                          )
                            # Cross Validation
                            kf = StratifiedKFold(n_splits=num_folds, shuffle=False)
                            cv_results = np.array([])
                            for train_idx, test_idx, in kf.split(X_train, y_train):
                                X_cross_train, y_cross_train = X_train[train_idx], y_train[train_idx]
                                X_cross_train = scaler.fit_transform(X_cross_train)
                                X_cross_test, y_cross_test = X_train[test_idx], y_train[test_idx]
                                X_cross_test = scaler.transform(X_cross_test)
                                model.fit(X_cross_train, y_cross_train)  
                                y_pred = model.predict(X_cross_test)
                                acc = accuracy_score(y_cross_test, y_pred)
                                cv_results = np.append(cv_results, [acc])
                                global_results.append([estimators, features, cv_results.mean()])
                            # Comparing score
                            if cv_results.mean() > best_score:
                                best_score = cv_results.mean()
                                best_values = [estimators, features, crit, depth, min_split, min_leaf]
                                std_dev = cv_results.std()
                                
    # Print all the combinations results
    df = pd.DataFrame(global_results)
    df.columns = ['N_ESTIMATORS', 'MAX_FEATURES', 'SCORE']
    print(df)
    return best_score, best_values, std_dev

best_score, best_combo, std_dev = gridSearch(model_params, X_train, X_test, y_train, y_test)

      N_ESTIMATORS  MAX_FEATURES     SCORE
0              100             1  0.525000
1              100             1  0.529167
2              100             1  0.563889
3              100             1  0.566667
4              100             1  0.553333
...            ...           ...       ...
5755           250            29  0.533333
5756           250            29  0.566667
5757           250            29  0.594444
5758           250            29  0.572917
5759           250            29  0.546667

[5760 rows x 3 columns]


In [11]:

print(f'Best score = {best_score}, Standard Deviation:{std_dev} \nThe best combo is:')
index = 0
for param in model_params.keys():
    print(f' - {param}: {best_combo[index]}')
    index += 1


Best score = 0.6016666666666667, Standard Deviation:0.06999999999999998 
The best combo is:
 - n_estimators: 250
 - max_features: 9
 - criterion: entropy
 - max_depth: None
 - min_samples_split: 4
 - min_samples_leaf: 1
