# Tuning with Grid Search - Random Forest (150 samples)

## 01- Import and Prepare Data

In [31]:
# Load libraries
import numpy as np
from numpy import arange
import pickle
import pandas as pd
#
import matplotlib.pyplot as plt

#
import pandas as pd
from pandas import read_csv

from sklearn.metrics import confusion_matrix,  classification_report, f1_score, accuracy_score

from sklearn.model_selection import train_test_split, KFold,StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler

import tensorflow
from sklearn.ensemble import RandomForestClassifier

import warnings
import seaborn as sbs
import sys

warnings.filterwarnings('ignore')

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tensorflow.random.set_seed(RANDOM_SEED)
np.set_printoptions(threshold=sys.maxsize)

with open('data/X2.pkl', 'rb') as f:
    X = pickle.load(f)

with open('data/y2.pkl', 'rb') as f:
    y = pickle.load(f)
    
labels = 5
samples = 150
X = X[:labels*samples]
y = y[:labels*samples]

classes = np.unique(y).tolist()
for i in range(len(classes)):
    y = np.where(y==classes[i], i, y)
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_SEED)

y = np.array(y)

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X.shape)

(750, 32)


## 02 - Prepare hyperparameters

Prendiamo in considerazione solamente gli iperparametri **'n_estimators'** ovvero il numero di alberi utilizzati nell'eseguire le predizioni e **'max_features'** ovvero il numero massimo di feature utilizzate per effettuare le predizioni.

In [15]:
# Load the model parameters to be test 
model_params = {
    'n_estimators': np.arange(50, 300, 50),
    'max_features':  np.arange(1, 33, 2),
    #'min_samples_split': [0.1, 0.3, 0.6],
    #'bootstrap': [True, False]
}
num_folds = 5

## 03 - Grid Search SKlearn

In [16]:
rf_model = RandomForestClassifier(random_state=RANDOM_SEED)

clf = GridSearchCV(rf_model, model_params, cv=num_folds)
model = clf.fit(X_train, y_train)

In [17]:
print(f'Best params: {model.best_params_} with a score of {model.best_score_}')

Best params: {'max_features': 7, 'n_estimators': 200} with a score of 0.6066666666666667


In [18]:
pred = model.predict(X_test)

print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.57      0.55      0.56        31
           1       0.53      0.62      0.57        26
           2       0.50      0.71      0.59        21
           3       0.67      0.71      0.69        28
           4       0.90      0.61      0.73        44

    accuracy                           0.63       150
   macro avg       0.63      0.64      0.63       150
weighted avg       0.67      0.63      0.64       150



## 04 - Grid-Search reiplemented

In [33]:
def gridSearch(params, X_train, X_test, y_train, y_test):
    names = list(params.keys())
    values = list(params.values())
    best_score, best_values = 0, [0 for item in values]
    global_results = []
    for estimators in values[0]:
        for features in values[1]:
            model = RandomForestClassifier(random_state=RANDOM_SEED, n_estimators=estimators, max_features=features)
            # Cross Validation
            kf = StratifiedKFold(n_splits=num_folds, random_state=RANDOM_SEED, shuffle=False)
            cv_results = np.array([])
            for train_idx, test_idx, in kf.split(X_train, y_train):
                X_cross_train, y_cross_train = X_train[train_idx], y_train[train_idx]
                X_cross_train = scaler.fit_transform(X_cross_train)
                X_cross_test, y_cross_test = X_train[test_idx], y_train[test_idx]
                X_cross_test = scaler.transform(X_cross_test)
                model.fit(X_cross_train, y_cross_train)  
                y_pred = model.predict(X_cross_test)
                acc = accuracy_score(y_cross_test, y_pred)
                cv_results = np.append(cv_results, [acc])
                global_results.append([estimators, features, cv_results.mean()])
            # Comparing score
            if cv_results.mean() > best_score:
                best_score = cv_results.mean()
                best_values = [estimators, features]
    # Print all the combinations results
    df = pd.DataFrame(global_results)
    df.columns = ['N_ESTIMATORS', 'MAX_FEATURES', 'SCORE']
    print(df)
    return best_score, best_values

best_score, best_combo = gridSearch(model_params, X_train, X_test, y_train, y_test)
print(f'Best score = {best_score}, the best combo is: {best_combo[0]} estimators and {best_combo[1]} features')


     N_ESTIMATORS  MAX_FEATURES     SCORE
0              50             1  0.558333
1              50             1  0.575000
2              50             1  0.566667
3              50             1  0.566667
4              50             1  0.550000
..            ...           ...       ...
395           250            31  0.591667
396           250            31  0.616667
397           250            31  0.613889
398           250            31  0.610417
399           250            31  0.585000

[400 rows x 3 columns]
Best score = 0.6066666666666667, the best combo is: 200 estimators and 7 features
