# Tuning with Grid Search - Random Forest (150 samples)

## 01- Import and Prepare Data

In [1]:
# Load libraries
import numpy as np
from numpy import arange
import pickle
from pandas import read_csv
#
import matplotlib.pyplot as plt

#
import pandas as pd
from pandas import read_csv

from sklearn.metrics import confusion_matrix,  classification_report, f1_score
from sklearn.model_selection import train_test_split, KFold,StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler

import tensorflow
from sklearn.ensemble import RandomForestClassifier

import warnings
import seaborn as sbs
import sys

warnings.filterwarnings('ignore')

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tensorflow.random.set_seed(RANDOM_SEED)
np.set_printoptions(threshold=sys.maxsize)

with open('data/X2.pkl', 'rb') as f:
    X = pickle.load(f)

with open('data/y2.pkl', 'rb') as f:
    y = pickle.load(f)
    
labels = 5
samples = 150
X = X[:labels*samples]
y = y[:labels*samples]

classes = np.unique(y).tolist()
for i in range(len(classes)):
    y = np.where(y==classes[i], i, y)
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_SEED)

y = np.array(y)

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X.shape)

(750, 32)


## 02 - Prepare hyperparameters

Prendiamo in considerazione solamente gli iperparametri **'n_estimators'** ovvero il numero di alberi utilizzati nell'eseguire le predizioni e **'max_features'** ovvero il numero massimo di feature utilizzate per effettuare le predizioni.

In [9]:
# Load the model parameters to be test 
model_params = {
    'n_estimators': np.arange(10, 210, 50),
    'max_features':  np.arange(1, 33, 2),
    #'min_samples_split': [0.1, 0.3, 0.6],
    #'bootstrap': [True, False]
}

## 03 - Grid Search SKlearn

In [None]:
rf_model = RandomForestClassifier(random_state=RANDOM_SEED)

clf = GridSearchCV(rf_model, model_params, cv=5)
model = clf.fit(X_train, y_train)

In [11]:
print(f'Best params: {model.best_params_} with a score of {model.best_score_}')

Best params: {'max_features': 7, 'n_estimators': 110} with a score of 0.595


In [12]:
pred = model.predict(X_test)

print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.57      0.53      0.55        32
           1       0.53      0.62      0.57        26
           2       0.53      0.70      0.60        23
           3       0.60      0.72      0.65        25
           4       0.90      0.61      0.73        44

    accuracy                           0.63       150
   macro avg       0.63      0.64      0.62       150
weighted avg       0.66      0.63      0.63       150

