# Tuning with Grid Search - Random Forest (150 samples)

## 01- Import and Prepare Data

In [7]:
# Load libraries
import numpy as np
import pickle
#
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.metrics import confusion_matrix,  classification_report, f1_score, accuracy_score

from sklearn.model_selection import train_test_split, KFold,StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, RobustScaler

import tensorflow
from sklearn.ensemble import RandomForestClassifier

import warnings
import seaborn as sbs
import sys

warnings.filterwarnings('ignore')

RANDOM_SEED = 42
np.random.seed(RANDOM_SEED)
tensorflow.random.set_seed(RANDOM_SEED)
np.set_printoptions(threshold=sys.maxsize)

with open('data/X2.pkl', 'rb') as f:
    X = pickle.load(f)

with open('data/y2.pkl', 'rb') as f:
    y = pickle.load(f)
    
labels = 2
samples = 150
X = X[:labels*samples]
y = y[:labels*samples]

classes = np.unique(y).tolist()
for i in range(len(classes)):
    y = np.where(y==classes[i], i, y)
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=RANDOM_SEED)

y = np.array(y)

scaler = RobustScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
print(X.shape)

(300, 32)


## 02 - Prepare hyperparameters

In [8]:
# Load the model parameters to be test 
model_params = {
    'n_estimators': np.arange(50, 300, 50),
    'max_features':  np.arange(1, 33, 4),
    'criterion': ['gini', 'entropy'], 
    'max_depth': [None, 2, 4],
    'min_samples_split': np.arange(2, 6, 2),
    'min_samples_leaf': np.arange(1, 4, 1),
}
num_folds = 5

## 03 - Grid Search SKlearn

In [3]:
rf_model = RandomForestClassifier(random_state=RANDOM_SEED)

clf = GridSearchCV(rf_model, model_params, cv=num_folds)
model = clf.fit(X_train, y_train)

In [4]:
print(f'Best params: {model.best_params_} with a score of {model.best_score_}')

Best params: {'criterion': 'gini', 'max_depth': 4, 'max_features': 17, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200} with a score of 0.8458333333333333


In [5]:
pred = model.predict(X_test)

print(classification_report(pred, y_test))

              precision    recall  f1-score   support

           0       0.83      0.89      0.86        28
           1       0.90      0.84      0.87        32

    accuracy                           0.87        60
   macro avg       0.87      0.87      0.87        60
weighted avg       0.87      0.87      0.87        60



## 04 - Grid-Search reiplemented

In [9]:
def gridSearch(params, X_train, X_test, y_train, y_test):
    names = list(params.keys())
    values = list(params.values())
    std_dev = 0
    best_score, best_values = 0, [0 for item in values]
    global_results = []
    for estimators in values[0]:
        for features in values[1]:
            for crit in values[2]:
                for depth in values[3]:
                    for min_split in values[4]:
                        for min_leaf in values[5]:
                            model = RandomForestClassifier(random_state=RANDOM_SEED, n_estimators=estimators, 
                                                           max_features=features, criterion=crit, max_depth=depth,
                                                           min_samples_split=min_split, min_samples_leaf=min_leaf
                                                          )
                            # Cross Validation
                            kf = StratifiedKFold(n_splits=num_folds, shuffle=False)
                            cv_results = np.array([])
                            for train_idx, test_idx, in kf.split(X_train, y_train):
                                X_cross_train, y_cross_train = X_train[train_idx], y_train[train_idx]
                                X_cross_train = scaler.fit_transform(X_cross_train)
                                X_cross_test, y_cross_test = X_train[test_idx], y_train[test_idx]
                                X_cross_test = scaler.transform(X_cross_test)
                                model.fit(X_cross_train, y_cross_train)  
                                y_pred = model.predict(X_cross_test)
                                acc = accuracy_score(y_cross_test, y_pred)
                                cv_results = np.append(cv_results, [acc])
                                global_results.append([estimators, features, cv_results.mean()])
                            # Comparing score
                            if cv_results.mean() > best_score:
                                best_score = cv_results.mean()
                                best_values = [estimators, features, crit, depth, min_split, min_leaf]
                                std_dev = cv_results.std()
                                
    # Print all the combinations results
    df = pd.DataFrame(global_results)
    df.columns = ['N_ESTIMATORS', 'MAX_FEATURES', 'SCORE']
    print(df)
    return best_score, best_values, std_dev

best_score, best_combo, std_dev = gridSearch(model_params, X_train, X_test, y_train, y_test)

      N_ESTIMATORS  MAX_FEATURES     SCORE
0               50             1  0.750000
1               50             1  0.739583
2               50             1  0.777778
3               50             1  0.776042
4               50             1  0.775000
...            ...           ...       ...
7195           250            29  0.750000
7196           250            29  0.791667
7197           250            29  0.819444
7198           250            29  0.828125
7199           250            29  0.837500

[7200 rows x 3 columns]


In [10]:

print(f'Best score = {best_score}, Standard Deviation:{std_dev} \nThe best combo is:')
index = 0
for param in model_params.keys():
    print(f' - {param}: {best_combo[index]}')
    index += 1


Best score = 0.8458333333333334, Standard Deviation:0.05368374469468801 
The best combo is:
 - n_estimators: 200
 - max_features: 17
 - criterion: gini
 - max_depth: 4
 - min_samples_split: 2
 - min_samples_leaf: 1
