In [1]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn import metrics    # contains all the effectivness idexes 
import pandas as pd    # for the dataset
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

#### dataset

In [2]:
#setting up labels for dataset
labels = ['class', 'spec_num', 'eccentr', 'asp_ratio', 'elong', 'solidity', 'stoch_conv', 'iso_factor', 'max_ind_depth', 'lobedness', 'av_intensity', 'av_contr', 'smooth', 'third_mom', 'unif', 'entropy']
#importing data
df = pd.read_csv(r'./leaf/leaf.csv', header = None, names = labels)
# shuffling the dataframe
df = df.sample(frac=1).reset_index()
df = df.iloc[:, 1:17]   # needed to eliminate the old indexes column
#separating y from x
X = df.iloc[:, 2:16]
y = df.iloc[:, 0]

#### finding best hyper-parameters by cross validation

In [3]:
# using grid search with k-fold CV to find the best hyperparameters and fitting the tree

# se refit = 'blabla', alla fine viene rifittato l'albero su tutto il dataset scegliendo i parametri
# migliori in base all'indice di accuracy 'blabla'

k = 5

# griglia dei parametri su cui fare la ricerca
grid_param = {"n_estimators": (100, 200, 500, 700), 'criterion': ('gini', 'entropy'), 'max_features': ('sqrt', 5)}

# 'preparazione' indici di effectivness
# si usa la funzione make_scorer per costruire le metriche che ci servono
b_accuracy = metrics.make_scorer(metrics.balanced_accuracy_score)
recall = metrics.make_scorer(metrics.recall_score, average='weighted')
auc_ovo = metrics.make_scorer(metrics.roc_auc_score, multi_class='ovo', needs_proba=True, average='weighted')
auc_ovr = metrics.make_scorer(metrics.roc_auc_score, multi_class='ovr', needs_proba=True, average='weighted')
scorers = {'balanced_accuracy': b_accuracy, 'recall': recall, 'roc_auc_ovo': auc_ovo, 'roc_auc_ovr': auc_ovr}

clf_cv = GridSearchCV(RandomForestClassifier(), grid_param, cv=k, scoring=scorers, refit=False)
clf_cv.fit(X, y)

In [4]:
# showing the mean values of effectivness indexes

results_cv = pd.DataFrame(clf_cv.cv_results_)

display(results_cv.loc[:, ('params', 'mean_test_balanced_accuracy', 'rank_test_balanced_accuracy', 'mean_test_recall', 'rank_test_recall', 'mean_test_roc_auc_ovo', 'rank_test_roc_auc_ovo', 'mean_test_roc_auc_ovr', 'rank_test_roc_auc_ovr')])

Unnamed: 0,params,mean_test_balanced_accuracy,rank_test_balanced_accuracy,mean_test_recall,rank_test_recall,mean_test_roc_auc_ovo,rank_test_roc_auc_ovo,mean_test_roc_auc_ovr,rank_test_roc_auc_ovr
0,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.772222,9,0.773529,8,0.984255,11,0.984565,11
1,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.8,1,0.802941,1,0.986355,7,0.986438,5
2,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.772222,9,0.770588,10,0.986,9,0.985857,9
3,"{'criterion': 'gini', 'max_features': 'sqrt', ...",0.786667,3,0.785294,3,0.988354,2,0.988282,2
4,"{'criterion': 'gini', 'max_features': 5, 'n_es...",0.772222,9,0.770588,10,0.983278,12,0.983406,12
5,"{'criterion': 'gini', 'max_features': 5, 'n_es...",0.785556,4,0.782353,5,0.986558,6,0.986367,7
6,"{'criterion': 'gini', 'max_features': 5, 'n_es...",0.783333,7,0.782353,5,0.988371,1,0.988369,1
7,"{'criterion': 'gini', 'max_features': 5, 'n_es...",0.786667,2,0.785294,2,0.986101,8,0.986013,8
8,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.773333,8,0.770588,9,0.980485,15,0.980557,15
9,"{'criterion': 'entropy', 'max_features': 'sqrt...",0.761111,15,0.758824,15,0.983152,13,0.983226,13


#### finding best hyper-parameters by leave one out cross validation