## Single tree

In [None]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
from sklearn import metrics    # contains all the effectiveness indexes
import pandas as pd    # for the dataset
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

#### dataset

In [None]:
# Column names for the dataset: the class label, 'spec_num', and 14 feature columns.
labels = ['class', 'spec_num', 'eccentr', 'asp_ratio', 'elong', 'solidity', 'stoch_conv', 'iso_factor', 'max_ind_depth', 'lobedness', 'av_intensity', 'av_contr', 'smooth', 'third_mom', 'unif', 'entropy']
# Import the data; the CSV is read without a header row, so the names are supplied here.
df = pd.read_csv(r'./leaf/leaf.csv', header=None, names=labels)
# Shuffle the rows. The fixed seed keeps the row order (and therefore any
# downstream cross-validation split) identical across runs, and drop=True
# discards the old index instead of re-inserting it as an extra column.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)
# Separate the target from the predictors: y is the 'class' column, X holds
# the 14 columns from 'eccentr' to 'entropy' ('spec_num' is left out).
X = df.iloc[:, 2:16]
y = df.iloc[:, 0]

#### finding best hyper-parameters

In [None]:
# List the name of every effectiveness index (scorer) available in sklearn.
#
# Caveats for this problem:
#   - plain 'roc_auc' does not work in CV because this is a multiclass
#     classification (an 'ovo' or 'ovr' variant has to be specified);
#   - no roc_auc variant works in LOOCV, because each testing set contains
#     only one observation.
scorer_names = metrics.get_scorer_names()
print(scorer_names)

In [None]:
# Grid search with k-fold CV to compare the hyper-parameters on several metrics.
#
# If refit='<scorer name>' were passed, the tree would finally be refitted on
# the whole dataset with the parameters ranked best by that scorer; here
# refit=False, so only the cross-validation results are collected.

k = 5

# parameter grid to search over
grid_param = {'criterion': ['gini', 'entropy'], 'min_samples_split': np.arange(2, 15)}

# Effectiveness indexes.
# The predefined scorers are fetched by name instead of being rebuilt with
# make_scorer(..., needs_proba=True): that keyword was deprecated in
# scikit-learn 1.4 and removed in 1.6, whereas these names are accepted by
# old and new releases alike and describe the same metrics
# (weighted recall, weighted one-vs-one / one-vs-rest ROC AUC on predict_proba).
b_accuracy = metrics.get_scorer('balanced_accuracy')
recall = metrics.get_scorer('recall_weighted')
auc_ovo = metrics.get_scorer('roc_auc_ovo_weighted')
auc_ovr = metrics.get_scorer('roc_auc_ovr_weighted')
scorers = {'balanced_accuracy': b_accuracy, 'recall': recall, 'roc_auc_ovo': auc_ovo, 'roc_auc_ovr': auc_ovr}

# random_state is fixed because the tree permutes the features at every split,
# so equally good splits could otherwise be resolved differently on each run.
clf_cv = GridSearchCV(tree.DecisionTreeClassifier(random_state=0), grid_param, cv=k, scoring=scorers, refit=False)
clf_cv.fit(X, y)

In [None]:
# Show, for every parameter combination tried by the k-fold grid search,
# the mean test value and the rank of each effectiveness index.
results_cv = pd.DataFrame(clf_cv.cv_results_)

summary_columns = [
    'params',
    'mean_test_balanced_accuracy',
    'rank_test_balanced_accuracy',
    'mean_test_recall',
    'rank_test_recall',
    'mean_test_roc_auc_ovo',
    'rank_test_roc_auc_ovo',
    'mean_test_roc_auc_ovr',
    'rank_test_roc_auc_ovr',
]
display(results_cv.loc[:, summary_columns])

In [None]:
# Grid search again, this time with leave-one-out cross-validation, scored on
# plain accuracy. refit is left at its default, so best_params_ and
# best_score_ are available afterwards.
#
# random_state is fixed because the tree permutes the features at every split,
# so equally good splits could otherwise be resolved differently on each run
# and the selected parameters would not be reproducible.
clf_loocv = GridSearchCV(
    tree.DecisionTreeClassifier(random_state=0),
    grid_param,
    cv=LeaveOneOut(),
    scoring='accuracy',
)
clf_loocv.fit(X, y)

In [None]:
# Report the winning parameter combination of the LOOCV search and its accuracy.
print(f"best parameters: {clf_loocv.best_params_!s}")
print(f"accuracy loocv: {clf_loocv.best_score_!s}")