In [None]:
from sklearn import tree
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
from sklearn.model_selection import LeaveOneOut
from sklearn import metrics    # contains all the effectivness idexes 
import pandas as pd    # for the dataset
from matplotlib import pyplot as plt
from statistics import mean
import numpy as np

#### getting the dataset

In [None]:
# Column names for the headerless CSV, in file order: the class label, the
# specimen number, then 14 shape / texture descriptors (16 names in total).
# NOTE(review): the previous list carried 17 names ('spec_num' followed by an
# extra 'spec_number'). Assuming leaf.csv has 16 columns -- which matches the
# positional slice 2:16 used for the features -- that extra name shifted every
# feature name by one and made pandas append an all-NaN 'entropy' column.
# Confirm the column count against the file.
labels = [
    'class', 'spec_num',
    'eccentr', 'asp_ratio', 'elong', 'solidity', 'stoch_conv', 'iso_factor',
    'max_ind_depth', 'lobedness',
    'av_intensity', 'av_contr', 'smooth', 'third_mom', 'unif', 'entropy',
]

df = pd.read_csv(r'./leaf/leaf.csv', header=None, names=labels)

# A column that is entirely NaN means more names were supplied than the file has columns.
assert not df.isna().all().any(), "column names do not line up with leaf.csv"

display(df)

In [None]:
# Split the frame by position: column 0 is the target, column 1 is left out,
# and columns 2-15 (14 columns) are the predictors.
# The throwaway `t = tree.DecisionTreeClassifier()` that used to open this cell
# was removed: `t` is rebound to the grid search before it is ever read.
X = df.iloc[:, 2:16]
y = df.iloc[:, 0]
display(X)

In [None]:
# initializing a first tree

# min_samples_split: minimum number of observations a node must hold before it may be split
# criterion: impurity index used to pick the best split
# min_impurity_decrease: a split is only performed if it lowers the impurity by at least this much
# random_state: fixed so that ties between equally good splits are broken the same way
#               on every run (otherwise the fitted tree can change between executions)

t_first = tree.DecisionTreeClassifier(
    min_samples_split=2,
    criterion='gini',
    min_impurity_decrease=0.,
    random_state=0,
)

In [None]:
# using grid search to find the best hyperparameters and fitting the tree

# Candidate values: both split criteria, and min_samples_split from 2 to 7.
# A plain list of ints is enough here and keeps best_params_ free of NumPy scalars.
grid_param = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': list(range(2, 8)),
}

# random_state is fixed on the base estimator so that repeated runs pick the
# same winner instead of flipping between near-identical candidates.
t = GridSearchCV(
    tree.DecisionTreeClassifier(random_state=0),
    grid_param,
    cv=5,
    scoring='accuracy',
)
t.fit(X, y)
print(t.best_params_)
print(t.best_score_)

#### first attempt to fit a tree

In [None]:
# fit the first tree on the complete dataset (no hold-out split at this stage)
t_first.fit(X=X, y=y)

In [None]:
# draw the fitted tree on the current axes, then write the figure to disk;
# the high dpi keeps the node text legible when zooming into the saved image
tree_axes = plt.gca()
tree.plot_tree(t_first, ax=tree_axes)
plt.savefig("tree", dpi=1000)

In [None]:
# score of the tree on the very data it was fitted on
# (expected to be the maximum, since min_samples_split=2 lets the tree keep splitting)

train_score = t_first.score(X, y)
print(train_score)

#### evaluation of the technique using cross validation

In [None]:
# list every effectiveness index (scorer name) that sklearn exposes

# caveats for this dataset:
#  - plain roc_auc cannot be used in cross validation because the problem is
#    multiclass; the ovo or ovr variant has to be requested explicitly
#  - no roc_auc variant can be used with leave-one-out cross validation,
#    because each testing set holds a single observation

scorer_names = metrics.get_scorer_names()
print(scorer_names)

In [None]:
# k-fold cross validation
# `t` is the grid search, so the hyperparameters are tuned again inside every training fold.
k = 5

effect_cv = cross_validate(t, X, y, cv=k, scoring=('accuracy', 'roc_auc_ovo', 'roc_auc_ovr'))

# The result used to be computed and never shown: build one row per fold and
# display the average of every column (timings and each test metric).
pd.DataFrame(effect_cv).mean()

In [None]:
# leave-one-out cross validation: one fold per observation
effect = cross_validate(t, X, y, cv=LeaveOneOut(), scoring='accuracy')

# Every fold is tested on a single observation, so each individual score is
# either 0 or 1; dumping the whole dictionary only floods the output.
# The mean of the per-fold scores is the leave-one-out accuracy estimate.
print(effect['test_score'].mean())

In [None]:
# show which kind of object each name refers to: the tuned search vs. the first tree
for model in (t, t_first):
    print(type(model))