## proglie

In [None]:
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut
import pandas as pd    # for the dataset
import numpy as np
from IPython.core.interactiveshell import InteractiveShell
# make IPython evaluate every expression statement of a cell interactively,
# not only the last one
InteractiveShell.ast_node_interactivity = 'all'

# column names for the dataset (the CSV is read with header=None, so the
# names have to be supplied here)
labels = (
    'class', 'spec_num',
    'eccentr', 'asp_ratio', 'elong', 'solidity',
    'stoch_conv', 'iso_factor', 'max_ind_depth', 'lobedness',
    'av_intensity', 'av_contr', 'smooth', 'third_mom', 'unif', 'entropy',
)
# read the data and attach the column names defined above
df = pd.read_csv(r'./leaf/leaf.csv', names=labels, header=None)

### random forest

We first use grid search with k-fold cross validation (for n_var, the number of variables considered at each split, i.e. `max_features`, we choose sqrt(p) and p/3, where p is the total number of variables).

In [None]:
k = 5       # number of cross-validation folds
SEED = 42   # fixed seed so that Restart & Run All reproduces the same scores

# grid of hyper-parameters to search over
# (max_features: 'sqrt' -> sqrt(p); 5 -> about p/3 for the p = 14 predictors)
grid_param = {"n_estimators": (100, 200, 500, 700), 'criterion': ('gini', 'entropy'), 'max_features': ('sqrt', 5)}

# shuffling the dataframe; drop=True discards the old index directly, so no
# positional slice is needed afterwards to remove an extra 'index' column
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
# separating y from x and eliminating specimen number variable
# (columns are selected by name rather than by position)
X = df.drop(columns=['class', 'spec_num'])
y = df['class']

# random_state makes the forests themselves reproducible as well
rf_cv = GridSearchCV(RandomForestClassifier(random_state=SEED), grid_param, cv=k, scoring='balanced_accuracy')
rf_cv.fit(X, y)

In [None]:
# tabulate the cv_results_ collected by the random-forest k-fold search
results_rf_cv = pd.DataFrame(data=rf_cv.cv_results_)

# render the table with the notebook's rich display
display(results_rf_cv)

In [None]:
# best mean cross-validated score, then the parameter setting that achieved it
print(rf_cv.best_score_, rf_cv.best_params_, sep='\n')

We now try leave-one-out cross validation (almost impossible to run in practice, because it is so slow).

In [None]:
SEED = 42   # fixed seed so that Restart & Run All reproduces the same scores

# grid of hyper-parameters to search over (same grid as the k-fold search)
grid_param = {"n_estimators": (100, 200, 500, 700), 'criterion': ('gini', 'entropy'), 'max_features': ('sqrt', 5)}

# shuffling the dataframe; drop=True discards the old index directly, so no
# positional slice is needed afterwards to remove an extra 'index' column
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
# separating y from x and eliminating specimen number variable
# (columns are selected by name rather than by position)
X = df.drop(columns=['class', 'spec_num'])
y = df['class']

# leave-one-out: each split holds out a single observation, so plain accuracy
# is used as the score; the search is spread over 5 parallel jobs
rf_loocv = GridSearchCV(RandomForestClassifier(random_state=SEED), grid_param, cv=LeaveOneOut(), scoring='accuracy', n_jobs=5)
rf_loocv.fit(X, y)

In [None]:
# tabulate the cv_results_ collected by the random-forest leave-one-out search
results_rf_loocv = pd.DataFrame(data=rf_loocv.cv_results_)

# render the table with the notebook's rich display
display(results_rf_loocv)

In [None]:
# best mean cross-validated score, then the parameter setting that achieved it
print(rf_loocv.best_score_, rf_loocv.best_params_, sep='\n')

### single tree

grid search with k-fold cross validation

In [None]:
k = 5       # number of cross-validation folds
SEED = 42   # fixed seed so that Restart & Run All reproduces the same scores

# grid of hyper-parameters to search over
grid_param = {'criterion': ['gini', 'entropy'], 'min_samples_split': np.arange(2, 20)}

# shuffling the dataframe; drop=True discards the old index directly, so no
# positional slice is needed afterwards to remove an extra 'index' column
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
# separating y from x and eliminating specimen number variable
# (columns are selected by name rather than by position)
X = df.drop(columns=['class', 'spec_num'])
y = df['class']

# random_state fixes the tree's internal randomness, so repeated runs agree
tree_cv = GridSearchCV(tree.DecisionTreeClassifier(random_state=SEED), grid_param, cv=k, scoring='balanced_accuracy')
tree_cv.fit(X, y)

In [None]:
# tabulate the cv_results_ collected by the decision-tree k-fold search
results_tree_cv = pd.DataFrame(data=tree_cv.cv_results_)

# render the table with the notebook's rich display
display(results_tree_cv)

In [None]:
# best mean cross-validated score, then the parameter setting that achieved it
print(tree_cv.best_score_, tree_cv.best_params_, sep='\n')

grid search with leave-one-out cross validation

In [None]:
SEED = 42   # fixed seed so that Restart & Run All reproduces the same scores

# grid of hyper-parameters to search over (same grid as the k-fold search)
grid_param = {'criterion': ('gini', 'entropy'), 'min_samples_split': np.arange(2, 20)}

# shuffling the dataframe; drop=True discards the old index directly, so no
# positional slice is needed afterwards to remove an extra 'index' column
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
# separating y from x and eliminating specimen number variable
# (columns are selected by name rather than by position)
X = df.drop(columns=['class', 'spec_num'])
y = df['class']

# leave-one-out: each split holds out a single observation, so plain accuracy
# is used as the score
tree_loocv = GridSearchCV(tree.DecisionTreeClassifier(random_state=SEED), grid_param, cv=LeaveOneOut(), scoring='accuracy')
tree_loocv.fit(X, y)

In [None]:
# tabulate the cv_results_ collected by the decision-tree leave-one-out search
results_tree_loocv = pd.DataFrame(data=tree_loocv.cv_results_)

# render the table with the notebook's rich display
display(results_tree_loocv)

In [None]:
# best mean cross-validated score, then the parameter setting that achieved it
print(tree_loocv.best_score_, tree_loocv.best_params_, sep='\n')

### SVM

In [None]:
# packages for pipelines and scaling
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

Grid search with k-fold cross validation: we use various kernels and a wide range for the C parameter; degree indicates the degree of the polynomial used in the kernel (this parameter is ignored unless the kernel is polynomial); decision_function_shape selects whether the multiclass decision function is returned in one-vs-one ('ovo') or one-vs-rest ('ovr') form.

In [None]:
SEED = 42   # fixed seed so that Restart & Run All reproduces the same shuffle

# building the pipeline: the features are standardised before reaching the SVM
pipe = Pipeline([('scaling', StandardScaler()),
                 ('SVM', svm.SVC(decision_function_shape='ovo'))])

# building the range of the regularization parameter (C) and of gamm
reg_param = np.logspace(-10, 11, 22)
gamm = np.logspace(-9, 3, 13)

# One sub-grid per kernel, so that each kernel is only crossed with the
# hyper-parameters it actually uses:
#   * gamma is the kernel coefficient of 'poly', 'rbf' and 'sigmoid'
#     (it is not used by 'linear')
#   * degree is only used by 'poly'
# decision_function_shape is not searched: it only changes the shape of the
# array returned by decision_function, not the predictions being scored, so
# including it would just refit identical models.
# The set of distinct models is the same as with one flat grid, but the
# number of candidates drops from 6864 to 1452.
grid_param = [
    {'SVM__kernel': ['linear'],
     'SVM__C': reg_param},
    {'SVM__kernel': ['poly'],
     'SVM__C': reg_param,
     'SVM__gamma': gamm,
     'SVM__degree': np.arange(2, 5)},
    {'SVM__kernel': ['rbf', 'sigmoid'],
     'SVM__C': reg_param,
     'SVM__gamma': gamm},
]

# shuffling the dataframe; drop=True discards the old index directly, so no
# positional slice is needed afterwards to remove an extra 'index' column
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
# separating y from x and eliminating specimen number variable
# (columns are selected by name rather than by position)
X = df.drop(columns=['class', 'spec_num'])
y = df['class']

k=5

# n_jobs=-2 runs the search in parallel on all CPUs but one
svm_cv = GridSearchCV(pipe, grid_param, cv=k, scoring='balanced_accuracy', n_jobs=-2)
svm_cv.fit(X,y)

In [None]:
# tabulate the cv_results_ collected by the SVM k-fold search
results_svm_cv = pd.DataFrame(data=svm_cv.cv_results_)

# render the table with the notebook's rich display
display(results_svm_cv)

In [None]:
# best mean cross-validated score, then the parameter setting that achieved it
print(svm_cv.best_score_, svm_cv.best_params_, sep='\n')

we now try with leave-one-out cross validation

In [None]:
SEED = 42   # fixed seed so that Restart & Run All reproduces the same shuffle

# shuffling the dataframe; drop=True discards the old index directly, so no
# positional slice is needed afterwards to remove an extra 'index' column
df = df.sample(frac=1, random_state=SEED).reset_index(drop=True)
# separating y from x and eliminating specimen number variable
# (columns are selected by name rather than by position)
X = df.drop(columns=['class', 'spec_num'])
y = df['class']

# reuses `pipe` and `grid_param` from the k-fold SVM cell, so that cell has to
# be run first; leave-one-out holds out a single observation per split, so
# plain accuracy is used as the score
svm_loocv = GridSearchCV(pipe, grid_param, cv=LeaveOneOut(), scoring='accuracy', n_jobs=-2)
svm_loocv.fit(X,y)

In [None]:
# tabulate the cv_results_ collected by the SVM leave-one-out search
results_svm_loocv = pd.DataFrame(data=svm_loocv.cv_results_)

# render the table with the notebook's rich display
display(results_svm_loocv)

In [None]:
# report the leave-one-out search (svm_loocv); this cell previously printed
# the k-fold search's results (svm_cv) a second time
print(svm_loocv.best_score_)
print(svm_loocv.best_params_)

VALUTARE SE METTERE ANCHE GRID SEARCH CON VALORI PIÙ SPECIFICI

### naive bayes

In this case we do not use grid search, because there are no hyper-parameters to tune;
instead, we use k-fold cross validation to evaluate the technique