## Grafici

In [None]:
# for the dataset
import pandas as pd    
import numpy as np

# general
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_validate
# from sklearn.model_selection import train_test_split
# from sklearn.metrics import accuracy_score
import time

# for the tree
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

# for the Random forest
from sklearn.ensemble import RandomForestClassifier

# for the SVM + packages for pipelines and scaling
from sklearn import svm
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

# for the Gaussian
from sklearn.naive_bayes import GaussianNB

# for KNN
from sklearn.neighbors import KNeighborsClassifier

# Display the value of every top-level expression statement in a cell, not just the last one.
# NOTE: the attribute is set on the InteractiveShell class itself, so it is not limited to
# this cell; bare expressions such as `estimator.fit(X, y)` in later cells are echoed too.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = 'all'

In [None]:
# Dataset

# Column names for the CSV, which is read without a header row:
# the class label, the specimen number, then 14 feature columns.
labels = (
    'class', 'spec_num',
    'eccentr', 'asp_ratio', 'elong', 'solidity', 'stoch_conv', 'iso_factor',
    'max_ind_depth', 'lobedness',
    'av_intensity', 'av_contr', 'smooth', 'third_mom', 'unif', 'entropy',
)

# Load the data and attach the names above to its columns.
df = pd.read_csv(r'./leaf/leaf.csv', names=labels, header=None)

In [None]:
# for training and testing static division, if needed
# X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=100)

### Single tree

In [None]:
# NO NEED TO RE-RUN THIS CELL:
# the parameters reported on its last line are the ones used in the cells below.

# Grid search - finding the best estimator for a single decision tree

# Shuffle the rows; drop=True discards the old index instead of keeping it as an extra column.
df = df.sample(frac=1).reset_index(drop=True)
# Target is column 0; features are columns 2..15 (column 1, the specimen number, is left out).
y = df.iloc[:, 0]
X = df.iloc[:, 2:16]

# Parameter grid to search over.
# For the minimum split size, "proglie" (presumably a reference - TODO confirm) goes from 2 to 40,
# but 40 is probably too high given that there are fewer than 400 observations.
grid_param = {
    'criterion': ['gini', 'entropy'],
    'min_samples_split': np.arange(2, 20),
}

tree_cv = GridSearchCV(
    estimator=tree.DecisionTreeClassifier(),
    param_grid=grid_param,
    scoring='balanced_accuracy',
    cv=8,
)
tree_cv.fit(X, y)
print(tree_cv.best_score_)   # result: 0.6791666666666667
print(tree_cv.best_params_)  # result: {'criterion': 'entropy', 'min_samples_split': 4}

In [None]:
# Data for the Single Tree plot - attempt 1

# One mean balanced-accuracy value per repetition; consumed by the boxplot cell.
TREE = []

for rep in range(50):
    # Reshuffle the rows before every repetition (old index discarded).
    df = df.sample(frac=1).reset_index(drop=True)
    # Target is column 0; features are columns 2..15 (specimen number excluded).
    y = df.iloc[:, 0]
    X = df.iloc[:, 2:16]
    # 8-fold cross-validation of the tree with the parameters chosen by the grid search.
    clf_T1 = cross_validate(
        DecisionTreeClassifier(criterion="entropy", min_samples_split=4),
        X,
        y,
        cv=8,
        scoring="balanced_accuracy",
    )
    scores_tree = clf_T1["test_score"].mean()
    TREE.append(scores_tree)

print(TREE)
print(np.mean(TREE))

In [None]:
# Time one 8-fold cross-validation of the tuned tree on the current X, y
# (X and y come from the last cell that shuffled the dataframe).
# perf_counter() is a monotonic, high-resolution clock intended for measuring elapsed
# time; time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time_tree = time.perf_counter()
cross_validate(DecisionTreeClassifier(criterion = "entropy", min_samples_split = 4), X, y, cv=8, scoring = "balanced_accuracy")
stop_time_tree = time.perf_counter()
print("Process finished in %s seconds" % (stop_time_tree - start_time_tree))

### Random forest

In [None]:
# NO NEED TO RE-RUN THIS CELL:
# the parameters reported on its last line are the ones used in the cells below.

# Grid search - finding the best estimator for the random forest

# Shuffle the rows; drop=True discards the old index instead of keeping it as an extra column.
df = df.sample(frac=1).reset_index(drop=True)
# Target is column 0; features are columns 2..15 (column 1, the specimen number, is left out).
y = df.iloc[:, 0]
X = df.iloc[:, 2:16]

# Parameter grid to search over (max_features is held fixed at 4 on the estimator itself).
grid_param = {
    "n_estimators": (100, 200, 500, 700, 900),
    'criterion': ('gini', 'entropy'),
}

rf_cv = GridSearchCV(
    estimator=RandomForestClassifier(max_features=4),
    param_grid=grid_param,
    scoring='balanced_accuracy',
    cv=8,
    n_jobs=5,
)
rf_cv.fit(X, y)
print(rf_cv.best_score_)    # result: 0.8145833333333333
print(rf_cv.best_params_)   # result: {'criterion': 'gini', 'n_estimators': 200}

In [None]:
# Data for the Random Forest plot - attempt 1

# One mean balanced-accuracy value per repetition; consumed by the boxplot cell.
RAN_FOR = []

for rep in range(50):
    # Reshuffle the rows before every repetition (old index discarded).
    df = df.sample(frac=1).reset_index(drop=True)
    # Target is column 0; features are columns 2..15 (specimen number excluded).
    y = df.iloc[:, 0]
    X = df.iloc[:, 2:16]
    # 8-fold cross-validation of the forest with the parameters chosen by the grid search.
    clf_RF1 = cross_validate(
        RandomForestClassifier(n_estimators=200, criterion="gini", max_features=4),
        X,
        y,
        cv=8,
        scoring="balanced_accuracy",
        n_jobs=5,
    )
    scores_RF = clf_RF1["test_score"].mean()
    RAN_FOR.append(scores_RF)

print(RAN_FOR)
print(np.mean(RAN_FOR))

In [None]:
# timing

# Time one 8-fold cross-validation of the random forest on the current X, y.
# The estimator uses max_features=4, the same configuration that was tuned in the grid
# search and evaluated in the repeated-CV loop, so the timing refers to the model that
# is actually plotted (the previous "sqrt" setting timed a different forest).
# perf_counter() is a monotonic, high-resolution clock intended for measuring elapsed time.
start_time_rf = time.perf_counter()
cross_validate(RandomForestClassifier(n_estimators = 200, criterion = "gini", max_features = 4), X, y, cv=8, scoring = "balanced_accuracy", n_jobs = 5)
stop_time_rf = time.perf_counter()
print("Process finished in %s seconds" % (stop_time_rf - start_time_rf))

In [None]:
#dati per grafico Random Forest - tentativo 2

#RF=[]

#for i in range(1, 6):
    #clf_RF2 = RandomForestClassifier(n_estimators = 700, criterion = "entropy", max_features = 5)
    #clf_RF2.fit(X_train,y_train)
    #predictions = clf_RF2.predict(X_test)
    #RF.append(accuracy_score(y_test, predictions))

#print(RF)

### SVM

In [None]:
# NO NEED TO RE-RUN THIS CELL:
# the parameters reported on its last line are the ones used in the cells below.

# Grid search - finding the best estimator for the SVM

# Pipeline: standardise the features, then fit the support vector classifier.
pipe = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('SVM', svm.SVC(decision_function_shape='ovo')),
])

# Candidate values for the regularization parameter C and for gamma:
# consecutive powers of ten, 1e-10..1e11 for C and 1e-9..1e3 for gamma.
reg_param = np.logspace(-10, 11, 22)
gamm = np.logspace(-9, 3, 13)

# NOTE(review): this is a full cross product (22 * 4 * 3 * 2 * 13 = 6864 candidates).
# Per the scikit-learn SVC docs `degree` is only used by the 'poly' kernel and `gamma`
# is not used by 'linear', so many candidates look redundant - consider a list of
# per-kernel grids if this search ever has to be re-run.
grid_param = {
    'SVM__C': reg_param,
    'SVM__kernel': ('linear', 'poly', 'rbf', 'sigmoid'),
    'SVM__degree': np.arange(2, 5),
    'SVM__decision_function_shape': ('ovo', 'ovr'),
    'SVM__gamma': gamm,
}

# Shuffle the rows; drop=True discards the old index instead of keeping it as an extra column.
df = df.sample(frac=1).reset_index(drop=True)
# Target is column 0; features are columns 2..15 (column 1, the specimen number, is left out).
y = df.iloc[:, 0]
X = df.iloc[:, 2:16]

svm_cv = GridSearchCV(
    estimator=pipe,
    param_grid=grid_param,
    scoring='balanced_accuracy',
    cv=8,
    n_jobs=5,
)
svm_cv.fit(X,y)

print(svm_cv.best_score_)  # result: 0.8104166666666666
print(svm_cv.best_params_) # result: {'SVM__C': 100000.0, 'SVM__decision_function_shape': 'ovo', 'SVM__degree': 2, 'SVM__gamma': 0.0001, 'SVM__kernel': 'rbf'}

In [None]:
# Data for the SVM plot

# One mean balanced-accuracy value per repetition; consumed by the boxplot cell.
SVM = []

# Scaling + SVC with the parameters reported by the grid search.
# This rebinds `pipe`, which the timing cell below also uses.
pipe = Pipeline(steps=[
    ('scaling', StandardScaler()),
    ('SVM', svm.SVC(
        kernel="rbf",
        C=100000.0,
        gamma=0.0001,
        degree=2,
        decision_function_shape="ovo",
    )),
])

for rep in range(50):
    # Reshuffle the rows before every repetition (old index discarded).
    df = df.sample(frac=1).reset_index(drop=True)
    # Target is column 0; features are columns 2..15 (specimen number excluded).
    y = df.iloc[:, 0]
    X = df.iloc[:, 2:16]
    clf_SVM1 = cross_validate(pipe, X, y, cv=8, scoring="balanced_accuracy")
    scores_SVM1 = clf_SVM1["test_score"].mean()
    SVM.append(scores_SVM1)

print(SVM)
print(np.mean(SVM))

In [None]:
# timing

# Time one 8-fold cross-validation of the SVM pipeline on the current X, y.
# Relies on `pipe` as last defined by an earlier cell (the tuned pipeline when the
# notebook is run top to bottom).
# perf_counter() is a monotonic, high-resolution clock intended for measuring elapsed
# time; time.time() is wall-clock time and can jump if the system clock is adjusted.
start_time_svm = time.perf_counter()
cross_validate(pipe, X, y, cv=8, scoring = "balanced_accuracy")
stop_time_svm = time.perf_counter()
print("Process finished in %s seconds" % (stop_time_svm - start_time_svm))

### Naive Bayes

In [None]:
# No grid search here: there are no hyper-parameters to choose for this model.

# Shuffle the rows; drop=True discards the old index instead of keeping it as an extra column.
df = df.sample(frac=1).reset_index(drop=True)
# Target is column 0; features are columns 2..15 (column 1, the specimen number, is left out).
y = df.iloc[:, 0]
X = df.iloc[:, 2:16]

In [None]:
# Data for the Naive Bayes plot

# k-fold cross validation

# One mean balanced-accuracy value per repetition; consumed by the boxplot cell.
NB = []

for rep in range(50):
    # Reshuffle the rows before every repetition (old index discarded).
    df = df.sample(frac=1).reset_index(drop=True)
    # Target is column 0; features are columns 2..15 (specimen number excluded).
    y = df.iloc[:, 0]
    X = df.iloc[:, 2:16]
    effect_NB_cv = cross_validate(
        GaussianNB(),
        X,
        y,
        cv=8,
        scoring='balanced_accuracy',
    )
    scores_NB1 = effect_NB_cv["test_score"].mean()
    NB.append(scores_NB1)

print(NB)
print(np.mean(NB))

In [None]:
# timing

# Time one 8-fold cross-validation of Gaussian Naive Bayes on the current X, y.
# perf_counter() is a monotonic, high-resolution clock intended for measuring elapsed
# time; time.time() is wall-clock time, can jump if the system clock is adjusted, and
# may be too coarse for a run this short.
start_time_nb = time.perf_counter()
cross_validate(GaussianNB(), X, y, cv= 8, scoring='balanced_accuracy')
stop_time_nb = time.perf_counter()
print("Process finished in %s seconds" % (stop_time_nb - start_time_nb))


### KNN

In [None]:
# NO NEED TO RE-RUN THIS CELL:
# the parameters reported on its last line are the ones used in the cells below.

# Grid search - finding the best estimator for KNN

# Shuffle the rows; drop=True discards the old index instead of keeping it as an extra column.
df = df.sample(frac=1).reset_index(drop=True)
# Target is column 0; features are columns 2..15 (column 1, the specimen number, is left out).
X = df.iloc[:, 2:16]
y = df.iloc[:, 0]

# Number of CV folds, used both for the n_neighbors bound and for the search itself.
n_splits = 8
# Derive the sample count from the data instead of hard-coding 340.
n_samples = len(X)
# Exclusive upper bound for n_neighbors: the number of rows left for training once one
# fold of about n_samples / n_splits rows is held out. For the 340-row dataset this is
# 340 - 340 // 8 = 298, i.e. the same 1..297 range as before.
max_neighbors = n_samples - n_samples // n_splits

# Parameter grid to search over.
grid_param_cv = {'n_neighbors': np.arange(1, max_neighbors), 'metric': ('cosine', 'euclidean', 'manhattan')}

knn_cv = GridSearchCV(KNeighborsClassifier(), grid_param_cv, cv=n_splits, scoring='balanced_accuracy', return_train_score=False, verbose=0, n_jobs= 5)
knn_cv.fit(X, y)
print(knn_cv.best_score_)  # result: 0.6354166666666666
print(knn_cv.best_params_) # result: {'n_neighbors': 5, 'metric': manhattan}

In [None]:
# Data for the KNN plot

# One mean balanced-accuracy value per repetition; consumed by the boxplot cell.
KNN = []

for rep in range(50):
    # Reshuffle the rows before every repetition (old index discarded).
    df = df.sample(frac=1).reset_index(drop=True)
    # Target is column 0; features are columns 2..15 (specimen number excluded).
    y = df.iloc[:, 0]
    X = df.iloc[:, 2:16]
    # 8-fold cross-validation of KNN with the parameters chosen by the grid search.
    clf_KNN1 = cross_validate(
        KNeighborsClassifier(n_neighbors=5, metric='manhattan'),
        X,
        y,
        cv=8,
        scoring="balanced_accuracy",
        n_jobs=5,
    )
    scores_KNN1 = clf_KNN1["test_score"].mean()
    KNN.append(scores_KNN1)

print(KNN)
print(np.mean(KNN))

In [None]:
# timing

# Time one 8-fold cross-validation of KNN on the current X, y.
# The estimator uses n_neighbors=5 with the manhattan metric, the same configuration
# that was selected by the grid search and evaluated in the repeated-CV loop (the
# previous weights="distance" / default-metric setting timed a different model).
# perf_counter() is a monotonic, high-resolution clock intended for measuring elapsed time.
start_time_knn = time.perf_counter()
cross_validate(KNeighborsClassifier(n_neighbors = 5, metric = 'manhattan'), X, y, cv=8, scoring = "balanced_accuracy", n_jobs = 5)
stop_time_knn = time.perf_counter()
print("Process finished in %s seconds" % (stop_time_knn - start_time_knn))


## Graphs

In [None]:
import matplotlib.pyplot as plt

# for fancier plots
import seaborn as sns
sns.set_theme()
# Same background colour for the axes and the figure (both written with the leading '#').
sns.set(rc={"axes.facecolor": "#eee7e5", "figure.facecolor": "#eee7e5"})

# One list of 50 mean balanced-accuracy values per model, in plotting order.
scores_from_loop = [TREE, RAN_FOR, SVM, NB, KNN]
# NOTE: this rebinds `labels`, which the dataset cell uses for the CSV column names;
# re-run the dataset cell before reusing `labels` for that purpose.
labels = ["Tree", "Random Forest", "SVM", "Naive Bayes", "KNN"]
colors = ['grey', 'blue', 'green', 'red', 'orange']
# Each box has two whiskers and two caps, so every colour is repeated twice in a row.
colors2 = [color for color in colors for _side in range(2)]

# Creating plot
fig, (ax) = plt.subplots(figsize=(8, 6), edgecolor= "blue")

# Tick labels are applied with set_xticklabels below rather than through boxplot's
# `labels=` argument, which newer matplotlib releases deprecate in favour of `tick_labels=`.
bplot = ax.boxplot(scores_from_loop,
                   vert=True,           # vertical box alignment
                   patch_artist=False)  # boxes are drawn as outlines, not filled patches

dict_title = {'fontsize': 20, 'fontweight': 'bold'}
#ax.set_title('Weighted accuracy', fontdict=dict_title)
ax.set_xticklabels(labels=labels, rotation = 45, fontsize=13)
ax.set_ylabel('weighted accuracy')

# One artist per model for these elements ('means' is empty unless showmeans is enabled).
for element in ['boxes', 'means', 'medians']:
    for patch, color in zip(bplot[element], colors):
        patch.set_color(color)

# Fliers are drawn as markers whose edge colour is set explicitly by boxplot, so
# set_color alone does not recolour them; set the marker edge colour as well.
for patch, color in zip(bplot['fliers'], colors):
    patch.set_color(color)
    patch.set_markeredgecolor(color)

# Two artists per model (lower/upper), hence the doubled colour list.
for element in ('whiskers', 'caps'):
    for patch, color in zip(bplot[element], colors2):
        patch.set_color(color)

# saving plot
plt.savefig('boxplot.png')

# show plot
plt.show()