In [2]:
import pandas as pd
import numpy as np

from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import LeaveOneOut

from IPython.core.interactiveshell import InteractiveShell
# Notebook-wide display setting: per IPython's documentation, 'all' makes a cell echo the
# result of every top-level expression rather than only the last one.
InteractiveShell.ast_node_interactivity = 'all'


# Column names assigned to the dataset; the CSV is read with header=None, so the
# names are supplied explicitly here.
labels = (
    'class', 'spec_num',
    'eccentr', 'asp_ratio', 'elong', 'solidity',
    'stoch_conv', 'iso_factor', 'max_ind_depth', 'lobedness',
    'av_intensity', 'av_contr', 'smooth', 'third_mom',
    'unif', 'entropy',
)

# Load the data and remember how many rows it has
df = pd.read_csv(r'./leaf/leaf.csv', names=labels, header=None)
rownum = len(df)

k-fold cross-validation: the grid search tries all possible values of `n_neighbors`, the number of neighbors used by the k-NN classifier (not to be confused with the number of folds `k`).

In [3]:
k = 5   # number of cross-validation folds

# Parameter grid for the search.
# n_neighbors runs from 1 up to (but not including) rownum - rownum//k, i.e. the
# total row count minus the floor size of one fold.
grid_param_cv = {
    'n_neighbors': np.arange(1, rownum - rownum // k),
    'weights': ('uniform', 'distance'),
    'metric': ('cosine', 'euclidean', 'manhattan'),
}

# Shuffle the rows. The fixed random_state makes the permutation repeatable for a
# given input row order; drop=True discards the old index directly instead of
# turning it into a column that then has to be sliced away.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

# Separate the target from the features by column name; the specimen number is
# left out of the features.
X = df.drop(columns=['class', 'spec_num'])
y = df['class']

knn_cv = GridSearchCV(
    KNeighborsClassifier(),
    grid_param_cv,
    cv=k,
    scoring='balanced_accuracy',
    return_train_score=False,
    verbose=1,
)
knn_cv.fit(X, y)

Fitting 5 folds for each of 1626 candidates, totalling 8130 fits


In [None]:
# Report the best score found by the k-fold search, then the parameters that produced it
print(knn_cv.best_score_, knn_cv.best_params_, sep='\n')

Leave-one-out cross-validation (with fewer parameter values, to keep the running time manageable)

In [None]:
# Parameter grid for the search (n_neighbors limited to 1..29)
grid_param_less = {
    'n_neighbors': np.arange(1, 30),
    'weights': ('uniform', 'distance'),
    'metric': ('cosine', 'euclidean', 'manhattan'),
}

# Shuffle the rows. The fixed random_state makes the permutation repeatable for a
# given input row order; drop=True discards the old index directly instead of
# turning it into a column that then has to be sliced away.
df = df.sample(frac=1, random_state=0).reset_index(drop=True)

# Separate the target from the features by column name; the specimen number is
# left out of the features.
X = df.drop(columns=['class', 'spec_num'])
y = df['class']

# NOTE(review): with LeaveOneOut every test split holds a single sample, so
# 'balanced_accuracy' presumably reduces to plain accuracy here (and may warn when
# the lone sample is misclassified) - confirm and consider scoring='accuracy'.
# n_jobs=-2: per scikit-learn's documentation, run in parallel on all CPUs but one.
knn_loocv = GridSearchCV(
    KNeighborsClassifier(),
    grid_param_less,
    cv=LeaveOneOut(),
    scoring='balanced_accuracy',
    return_train_score=False,
    n_jobs=-2,
)
knn_loocv.fit(X, y)

In [None]:
# Report the best score found by the leave-one-out search, then the parameters that produced it
print(knn_loocv.best_score_, knn_loocv.best_params_, sep='\n')