## Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import LeaveOneOut
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsClassifier

## Importing & Looking at data


In [2]:
df=pd.read_csv('fruit_data_with_colours.csv')

In [3]:
df.head(5)

Unnamed: 0,fruit_label,fruit_name,fruit_subtype,mass,width,height,color_score
0,1,apple,granny_smith,192,8.4,7.3,0.55
1,1,apple,granny_smith,180,8.0,6.8,0.59
2,1,apple,granny_smith,176,7.4,7.2,0.6
3,2,mandarin,mandarin,86,6.2,4.7,0.8
4,2,mandarin,mandarin,84,6.0,4.6,0.79


In [4]:
df.shape

(59, 7)

In [5]:
df['fruit_name'].unique()

array(['apple', 'mandarin', 'orange', 'lemon'], dtype=object)

### - the numerical columns are not on the same scale, and since we're applying KNN, which is sensitive to data scales, we'll need to scale the data later.

## Splitting the data to test and training data

In [6]:
x = df.drop(['fruit_label', 'fruit_name','fruit_subtype'], axis =1)
y = df['fruit_label']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3,random_state = 42)
x_train.head(3)

Unnamed: 0,mass,width,height,color_score
54,116,6.1,8.5,0.71
19,162,7.4,7.2,0.85
30,158,7.1,7.5,0.79


In [7]:
x_train.shape, x_test.shape

((41, 4), (18, 4))

# Implementing KNN using Leave-One-Out Cross Validation

In [8]:
def knn(k: int):
    cv = LeaveOneOut()
    accuracy = []
    
    for i in range(1, k+1):
        clf = KNeighborsClassifier(n_neighbors=i+1,weights='uniform', p=2, metric_params=None)
        scores = cross_val_score(clf, x_train, y_train, cv=cv)
        accuracy.append(scores.mean())
        print('K =' , i,'','score =', "%.2f" % (scores.mean()*100), '\n')
        
    for i in range(k):
        if accuracy[i] == np.max(accuracy):
            print('Best K value = ',i+1)

### Searching for the best K from 1 to 10:

In [9]:
knn(10)

K = 1  score = 60.98 

K = 2  score = 73.17 

K = 3  score = 56.10 

K = 4  score = 36.59 

K = 5  score = 51.22 

K = 6  score = 46.34 

K = 7  score = 48.78 

K = 8  score = 63.41 

K = 9  score = 53.66 

K = 10  score = 53.66 

Best K value =  2


## Best accuracy with LOOCV  = 73%

## Now we'll try implementing KNN with KFold cv.
### First, we'll search for the best K for KFold using 2 neighbours KNN  



In [10]:
from sklearn.model_selection import KFold

In [11]:
def kFold(k: int, end: int):
    clf = KNeighborsClassifier(n_neighbors = 2)
    accuracy = []
    K= []
    for i in range(k, end+1):
        cv = KFold(n_splits=i)
        scores = cross_val_score(clf, x_train, y_train, cv=cv)
        accuracy.append(scores.mean())
        K.append(i)
        print("K= ", i, '', "Score= ", "%.2f" % (scores.mean()*100), '\n' ) 
        
             
    for i in range(len(K)):
        if accuracy[i] == np.max(accuracy):
            print('Best K value = ',K[i])
    

In [12]:
kFold(2,15)

K=  2  Score=  58.81 

K=  3  Score=  58.79 

K=  4  Score=  58.18 

K=  5  Score=  53.33 

K=  6  Score=  48.41 

K=  7  Score=  48.57 

K=  8  Score=  50.83 

K=  9  Score=  50.56 

K=  10  Score=  53.50 

K=  11  Score=  49.24 

K=  12  Score=  48.61 

K=  13  Score=  55.13 

K=  14  Score=  53.57 

K=  15  Score=  53.33 

Best K value =  2


### We've got 58.81% accuracy on KNN using KFold on unscaled data, thus, LOOCV performs better on our dataset

### Now, we'll scale the data and see how scaling it will affect the accuracy.


In [13]:
from sklearn.preprocessing import StandardScaler

In [14]:
sc = StandardScaler()
x_train_scaled = pd.DataFrame(sc.fit_transform(x_train))
x_test_scaled =  pd.DataFrame(sc.transform(x_test))
x_train_scaled.columns = ['mass', 'width', 'height', 'color_score']
x_train_scaled.head(2)

Unnamed: 0,mass,width,height,color_score
0,-0.983318,-1.411474,0.577887,-0.716149
1,-0.068852,0.321972,-0.558695,1.118982


In [15]:
clf = KNeighborsClassifier(n_neighbors = 2)
cv = LeaveOneOut()
scores = cross_val_score(clf, x_test_scaled, y_test, cv=cv)

In [16]:
accuracy = (scores.mean()*100)
print("%.2f" % accuracy)

83.33


### Accuracy improved from 73.17% to 83.33%

# Conclusion: 
## 1 - Leave-One-Out performs better than KFolds CV on small datasets.
## 2 - Algorithms work better on scaled data