In [4]:
import numpy as np
import pandas as pd
df = pd.read_csv('TrainingSet.csv')

In [5]:
y_train = df['plant'].values
x_train = df[['leaf.length', 'leaf.width', 'flower.length', 'flower.width']].values

# Creating method of KNN

In [6]:
from scipy.stats import mode
 
#Euclidean Distance
def eucledian(p1,p2):
    dist = np.sqrt(np.sum((p1-p2)**2))
    return dist
#Function to calculate KNN
def predict(x_train, y , x_input, k):
    op_labels = []
    #Loop through the Datapoints to be classified
    for item in x_input: 
        #Array to store distances
        point_dist = []
        #Loop through each training Data
        for j in range(len(x_train)): 
            distances = eucledian(np.array(x_train[j,:]) , item) 
            #Calculating the distance
            point_dist.append(distances) 
        point_dist = np.array(point_dist) 
        #Sorting the array while preserving the index
        #Keeping the first K datapoints
        dist = np.argsort(point_dist)[:k] 
        #Labels of the K datapoints from above
        labels = y[dist]
         
        #Majority voting
        lab = mode(labels) 
        lab = lab.mode[0]
        op_labels.append(lab)
 
    return op_labels

In [7]:
test_data = pd.read_csv('TestSet1.csv')
x_test = test_data[['leaf.length', 'leaf.width', 'flower.length', 'flower.width']].values

# Predicting from self-made method and Scikit Learn

In [8]:
#Applying our function 
from sklearn.neighbors import KNeighborsClassifier
import time

k_values = [3,5,7]
prediction = []
times = {}

for i in k_values:
    #predictions from self made formula
    start_time = time.time()
    y_test = predict(x_train,y_train,x_test , i)
    times['Time for k = ' + str(i) + ' (Self)'] = (time.time() - start_time)
    
    prediction.append(y_test)
    
    
    #predictions from scikit learn
    classifier = KNeighborsClassifier(n_neighbors=i)
    start_time = time.time()
    classifier.fit(x_train, y_train)
    y_pred = classifier.predict(x_test)
    times['Time for k = ' + str(i) + ' (Scikit)'] = (time.time() - start_time)
    
    prediction.append(list(y_pred))

In [9]:
result = pd.concat([pd.DataFrame(x_test), pd.DataFrame(prediction[0]), pd.DataFrame(prediction[1]), pd.DataFrame(prediction[2]), pd.DataFrame(prediction[3]), pd.DataFrame(prediction[4]), pd.DataFrame(prediction[5])], axis=1)
result.columns = ['leaf.length', 'leaf.width', 'flower.length', 'flower.width','k3','k3_scikit','k5','k5_scikit','k7','k7_scikit']
result

Unnamed: 0,leaf.length,leaf.width,flower.length,flower.width,k3,k3_scikit,k5,k5_scikit,k7,k7_scikit
0,4.4,2.9,1.4,0.2,Arctica,Arctica,Arctica,Arctica,Arctica,Arctica
1,4.6,3.1,1.5,0.2,Arctica,Arctica,Arctica,Arctica,Arctica,Arctica
2,4.6,3.4,1.4,0.3,Arctica,Arctica,Arctica,Arctica,Arctica,Arctica
3,4.7,3.2,1.3,0.2,Arctica,Arctica,Arctica,Arctica,Arctica,Arctica
4,4.9,3.0,1.4,0.2,Arctica,Arctica,Arctica,Arctica,Arctica,Arctica
5,4.9,3.1,1.5,0.1,Arctica,Arctica,Arctica,Arctica,Arctica,Arctica
6,4.9,2.4,3.3,1.0,Harlequin,Harlequin,Harlequin,Harlequin,Harlequin,Harlequin
7,4.9,2.5,4.5,1.7,Harlequin,Harlequin,Harlequin,Harlequin,Harlequin,Harlequin
8,5.0,3.6,1.4,0.2,Arctica,Arctica,Arctica,Arctica,Arctica,Arctica
9,5.0,3.4,1.5,0.2,Arctica,Arctica,Arctica,Arctica,Arctica,Arctica


# Time for prediction by both methods

In [10]:
times

{'Time for k = 3 (Self)': 0.04487943649291992,
 'Time for k = 3 (Scikit)': 0.006009578704833984,
 'Time for k = 5 (Self)': 0.04388284683227539,
 'Time for k = 5 (Scikit)': 0.0019953250885009766,
 'Time for k = 7 (Self)': 0.04189634323120117,
 'Time for k = 7 (Scikit)': 0.002991199493408203}

# Check Predictions

In [11]:
precentage = (sum(((result['k3'] == result['k3_scikit']) == (result['k5'] == result['k5_scikit'])) == (result['k7'] == result['k7_scikit']))/30)*100

str(precentage) + '% of entries are equal'

'100.0% of entries are equal'

In [12]:
result.to_csv('result.csv', index=False)