In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import math
import operator

In [2]:
# Importing data 
data = pd.read_csv("iris.csv")

data.head() 

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [3]:
# Defining a function which calculates euclidean distance between two data points
def euclideanDistance(data1, data2, length):
    distance = 0
    for x in range(length):
        distance += np.square(data1[x] - data2[x])
    return np.sqrt(distance)

In [4]:
# Defining our KNN model
def knn(trainingSet, testInstance, k):
 
    distances = {}
    sort = {} 
    length = testInstance.shape[1]
    
    # Calculating euclidean distance between each row of training data and test data
    for x in range(len(trainingSet)):
        
        dist = euclideanDistance(testInstance, trainingSet.iloc[x], length)

        distances[x] = dist[0]
        # Sorting them on the basis of distance
        sorted_d = sorted(distances.items(), key=operator.itemgetter(1))
        neighbors = []
        
        # Extracting top k neighbors
    for x in range(k):
        neighbors.append(sorted_d[x][0])
    
    classVotes = {}
    # Calculating the most freq class in the neighbors
    for x in range(len(neighbors)):
        response = trainingSet.iloc[neighbors[x]][-1]
 
        if response in classVotes:
            classVotes[response] += 1
        else:
            classVotes[response] = 1
    sortedVotes = sorted(classVotes.items(), key=operator.itemgetter(1), reverse=True)
    return(sortedVotes[0][0], neighbors)

In [5]:
# Creating a dummy testset
testSet = [[7.2, 3.6, 5.1, 2.5]]
test = pd.DataFrame(testSet)

In [7]:
# Setting number of neighbors = 1
k = 1

# Running KNN model
result,neigh = knn(data, test, k)

# Predicted class
print(result)

Iris-setosa


In [8]:
# Nearest neighbor
print(neigh)

[6]


In [9]:
# Now we will try to alter the k values, and see how the prediction changes.
# Setting number of neighbors = 3 
k = 3 
# Running KNN model 
result,neigh = knn(data, test, k) 
# Predicted class 
print(result) 

Iris-setosa


In [10]:
# 3 nearest neighbors
print(neigh)

[6, 7, 5]


In [11]:
# Setting number of neighbors = 5
k = 5
# Running KNN model 
result,neigh = knn(data, test, k) 
# Predicted class 
print(result)

Iris-setosa


In [12]:
# 5 nearest neighbors
print(neigh)

[6, 7, 5, 8, 4]


In [14]:
# Comparing our model with scikit-learn
from sklearn.neighbors import KNeighborsClassifier
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(data.iloc[:,0:4], data['Species'])

# Predicted class
print(neigh.predict(test))

# 3 nearest neighbors
print(neigh.kneighbors(test)[1])

['Iris-setosa']
[[6 7 5]]


We can see that both the models predicted the same class (‘Iris-setosa’) and the same nearest neighbors ( [6，7，5] ). Hence we can conclude that our model runs as expected.