In [3]:
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt

In [4]:
# Load the iris table; the relative path means iris.csv must sit in the working directory.
df = pd.read_csv('iris.csv')

**Investigating the dataset**

In [5]:
df.head(10)

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,Setosa
1,4.9,3.0,1.4,0.2,Setosa
2,4.7,3.2,1.3,0.2,Setosa
3,4.6,3.1,1.5,0.2,Setosa
4,5.0,3.6,1.4,0.2,Setosa
5,5.4,3.9,1.7,0.4,Setosa
6,4.6,3.4,1.4,0.3,Setosa
7,5.0,3.4,1.5,0.2,Setosa
8,4.4,2.9,1.4,0.2,Setosa
9,4.9,3.1,1.5,0.1,Setosa


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
sepal.length    150 non-null float64
sepal.width     150 non-null float64
petal.length    150 non-null float64
petal.width     150 non-null float64
variety         150 non-null object
dtypes: float64(4), object(1)
memory usage: 5.3+ KB


**Replacing categorical data**

In [7]:
# Work on a copy so the original frame `df` keeps its string variety names.
df_copy = df.copy()

In [8]:
# Integer code for each variety name, in DataFrame.replace's nested form {column: {old: new}}.
# Any code that turns a class number back into a name has to follow this mapping.
replace_cat_data = {'variety':{'Setosa':1,'Versicolor':2,'Virginica':3}}

In [9]:
# Swap the variety names in df_copy for their integer codes (modifies df_copy in place).
df_copy.replace(replace_cat_data,inplace=True)

In [10]:
df_copy.head()

Unnamed: 0,sepal.length,sepal.width,petal.length,petal.width,variety
0,5.1,3.5,1.4,0.2,1
1,4.9,3.0,1.4,0.2,1
2,4.7,3.2,1.3,0.2,1
3,4.6,3.1,1.5,0.2,1
4,5.0,3.6,1.4,0.2,1


In [11]:
# NumPy matrix of the encoded frame: columns 0-3 hold the four measurements,
# column 4 the class code. loadDataSet reads this notebook-level name directly.
data = df_copy.values

**Splitting the data set into two parts (training set and test set). The split ratio is given by the user; this example sends roughly 67% of the rows to the training set.**

In [12]:
def loadDataSet(training_data, test_data, split, dataset=None):
    """Randomly partition the rows of a 2-D array into training and test lists.

    Each row is drawn independently: it goes to ``training_data`` when
    ``random.random() < split`` and to ``test_data`` otherwise. The sizes
    therefore only approximate the requested ratio and change from run to
    run unless ``random.seed`` has been called beforehand.

    Parameters
    ----------
    training_data : list
        Receives (via ``append``) the rows chosen for training.
    test_data : list
        Receives (via ``append``) all remaining rows.
    split : float
        Probability that a row is assigned to ``training_data``.
    dataset : array-like, optional
        Rows to partition. When omitted, the notebook-level ``data`` matrix
        is used, so existing three-argument calls behave as before.

    Returns
    -------
    None
        Both lists are mutated in place.
    """
    if dataset is None:
        dataset = data  # fall back to the notebook-level matrix
    for row in dataset:
        if random.random() < split:
            training_data.append(row)
        else:
            test_data.append(row)

In [13]:
# Smoke test for loadDataSet on the notebook-level `data` matrix.
# Seed the generator first: the split is random, and every result computed
# from training_set / test below is only reproducible with a fixed seed.
random.seed(42)
split_ratio=0.67
training_data = []
test_data = []

loadDataSet(training_data,test_data,split_ratio)
training_set = np.asarray(training_data)
test = np.asarray(test_data)
# Every row must land in exactly one of the two subsets.
assert len(training_set) + len(test) == len(data)
print(training_set.shape)
print(test.shape)

(91, 5)
(59, 5)


**Calculating the distance between two samples**

In [14]:
def getdistance(data1, data2):
    """Return the norm of the difference between two samples' measurements.

    Only the first four columns take part in the comparison; any further
    column (such as the class code) is ignored. Both arguments must be
    2-D arrays, e.g. a single row picked with ``arr[[i], :]``; for
    single-row inputs the result is the Euclidean distance.
    """
    feature_gap = data1[:, :4] - data2[:, :4]
    distance = np.linalg.norm(feature_gap)
    return distance

In [15]:
# Smoke test: distance between one training row and one test row.
# Indexing with [[i], :] keeps each selection 2-D, which getdistance's
# [:, 0:4] slicing requires.
a = training_set[[1],:]
b = test[[2],:]
getdistance(a,b)

0.264575131106459

* Calculate the distance between each training sample and the test sample.
* Store each training sample together with its distance in a list.
* Sort the list by distance (ascending order).
* Return only the k neighbours that have the smallest distance from the test sample.

In [16]:
import pdb
from operator import itemgetter
def getNeighbour(dataset, instance, k):
    """Return the ``k`` rows of ``dataset`` that lie closest to ``instance``.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array of candidate rows (one sample per row).
    instance : numpy.ndarray
        2-D single-row array holding the query sample.
    k : int
        Number of neighbours to return.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(k, dataset.shape[1])`` with the nearest rows,
        ordered from closest to farthest.

    Raises
    ------
    IndexError
        If ``k`` is larger than the number of rows in ``dataset``.
    """
    # Take the width from the input rather than assuming five columns.
    n_columns = dataset.shape[1]

    scored = []
    for i in range(len(dataset)):
        # [[i], :] keeps the row 2-D, the shape getdistance expects.
        row = dataset[[i], :]
        scored.append((row, getdistance(row, instance)))

    # list.sort is stable, so equally distant rows keep their dataset order.
    scored.sort(key=itemgetter(1))

    nearest = [scored[j][0] for j in range(k)]
    return np.asarray(nearest).reshape(k, n_columns)

In [17]:
# Smoke test: show one test row, then its three nearest training rows.
c = test[[10],:]  # [[10], :] keeps the row 2-D for getNeighbour / getdistance
print(c)
getNeighbour(training_set,c,3)



[[4.7 3.2 1.6 0.2 1. ]]


array([[4.8, 3.1, 1.6, 0.2, 1. ],
       [4.6, 3.1, 1.5, 0.2, 1. ],
       [4.8, 3.4, 1.6, 0.2, 1. ]])

* Create a dictionary to count the number of votes each class gets.
* Sort the vote dictionary so the class with the most votes comes first.
* Return the class with the most votes.

In [18]:
def getResponse(neighbour):
    """Return the class code that occurs most often among the neighbour rows.

    Column 4 of each row is read as the class code, which must be one of
    1, 2 or 3 (anything else raises ``KeyError``). When several classes
    share the top count, the lowest code wins; with no rows at all the
    result is 1.
    """
    tally = {1: 0, 2: 0, 3: 0}
    for idx in range(len(neighbour)):
        tally[neighbour[idx, 4]] += 1
    # max() yields the first key holding the top count, i.e. dict order 1, 2, 3.
    return max(tally, key=tally.get)

In [19]:
# Smoke test: majority vote among the 3 nearest of training rows 1-9
# for one row taken from the full `data` matrix.
a = training_set[1:10,:]
c = data[[1],:]
x = getNeighbour(a,c,3)
getResponse(x)

1

**Measuring the accuracy of the classifier**

In [20]:
def getAccuracy(prediction, test):
    """Return the percentage of test rows whose class was predicted correctly.

    Parameters
    ----------
    prediction : sequence
        Predicted class codes, one per row of ``test`` and in the same order.
    test : numpy.ndarray
        2-D array whose column 4 holds the true class code.

    Returns
    -------
    float
        Share of matching predictions, scaled to 0-100.

    Raises
    ------
    ZeroDivisionError
        If ``test`` has no rows.
    """
    hits = 0
    for i in range(len(test)):
        # Read the scalar with test[i, 4]; calling int() on the one-element
        # array test[[i], 4] is deprecated since NumPy 1.25.
        if prediction[i] == int(test[i, 4]):
            hits += 1

    return (hits / len(test)) * 100
    

In [29]:
import pdb
def main(split_ratio=0.67, k=3, seed=None, verbose=False):
    """Split the data, classify every test row with k-NN and print the accuracy.

    The rows come from the notebook-level ``data`` matrix (read inside
    ``loadDataSet``). Each test row is classified by majority vote among
    its ``k`` nearest training rows.

    Parameters
    ----------
    split_ratio : float, optional
        Probability that a row is placed in the training set.
    k : int, optional
        Number of neighbours that vote on each prediction.
    seed : int, optional
        When given, passed to ``random.seed`` so the split is repeatable.
        The default leaves the generator untouched.
    verbose : bool, optional
        When true, print the predicted and the actual variety for every
        test row.

    Returns
    -------
    None
        Results are printed.
    """
    if seed is not None:
        random.seed(seed)

    training_data = []
    test_data = []
    loadDataSet(training_data,test_data,split_ratio)
    training_set = np.asarray(training_data)
    test = np.asarray(test_data)

    print("Training set "+str(len(training_set)))
    print("Test set "+str(len(test)))

    # Must mirror the encoding applied to the 'variety' column:
    # Setosa -> 1, Versicolor -> 2, Virginica -> 3.
    label = {1:'Setosa',2:'Versicolor',3:'Virginica'}

    prediction = []
    for i in range(len(test)):
        neighbour = getNeighbour(training_set,test[[i],:],k)
        result = getResponse(neighbour)
        prediction.append(result)
        if verbose:
            # test[i, 4] is a scalar; int() on the one-element array
            # test[[i], 4] is deprecated since NumPy 1.25.
            actual = int(test[i,4])
            print('prediction '+ label[result] + ' actual '+ label[actual])

    accuracy = getAccuracy(prediction,test)
    print("\nAccuracy of the classifier is "+str(accuracy)+"%")

main()    

Training set 104
Test set 46

Accuracy of the classifier is 95.65217391304348%
