In [16]:
%matplotlib inline
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import math
from collections import Counter


In [17]:
#Acquire and View the Data
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/iris/iris.data"

df = pd.read_csv(url,header = None)

df.columns = ['sepal-length','sepal-width','petal-length','petal-width','type']
display(df.head())

Unnamed: 0,sepal-length,sepal-width,petal-length,petal-width,type
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [18]:
#Seperate the data randomly for testing

train,test = train_test_split(df,shuffle = True,test_size=.2)

In [19]:
#Create a distance function that generates a list of distances to each existing point

def distance(testrow,data,k):
    distance = {}
    
    #Create a dictionary of distances 
    for index,row in data.iterrows():
        distance[index] =  math.sqrt((row['sepal-length'] - testrow['sepal-length'])**2 + (row['sepal-width'] - testrow['sepal-width'])**2 + (row['petal-length'] - testrow['petal-length'])**2 + (row['petal-width'] - testrow['petal-width'])**2)

        
    #Sort the dictionary by the values(distances) and select the first k elements  
    sorted_dist = [(k, distance[k]) for k in sorted(distance, key=distance.get, reverse=False)]
    sorted_dict = dict(sorted_dist[0:k])
    

    #Get the flower type of each nearest neighbor
    flower_type = []
    for key in sorted_dict:
        flower_type.append(df.loc[key]['type'])
     
    #Get the frequencies of the different types of flowers
    counter = Counter(flower_type)
    freq_list = sorted(counter.values())
  
    #Return the String with the greatest frequency. If there is a tie, recursively decrease k until no such tie is present
    if len(freq_list) == 1:
        return counter.most_common()[0][0]
    elif freq_list[0] == freq_list[1]:
        return distance(testrow,data,k-1)
    else:
        return counter.most_common()[0][0]

In [20]:
predicted_vals = []
for i in range(len(test)):
    testrow = test.iloc[[i]]
    dist = distance(testrow,train,5)
    predicted_vals.append(dist)
    
#Get the percent accuracy
percent = 0
for idx,i in enumerate(predicted_vals):
    if i == test['type'].iloc[idx]:
        percent = percent + 1;
      
print('Percent Accuracy of model made from scratch: ')
print(percent/len(test) * 100)


Percent Accuracy of model made from scratch: 
100.0


In [21]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(train.iloc[:,0:4], train['type'])



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')

In [22]:
pred = knn.score(test.iloc[:,0:4],test.iloc[:,4:])
print('Percent accuracy of Sklearn Model: %s' %(pred))

Percent accuracy of Sklearn Model: 1.0


In [None]:
#Here, we get a 100% fit, but it really depends on the random sample that is trained. More data is needed.