In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as st
import numpy as np
import warnings
warnings.filterwarnings('ignore')

# Handle data

In [16]:
# Convert the raw text file to a .csv copy.
# The file carries no header row (the cell that builds `df` reads the same
# file with header=None), so it must be read with header=None here too;
# otherwise pandas uses the first observation as column labels and
# `read_file` ends up one row short.
read_file = pd.read_csv(r'/iris.data.txt', header=None)
# Write the records only: no index column and no synthetic 0..4 header line,
# so the .csv keeps the same header-less layout as the source file.
read_file.to_csv(r'/iris_data.csv', index=False, header=False)

In [15]:
# Load the header-less raw file and label its five columns in one chained
# expression: four numeric measurements followed by the class label.
df = pd.read_csv("/iris.data.txt", header=None).set_axis(
    ["sep_len", "sep_wid", "pet_len", "pet_wid", "category"],
    axis=1,
)

In [17]:
# Show the first five rows as a quick check on the load and the column labels.
df.head(n=5)


Unnamed: 0,sep_len,sep_wid,pet_len,pet_wid,category
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


# Data preprocessing 

In [None]:
from sklearn.preprocessing import LabelEncoder
# Learn the set of class labels present in `category`, then overwrite the
# column with the encoder's integer code for each label.
le = LabelEncoder().fit(df["category"])

df["category"] = le.transform(df["category"])

In [18]:
# Preview the frame after the label-encoding step.
# NOTE(review): the saved output of this cell still lists string species
# names, and the encoding cell is marked `In [None]`, so it looks like the
# encoder was never executed in the saved run — confirm with
# Restart Kernel -> Run All.
df.head(n=5)

Unnamed: 0,sep_len,sep_wid,pet_len,pet_wid,category
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [25]:
# Every column except the class label is treated as a numeric feature.
features = df.columns.difference(["category"])

# Z-score each feature in place: subtract the column mean and divide by the
# column standard deviation (pandas' default, i.e. the sample estimate).
# NOTE(review): these statistics are computed on the full frame, before the
# train/test split performed later in the notebook, so test rows influence
# the scaling — confirm this is acceptable for the evaluation.
for column in features:
    mu, sigma = df[column].mean(), df[column].std()

    df[column] = df[column].sub(mu).div(sigma)

In [26]:
# Preview the first five rows to inspect the standardized feature columns.
df.head(n=5)

Unnamed: 0,sep_len,sep_wid,pet_len,pet_wid,category
0,-0.897674,1.028611,-1.336794,-1.308593,Iris-setosa
1,-1.1392,-0.12454,-1.336794,-1.308593,Iris-setosa
2,-1.380727,0.33672,-1.39347,-1.308593,Iris-setosa
3,-1.50149,0.10609,-1.280118,-1.308593,Iris-setosa
4,-1.018437,1.259242,-1.336794,-1.308593,Iris-setosa


In [27]:
from sklearn.model_selection import train_test_split

# Separate the predictors (every column but `category`) from the target.
x = df.drop(columns=["category"])
y = df["category"]

# Hold out 20% of the rows for evaluation; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=0
)

# Grouping all the functions into one main class

In [34]:
from scipy.spatial import distance

class Knn:
  """Brute-force k-nearest-neighbours classifier for pandas data.

  The training set is stored as given by ``fit``. For each query row,
  ``predict`` measures the distance to every training row, keeps the ``k``
  closest ones and returns the most frequent label among them.

  Parameters
  ----------
  k : int
      Number of neighbours that vote; must be at least 1.
  distance_metric : str
      ``"euclidian"`` (default, the spelling used throughout this notebook),
      its alias ``"euclidean"``, or ``"manhattan"``.

  Raises
  ------
  ValueError
      If ``k`` is smaller than 1 or ``distance_metric`` is not recognised.
  """

  # Accepted metric names mapped to the SciPy function implementing them.
  _METRICS = {
      "euclidian": distance.euclidean,
      "euclidean": distance.euclidean,
      "manhattan": distance.cityblock,
  }

  def __init__(self, k, distance_metric = "euclidian"):
    if k < 1:
        raise ValueError("k must be at least 1, got %r" % (k,))
    # Fail fast on a misspelt metric instead of letting a None distance
    # surface later as an unrelated comparison error.
    if distance_metric not in self._METRICS:
        raise ValueError(
            "unknown distance_metric %r; expected one of %s"
            % (distance_metric, sorted(self._METRICS))
        )
    self.k = k
    self.distance_metric = distance_metric


  def fit(self, x_train, y_train):
    """Store the training features and labels; no computation happens here.

    ``x_train`` and ``y_train`` are later indexed with ``.iloc`` and must
    therefore be position-aligned pandas objects of equal length.
    """
    self.x_train = x_train
    self.y_train = y_train


  def predict(self, X):
    """Return a list with one predicted label per row of ``X``.

    ``X`` is indexed with ``.iloc`` and must have the same feature columns,
    in the same order, as the training features passed to ``fit``.
    """
    return [
        self.__getResponse(
            self.__getNeighbors(self.x_train, self.y_train, X.iloc[i], self.k)
        )
        for i in range(len(X))
    ]

  def __Distance(self, instance1, instance2):
    """Distance between two feature vectors under the configured metric.

    Both vectors are used in full: the label column is already separate
    from the features, so nothing is sliced off and no notebook-level
    global is consulted.
    """
    return self._METRICS[self.distance_metric](instance1, instance2)

  def __getNeighbors(self, x_train, y_train, inst, k):
    """Return the ``k`` closest training rows as ``(distance, label)`` pairs.

    The pairs are ordered by increasing distance. A list is used rather
    than a dict keyed by distance so that training rows lying at exactly
    the same distance each keep their own vote.
    """
    scored = [
        (self.__Distance(inst, x_train.iloc[idx]), y_train.iloc[idx])
        for idx in range(len(x_train))
    ]
    # list.sort is stable, so equidistant rows keep their training order.
    scored.sort(key=lambda pair: pair[0])
    return scored[:k]

  def __getResponse(self, neighbors):
    """Majority vote over ``(distance, label)`` pairs.

    When several labels share the top count, the one encountered first
    (i.e. belonging to the closest neighbour among them) is returned.
    """
    votes = {}
    for _, label in neighbors:
        votes[label] = votes.get(label, 0) + 1

    return max(votes, key = votes.get)


  def getAccuracy(self, predictions, y_test):
    """Fraction of positions where ``predictions[i]`` equals ``y_test.iloc[i]``.

    Raises ``ZeroDivisionError`` when ``y_test`` is empty.
    """
    hits = sum(
        1 for i in range(len(y_test)) if y_test.iloc[i] == predictions[i]
    )
    return hits / float(len(y_test))

# Main

In [39]:
print("\n\nFunction : euclidian" )
print ("K values : ")
max_acc = 0
# Reset so that a value left over from another cell can never be reported.
best_k = None
# Only odd k values are tried, which reduces the chance of tied votes.
for k in range(1, 6, 2):

    print(k)
    # This section evaluates the Euclidean metric, matching the banner
    # printed above (the classifier used to be built with "manhattan").
    k_c = Knn(k = k, distance_metric = "euclidian")
    k_c.fit(x_train, y_train)
    predictions = k_c.predict(x_test)
    acc = k_c.getAccuracy(predictions, y_test)

    # Strict comparison: on equal accuracy the smaller k is kept.
    if acc > max_acc:
        max_acc = acc
        best_k = k
print("Best outcome : ")
print("  k =", best_k)
print("  accuracy =", max_acc)



Function : euclidian
K values : 
1
3
5
Best outcome : 
  k = 5
  accuracy = 1.0


# Another distance metric: the Manhattan distance



In [48]:
print("Function : manhattan" )
print ("K values : ")
max_acc = 0
# Reset so that a best_k left over from a previously run cell is never
# reported as this metric's result when no k beats the initial accuracy.
best_k = None
# Only odd k values are tried, which reduces the chance of tied votes.
for k in range(1, 6, 2):

    print(k)
    k_c = Knn(k = k, distance_metric = "manhattan")
    k_c.fit(x_train, y_train)
    predictions = k_c.predict(x_test)
    acc = k_c.getAccuracy(predictions, y_test)

    # Strict comparison: on equal accuracy the smaller k is kept.
    if acc > max_acc:
        max_acc = acc
        best_k = k
print("Best outcome : ")
print(" k =", best_k)
print(" accuracy =", max_acc)

Function : manhattan
K values : 
1
3
5
Best outcome : 
 k = 5
 accuracy = 1.0
