### Importing Necessary Libraries

In [1]:
import numpy as np
import pandas as pd
import math

### Working With Dataset

In [2]:
# Load the Iris dataset; the relative path means Iris.csv must sit in the working directory.
df = pd.read_csv('Iris.csv')

In [3]:
# Preview the first rows to check that the expected columns were loaded.
df.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,Iris-setosa
1,2,4.9,3.0,1.4,0.2,Iris-setosa
2,3,4.7,3.2,1.3,0.2,Iris-setosa
3,4,4.6,3.1,1.5,0.2,Iris-setosa
4,5,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
# Count the samples per species to check the class balance before splitting.
df['Species'].value_counts()

Unnamed: 0_level_0,count
Species,Unnamed: 1_level_1
Iris-setosa,50
Iris-versicolor,50
Iris-virginica,50


In [5]:
# Encode each species as a *string* label (later cells compare against '0'/'1'/'2'):
#   Iris-setosa     -> '0'
#   Iris-versicolor -> '1'
#   Iris-virginica  -> '2'
# replace() leaves values it does not recognise untouched, so re-running this cell
# after the labels are already encoded is a no-op (map() would turn them into NaN).
df['Species'] = df['Species'].replace({"Iris-setosa":'0',"Iris-versicolor":'1',"Iris-virginica":'2'})
df['Species'].value_counts()

Unnamed: 0_level_0,count
Species,Unnamed: 1_level_1
0,50
1,50
2,50


In [6]:
# One sub-frame per encoded class label, in label order.
df0, df1, df2 = (df[df['Species'] == code] for code in ('0', '1', '2'))

In [7]:
# Spot-check the subset selected for class '0'.
df0.head()

Unnamed: 0,Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
0,1,5.1,3.5,1.4,0.2,0
1,2,4.9,3.0,1.4,0.2,0
2,3,4.7,3.2,1.3,0.2,0
3,4,4.6,3.1,1.5,0.2,0
4,5,5.0,3.6,1.4,0.2,0


### Splitting into test and train data

In [8]:
# Per-class split by position: the first 40 rows of every class go to training,
# the remaining rows of that class go to testing (no shuffling).
train_frames = tuple(part.iloc[:40, :] for part in (df0, df1, df2))
test_frames = tuple(part.iloc[40:, :] for part in (df0, df1, df2))

# Stack the per-class pieces back into a single frame per split.
train_df = pd.concat(train_frames)
test_df = pd.concat(test_frames)

In [9]:
# Check how many training samples each class contributes.
train_df['Species'].value_counts()

Unnamed: 0_level_0,count
Species,Unnamed: 1_level_1
0,40
1,40
2,40


In [10]:
# Check how many test samples each class contributes.
test_df['Species'].value_counts()

Unnamed: 0_level_0,count
Species,Unnamed: 1_level_1
0,10
1,10
2,10


In [11]:
# 'Species' is kept on purpose: the training labels are read from train_df later on.
# Only the row identifier is removed, so the four measurements become the leading columns.
# errors='ignore' keeps this cell re-runnable once 'Id' has already been dropped.
train_df = train_df.drop(columns='Id', errors='ignore')
test_df = test_df.drop(columns='Id', errors='ignore')

In [12]:
## Exporting the created dataframes
# train_df.to_csv('train_df.csv', index=False)
# test_df.to_csv('test_df.csv', index=False)

In [13]:
# Preview the test split after dropping 'Id'.
test_df.head()

Unnamed: 0,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
40,5.0,3.5,1.3,0.3,0
41,4.5,2.3,1.3,0.3,0
42,4.4,3.2,1.3,0.2,0
43,5.0,3.5,1.6,0.6,0
44,5.1,3.8,1.9,0.4,0


### Calculating Distance Between Each Test Sample To Each Train Sample

In [14]:
# Convert both frames to NumPy arrays for positional indexing in the distance functions.
# 'Species' has not been dropped, so each row is the four measurements followed by the label.
train_arr = train_df.values
test_arr = test_df.values

In [15]:
# Euclidean Distance
def euclidean_distance(train_arr, test_arr):
  """Pairwise Euclidean distance between every test row and every train row.

  Only the first four columns of each row are treated as features; any
  further columns (for example a trailing label column) are ignored.

  Parameters
  ----------
  train_arr : array-like, shape (n_train, >= 4)
      Training samples, one per row.
  test_arr : array-like, shape (n_test, >= 4)
      Test samples, one per row.

  Returns
  -------
  numpy.ndarray, shape (n_test, n_train)
      Entry [i, j] is the distance from test row i to train row j,
      rounded to 2 decimal places.
  """
  # Slice the features *before* casting: the arrays may carry a non-numeric
  # label column, which would make a whole-array float cast fail.
  train_feats = np.asarray(train_arr)[:, :4].astype(float)
  test_feats = np.asarray(test_arr)[:, :4].astype(float)

  # Broadcast to shape (n_test, n_train, 4) so the result size follows the
  # inputs instead of a fixed 30 x 120 grid.
  diffs = test_feats[:, np.newaxis, :] - train_feats[np.newaxis, :, :]
  return np.round(np.sqrt((diffs ** 2).sum(axis=2)), 2)

# Rows of the matrix correspond to test samples, columns to training samples.
euclidean_mat = euclidean_distance(train_arr, test_arr)
# Show the distances from the first test sample to every training sample.
euclidean_mat[0]

array([0.17, 0.53, 0.44, 0.61, 0.17, 0.7 , 0.42, 0.24, 0.86, 0.5 , 0.5 ,
       0.39, 0.58, 0.91, 0.95, 1.16, 0.57, 0.14, 0.86, 0.37, 0.58, 0.32,
       0.52, 0.5 , 0.65, 0.59, 0.33, 0.3 , 0.26, 0.53, 0.55, 0.47, 0.69,
       0.87, 0.5 , 0.33, 0.51, 0.5 , 0.79, 0.26, 4.11, 3.71, 4.26, 3.16,
       3.88, 3.5 , 3.87, 2.39, 3.85, 2.94, 2.75, 3.3 , 3.24, 3.79, 2.65,
       3.72, 3.51, 3.1 , 3.85, 2.96, 3.92, 3.16, 4.13, 3.75, 3.51, 3.69,
       4.15, 4.33, 3.61, 2.57, 2.89, 2.78, 2.97, 4.22, 3.48, 3.6 , 4.01,
       3.71, 3.07, 3.09, 5.35, 4.28, 5.39, 4.77, 5.13, 6.19, 3.63, 5.74,
       5.14, 5.72, 4.44, 4.6 , 4.94, 4.25, 4.47, 4.7 , 4.73, 6.34, 6.59,
       4.22, 5.2 , 4.09, 6.31, 4.19, 5.05, 5.41, 4.05, 4.08, 4.92, 5.2 ,
       5.64, 6.12, 4.95, 4.25, 4.66, 5.88, 4.96, 4.69, 3.97, 4.88])

In [16]:
# Manhattan Distance
def manhattan_distance(train_arr, test_arr):
  """Pairwise Manhattan (L1) distance between every test row and every train row.

  Only the first four columns of each row are treated as features; any
  further columns (for example a trailing label column) are ignored.

  Parameters
  ----------
  train_arr : array-like, shape (n_train, >= 4)
      Training samples, one per row.
  test_arr : array-like, shape (n_test, >= 4)
      Test samples, one per row.

  Returns
  -------
  numpy.ndarray, shape (n_test, n_train)
      Entry [i, j] is the sum of absolute feature differences between
      test row i and train row j, rounded to 2 decimal places.
  """
  # Slice the features *before* casting: the arrays may carry a non-numeric
  # label column, which would make a whole-array float cast fail.
  train_feats = np.asarray(train_arr)[:, :4].astype(float)
  test_feats = np.asarray(test_arr)[:, :4].astype(float)

  # Broadcast to shape (n_test, n_train, 4) so the result size follows the
  # inputs instead of a fixed 30 x 120 grid.
  diffs = test_feats[:, np.newaxis, :] - train_feats[np.newaxis, :, :]
  return np.round(np.abs(diffs).sum(axis=2), 2)

# Rows of the matrix correspond to test samples, columns to training samples.
manhattan_mat = manhattan_distance(train_arr, test_arr)
# Show the distances from the first test sample to every training sample.
manhattan_mat[0]

array([ 0.3,  0.8,  0.7,  1.1,  0.3,  1.3,  0.6,  0.4,  1.4,  0.9,  0.9,
        0.7,  1. ,  1.6,  1.5,  1.9,  0.9,  0.2,  1.4,  0.6,  1. ,  0.6,
        0.9,  0.9,  1. ,  0.9,  0.5,  0.5,  0.5,  1. ,  1. ,  0.8,  1.2,
        1.4,  0.9,  0.5,  0.6,  0.9,  1.2,  0.5,  6.8,  6.1,  7.1,  5.4,
        6.7,  5.6,  6.2,  3.9,  6.5,  4.7,  4.4,  5.5,  5.7,  6.2,  4.5,
        6.3,  5.5,  5.1,  6.9,  5. ,  6.2,  5.5,  7.1,  6.1,  6. ,  6.3,
        7.1,  7.3,  6. ,  4.5,  4.9,  4.7,  5.1,  6.9,  5.3,  5.6,  6.7,
        6.6,  4.9,  5.2,  8.4,  7. ,  9. ,  7.7,  8.4, 10.2,  5.7,  9.4,
        8.7,  9.3,  7.3,  7.8,  8.3,  7.1,  7.4,  7.7,  7.7, 10.3, 11.2,
        7.2,  8.6,  6.6, 10.5,  7.2,  8.1,  8.7,  6.9,  6.7,  8.2,  8.5,
        9.5, 10. ,  8.3,  7. ,  7.4, 10. ,  7.8,  7.5,  6.5,  8.2])

In [17]:
# Minkowski Distance
def minkowski_distance(train_arr, test_arr, q):
  """Pairwise Minkowski distance of order q between test rows and train rows.

  The distance is (sum_k |x_k - y_k| ** q) ** (1 / q); q = 1 reduces to the
  Manhattan distance and q = 2 to the Euclidean distance. Only the first four
  columns of each row are treated as features; any further columns (for
  example a trailing label column) are ignored.

  Parameters
  ----------
  train_arr : array-like, shape (n_train, >= 4)
      Training samples, one per row.
  test_arr : array-like, shape (n_test, >= 4)
      Test samples, one per row.
  q : int or float
      Order of the norm. Must be non-zero because the formula divides by q.

  Returns
  -------
  numpy.ndarray, shape (n_test, n_train)
      Entry [i, j] is the distance from test row i to train row j,
      rounded to 2 decimal places.
  """
  # Slice the features *before* casting: the arrays may carry a non-numeric
  # label column, which would make a whole-array float cast fail.
  train_feats = np.asarray(train_arr)[:, :4].astype(float)
  test_feats = np.asarray(test_arr)[:, :4].astype(float)

  # Broadcast to shape (n_test, n_train, 4) so the result size follows the
  # inputs instead of a fixed 30 x 120 grid.
  diffs = test_feats[:, np.newaxis, :] - train_feats[np.newaxis, :, :]
  return np.round((np.abs(diffs) ** q).sum(axis=2) ** (1 / q), 2)

# Order of the Minkowski norm; q = 1 would reproduce Manhattan and q = 2 Euclidean.
q = 3
minkowski_mat = minkowski_distance(train_arr, test_arr, q)
# Show the distances from the first test sample to every training sample.
minkowski_mat[0]

array([0.14, 0.5 , 0.38, 0.52, 0.14, 0.58, 0.4 , 0.22, 0.76, 0.43, 0.43,
       0.33, 0.52, 0.79, 0.86, 1.03, 0.51, 0.13, 0.76, 0.33, 0.51, 0.26,
       0.45, 0.43, 0.61, 0.53, 0.31, 0.26, 0.22, 0.43, 0.46, 0.42, 0.62,
       0.78, 0.43, 0.31, 0.5 , 0.43, 0.7 , 0.22, 3.65, 3.34, 3.81, 2.82,
       3.46, 3.25, 3.52, 2.13, 3.46, 2.69, 2.43, 3.  , 2.85, 3.48, 2.39,
       3.3 , 3.27, 2.86, 3.37, 2.68, 3.61, 2.82, 3.72, 3.47, 3.14, 3.28,
       3.69, 3.88, 3.29, 2.29, 2.6 , 2.5 , 2.68, 3.88, 3.26, 3.3 , 3.58,
       3.26, 2.86, 2.79, 4.89, 3.91, 4.83, 4.4 , 4.66, 5.57, 3.32, 5.2 ,
       4.65, 5.09, 3.98, 4.15, 4.41, 3.85, 4.02, 4.21, 4.33, 5.69, 5.89,
       3.82, 4.64, 3.74, 5.67, 3.75, 4.58, 4.9 , 3.64, 3.72, 4.46, 4.7 ,
       5.05, 5.45, 4.47, 3.9 , 4.36, 5.17, 4.5 , 4.31, 3.62, 4.34])

In [18]:
# Training labels, in the same row order as train_arr and therefore as the
# columns of the distance matrices.
train_species = train_df['Species'].values
train_species

array(['0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0',
       '0', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1', '1',
       '1', '1', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2',
       '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2',
       '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2', '2',
       '2', '2', '2'], dtype=object)

### Building KNN Model

In [19]:
# ************************************ My Approach ************************************
# (0 <= i < 30) and (0 <= j < 120)
# Step-1. Create a dataframe for the ith row of test_df which contains its distance to every jth row in train_df and that row's species
# Step-2. Sort the dataframe by distance in ascending order
# Step-3. Get the first k species from the species columns of sorted dataframe
# Step-4. Pick the species having max count as the class for the ith data in the test_df

In [20]:
def _majority_label(neighbour_labels):
  """Return the most frequent label; ties go to the label seen first (the nearest one)."""
  votes = {}
  for label in neighbour_labels:
    votes[label] = votes.get(label, 0) + 1
  # dicts iterate in insertion order and max() keeps the first maximum, so a
  # tie is resolved in favour of the class whose neighbour is closest.
  return max(votes, key=votes.get)


def knn(k, distance_type, distance_name, train_labels=None):
  """Classify every test sample by majority vote among its k nearest training samples.

  Prints one line per test sample; nothing is returned.

  Parameters
  ----------
  k : int
      Number of nearest neighbours that vote. Must be at least 1. Values
      larger than the number of training samples use all of them.
  distance_type : array-like, shape (n_test, n_train)
      Precomputed distance matrix; row i holds the distances from test
      sample i to every training sample.
  distance_name : str
      Human-readable name of the metric, used only in the printed header.
  train_labels : array-like of length n_train, optional
      Class label of each training sample, aligned with the columns of
      ``distance_type``. Defaults to the notebook-level ``train_species``.

  Raises
  ------
  ValueError
      If ``k`` is smaller than 1, or if the distance matrix is not 2-D with
      one column per training label.

  Notes
  -----
  Equal distances are ordered by training-row position (stable sort), and a
  tied vote is won by the class of the closest tied neighbour. Previously a
  tie between classes '0' and '1' fell through to class '2', even when '2'
  had received no votes at all.
  """
  if k < 1:
    raise ValueError(f"k must be at least 1, got {k}")

  labels = np.asarray(train_species if train_labels is None else train_labels)
  distances = np.asarray(distance_type, dtype=float)
  if distances.ndim != 2 or distances.shape[1] != len(labels):
    raise ValueError(
      "distance matrix must be 2-D with one column per training label; "
      f"got shape {distances.shape} for {len(labels)} labels"
    )

  print(f"KNN using {distance_name} for k = {k}\n")
  for i, row in enumerate(distances):
    # Indices of the k closest training samples, nearest first.
    nearest = np.argsort(row, kind='stable')[:k]
    species = _majority_label(labels[nearest].tolist())
    print(f"Test row {i+1} belongs to class {species}")

  print("\n")

##### Euclidean Distance

In [21]:
# Classify the test set from the Euclidean distances with 5, 10 and 20 neighbours.
knn(k=5, distance_type=euclidean_mat, distance_name='Euclidean Distance')
knn(k=10, distance_type=euclidean_mat, distance_name='Euclidean Distance')
knn(k=20, distance_type=euclidean_mat, distance_name='Euclidean Distance')

KNN using Euclidean Distance for k = 5

Test row 1 belongs to class 0
Test row 2 belongs to class 0
Test row 3 belongs to class 0
Test row 4 belongs to class 0
Test row 5 belongs to class 0
Test row 6 belongs to class 0
Test row 7 belongs to class 0
Test row 8 belongs to class 0
Test row 9 belongs to class 0
Test row 10 belongs to class 0
Test row 11 belongs to class 1
Test row 12 belongs to class 1
Test row 13 belongs to class 1
Test row 14 belongs to class 1
Test row 15 belongs to class 1
Test row 16 belongs to class 1
Test row 17 belongs to class 1
Test row 18 belongs to class 1
Test row 19 belongs to class 1
Test row 20 belongs to class 1
Test row 21 belongs to class 2
Test row 22 belongs to class 2
Test row 23 belongs to class 2
Test row 24 belongs to class 2
Test row 25 belongs to class 2
Test row 26 belongs to class 2
Test row 27 belongs to class 2
Test row 28 belongs to class 2
Test row 29 belongs to class 2
Test row 30 belongs to class 2


KNN using Euclidean Distance for k = 

##### Manhattan Distance

In [22]:
# Classify the test set from the Manhattan distances with 5, 10 and 20 neighbours.
knn(k=5, distance_type=manhattan_mat, distance_name='Manhattan Distance')
knn(k=10, distance_type=manhattan_mat, distance_name='Manhattan Distance')
knn(k=20, distance_type=manhattan_mat, distance_name='Manhattan Distance')

KNN using Manhattan Distance for k = 5

Test row 1 belongs to class 0
Test row 2 belongs to class 0
Test row 3 belongs to class 0
Test row 4 belongs to class 0
Test row 5 belongs to class 0
Test row 6 belongs to class 0
Test row 7 belongs to class 0
Test row 8 belongs to class 0
Test row 9 belongs to class 0
Test row 10 belongs to class 0
Test row 11 belongs to class 1
Test row 12 belongs to class 1
Test row 13 belongs to class 1
Test row 14 belongs to class 1
Test row 15 belongs to class 1
Test row 16 belongs to class 1
Test row 17 belongs to class 1
Test row 18 belongs to class 1
Test row 19 belongs to class 1
Test row 20 belongs to class 1
Test row 21 belongs to class 2
Test row 22 belongs to class 2
Test row 23 belongs to class 2
Test row 24 belongs to class 2
Test row 25 belongs to class 2
Test row 26 belongs to class 2
Test row 27 belongs to class 2
Test row 28 belongs to class 2
Test row 29 belongs to class 2
Test row 30 belongs to class 2


KNN using Manhattan Distance for k = 

##### Minkowski Distance

In [23]:
# Classify the test set from the Minkowski distances with 5, 10 and 20 neighbours.
knn(k=5, distance_type=minkowski_mat, distance_name='Minkowski Distance')
knn(k=10, distance_type=minkowski_mat, distance_name='Minkowski Distance')
knn(k=20, distance_type=minkowski_mat, distance_name='Minkowski Distance')

KNN using Minkowski Distance for k = 5

Test row 1 belongs to class 0
Test row 2 belongs to class 0
Test row 3 belongs to class 0
Test row 4 belongs to class 0
Test row 5 belongs to class 0
Test row 6 belongs to class 0
Test row 7 belongs to class 0
Test row 8 belongs to class 0
Test row 9 belongs to class 0
Test row 10 belongs to class 0
Test row 11 belongs to class 1
Test row 12 belongs to class 1
Test row 13 belongs to class 1
Test row 14 belongs to class 1
Test row 15 belongs to class 1
Test row 16 belongs to class 1
Test row 17 belongs to class 1
Test row 18 belongs to class 1
Test row 19 belongs to class 1
Test row 20 belongs to class 1
Test row 21 belongs to class 2
Test row 22 belongs to class 2
Test row 23 belongs to class 2
Test row 24 belongs to class 2
Test row 25 belongs to class 2
Test row 26 belongs to class 2
Test row 27 belongs to class 2
Test row 28 belongs to class 2
Test row 29 belongs to class 2
Test row 30 belongs to class 2


KNN using Minkowski Distance for k = 