In [2]:
import pandas as pd
import numpy as np
import warnings
import math
warnings.filterwarnings('ignore')
pd.set_option('max_rows', 200)

# Columns from given data, includes class
cols = ['sepal length', 'sepal width', 'petal length', 'petal width', 'class']
# Columns with only numeric values
numCols = ['sepal length', 'sepal width', 'petal length', 'petal width']
# Create DataFrame with values read from txt file
data = pd.DataFrame(pd.read_csv('iris.txt', header=None, names=cols))

# Testing sample data
samples = pd.DataFrame({
    'sepal length': [4.9, 4.9, 4.9],
    'sepal width': [3.0, 2.4, 2.5],
    'petal length': [1.4, 3.3, 4.5],
    'petal width': [0.2, 1.0, 1.7]    
})

# Normalize training set and testing samples
dataNum = data[numCols]
sampleNum = samples[numCols]
dataMin = dataNum - dataNum.min()
normalData = (dataNum - dataNum.min()) / dataMin.max()
normalSample = (sampleNum - dataNum.min()) / dataNum.max()

# Find euclidian distance between all attributes(except class) of chosen sample 
# against training set
def eucDist(row):
    diff = 0;
    for k in numCols:
        diff += ((row[k] - chosenSample[k]) ** 2)
    return math.sqrt(diff)

chosenSample= normalSample.iloc[0]

row = eucDist(normalData.iloc[0])

In [3]:
# Sample 1
# Set chosen to sample 1
chosenSample= normalSample.iloc[0]

# Calculate euclidian distances from training set to sample
distances = normalData.apply(eucDist, axis=1)
distFrame = pd.DataFrame(data={"dist": distances, "idx": distances.index})

# Create list of distances and sort from least to greatest
closest = distFrame.sort_values(by=['dist'])

# To find nearest neighbor, we just need to look at the top values of the list
# depending on how many neighbors we want to look at.
# In the event there were nearest neighbors with different classes, we would 
# predict the class that comes up the most often. If there is a tie between
# classes the results are inconclusive, this is why an odd number of neighbors
# is usually used.
r = closest.iloc[0].idx
print("Nearest neighbor is entry #", int(r))
print("With values: ")
print(data.iloc[int(r)], "\n")

print("Three nearest neighbors\n")
for i in range(3):
    r = closest.iloc[i].idx
    print(i+1, " : ", data.iloc[int(r)]['class'], " at index: ", int(r))
print("\nFive nearest neighbors\n")
for i in range(5):
    r = closest.iloc[i].idx
    print(i+1, " : ", data.iloc[int(r)]['class'], " at index: ", int(r))
    
print("\nFor each nearest neighbor, 3NN, and 5NN, the class is Iris-setosa")
print("So we can predict sample 0 will also be Iris-setosa")    

Nearest neighbor is entry # 40
With values: 
sepal length            4.5
sepal width             2.3
petal length            1.3
petal width             0.3
class           Iris-setosa
Name: 40, dtype: object 

Three nearest neighbors

1  :  Iris-setosa  at index:  40
2  :  Iris-setosa  at index:  7
3  :  Iris-setosa  at index:  37

Five nearest neighbors

1  :  Iris-setosa  at index:  40
2  :  Iris-setosa  at index:  7
3  :  Iris-setosa  at index:  37
4  :  Iris-setosa  at index:  11
5  :  Iris-setosa  at index:  44

For each nearest neighbor, 3NN, and 5NN, the class is Iris-setosa
So we can predict sample 0 will also be Iris-setosa


In [4]:
# Sample 1
# Set chosen to sample 1
chosenSample= normalSample.iloc[1]

# Calculate euclidian distances from training set to sample
distances = normalData.apply(eucDist, axis=1)
distFrame = pd.DataFrame(data={"dist": distances, "idx": distances.index})

# Create list of distances and sort from least to greatest
closest = distFrame.sort_values(by=['dist'])

# To find nearest neighbor we just need to look at the top values of the list
r = closest.iloc[0].idx
print("Nearest neighbor is entry #", int(r))
print("With values: ")
print(data.iloc[int(r)], "\n")

print("Three nearest neighbors\n")
for i in range(3):
    r = closest.iloc[i].idx
    print(i+1, " : ", data.iloc[int(r)]['class'], " at index: ", int(r))
print("\nFive nearest neighbors\n")
for i in range(5):
    r = closest.iloc[i].idx
    print(i+1, " : ", data.iloc[int(r)]['class'], " at index: ", int(r))
    
print("\nFor each nearest neighbor, 3NN, and 5NN, the class is Iris-versicolor")
print("So we can predict sample 0 will also be Iris-versicolor")

Nearest neighbor is entry # 91
With values: 
sepal length                  5
sepal width                 2.3
petal length                3.3
petal width                   1
class           Iris-versicolor
Name: 91, dtype: object 

Three nearest neighbors

1  :  Iris-versicolor  at index:  91
2  :  Iris-versicolor  at index:  58
3  :  Iris-versicolor  at index:  96

Five nearest neighbors

1  :  Iris-versicolor  at index:  91
2  :  Iris-versicolor  at index:  58
3  :  Iris-versicolor  at index:  96
4  :  Iris-versicolor  at index:  79
5  :  Iris-versicolor  at index:  78

For each nearest neighbor, 3NN, and 5NN, the class is Iris-versicolor
So we can predict sample 0 will also be Iris-versicolor


In [5]:
# Sample 2
# Set chosen to sample 2
chosenSample= normalSample.iloc[2]

# Calculate euclidian distances from training set to sample
distances = normalData.apply(eucDist, axis=1)
distFrame = pd.DataFrame(data={"dist": distances, "idx": distances.index})

# Create list of distances and sort from least to greatest
closest = distFrame.sort_values(by=['dist'])

# To find nearest neighbor we just need to look at the top values of the list
r = closest.iloc[0].idx
print("Nearest neighbor is entry #", int(r))
print("With values: ")
print(data.iloc[int(r)], "\n")

print("Three nearest neighbors\n")
for i in range(3):
    r = closest.iloc[i].idx
    print(i+1, " : ", data.iloc[int(r)]['class'], " at index: ", int(r))
print("\nFive nearest neighbors\n")
for i in range(5):
    r = closest.iloc[i].idx
    print(i+1, " : ", data.iloc[int(r)]['class'], " at index: ", int(r))
    
print("\nFor each nearest neighbor, 3NN, and 5NN, the class is Iris-versicolor")
print("So we can predict sample 0 will also be Iris-versicolor")

Nearest neighbor is entry # 57
With values: 
sepal length                5.2
sepal width                 2.7
petal length                3.9
petal width                 1.4
class           Iris-versicolor
Name: 57, dtype: object 

Three nearest neighbors

1  :  Iris-versicolor  at index:  57
2  :  Iris-versicolor  at index:  52
3  :  Iris-versicolor  at index:  87

Five nearest neighbors

1  :  Iris-versicolor  at index:  57
2  :  Iris-versicolor  at index:  52
3  :  Iris-versicolor  at index:  87
4  :  Iris-versicolor  at index:  91
5  :  Iris-versicolor  at index:  58

For each nearest neighbor, 3NN, and 5NN, the class is Iris-versicolor
So we can predict sample 0 will also be Iris-versicolor
