# Load data

In [1]:
import pandas as pd 

In [2]:
path = './iris.csv'

In [3]:
iris = pd.read_csv(path)

### Check data

In [4]:
iris.head()

Unnamed: 0,5.1,3.5,1.4,0.2,Iris-setosa
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa
2,4.6,3.1,1.5,0.2,Iris-setosa
3,5.0,3.6,1.4,0.2,Iris-setosa
4,5.4,3.9,1.7,0.4,Iris-setosa


# Data preparation

### Add header row for the dataset

In [5]:
column_names=["Sepal length","Sepal width",'Petal length', 'Petal width', 'Class']

iris = pd.read_csv(path, names=column_names)
iris

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


### Change class label to 0, 1 and 2

In [6]:
iris['Class'].replace(['Iris-setosa','Iris-versicolor','Iris-virginica'], [0, 1, 2], inplace=True)
iris 

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  iris['Class'].replace(['Iris-setosa','Iris-versicolor','Iris-virginica'], [0, 1, 2], inplace=True)
  iris['Class'].replace(['Iris-setosa','Iris-versicolor','Iris-virginica'], [0, 1, 2], inplace=True)


Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


### Change all values from string to float

In [7]:
iris.loc[:, 'Sepal length':].astype(float)

Unnamed: 0,Sepal length,Sepal width,Petal length,Petal width,Class
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2.0
146,6.3,2.5,5.0,1.9,2.0
147,6.5,3.0,5.2,2.0,2.0
148,6.2,3.4,5.4,2.3,2.0


### Convert DataFrame to List

In [8]:
iris_list = iris.values.tolist()
iris_list

[[5.1, 3.5, 1.4, 0.2, 0.0],
 [4.9, 3.0, 1.4, 0.2, 0.0],
 [4.7, 3.2, 1.3, 0.2, 0.0],
 [4.6, 3.1, 1.5, 0.2, 0.0],
 [5.0, 3.6, 1.4, 0.2, 0.0],
 [5.4, 3.9, 1.7, 0.4, 0.0],
 [4.6, 3.4, 1.4, 0.3, 0.0],
 [5.0, 3.4, 1.5, 0.2, 0.0],
 [4.4, 2.9, 1.4, 0.2, 0.0],
 [4.9, 3.1, 1.5, 0.1, 0.0],
 [5.4, 3.7, 1.5, 0.2, 0.0],
 [4.8, 3.4, 1.6, 0.2, 0.0],
 [4.8, 3.0, 1.4, 0.1, 0.0],
 [4.3, 3.0, 1.1, 0.1, 0.0],
 [5.8, 4.0, 1.2, 0.2, 0.0],
 [5.7, 4.4, 1.5, 0.4, 0.0],
 [5.4, 3.9, 1.3, 0.4, 0.0],
 [5.1, 3.5, 1.4, 0.3, 0.0],
 [5.7, 3.8, 1.7, 0.3, 0.0],
 [5.1, 3.8, 1.5, 0.3, 0.0],
 [5.4, 3.4, 1.7, 0.2, 0.0],
 [5.1, 3.7, 1.5, 0.4, 0.0],
 [4.6, 3.6, 1.0, 0.2, 0.0],
 [5.1, 3.3, 1.7, 0.5, 0.0],
 [4.8, 3.4, 1.9, 0.2, 0.0],
 [5.0, 3.0, 1.6, 0.2, 0.0],
 [5.0, 3.4, 1.6, 0.4, 0.0],
 [5.2, 3.5, 1.5, 0.2, 0.0],
 [5.2, 3.4, 1.4, 0.2, 0.0],
 [4.7, 3.2, 1.6, 0.2, 0.0],
 [4.8, 3.1, 1.6, 0.2, 0.0],
 [5.4, 3.4, 1.5, 0.4, 0.0],
 [5.2, 4.1, 1.5, 0.1, 0.0],
 [5.5, 4.2, 1.4, 0.2, 0.0],
 [4.9, 3.1, 1.5, 0.1, 0.0],
 [5.0, 3.2, 1.2, 0.2

# Calculate Distance Formula

In [9]:
import math

### Euclidean Distance

In [10]:
def euclidean_distance(object1, object2): 
    distance = 0 
    for point in range(len(object1) - 1): 
        distance += (object1[point] - object2[point])**2 
    return math.sqrt(distance)

### Mahattan Distance

In [11]:
def mahattan_distance(object1, object2): 
    distance = 0 
    for point in range(len(object1) - 1): 
        distance += math.abs(object1[point] - object2[point])
    return math.sqrt(distance)

### Test

In [12]:
dataset = [[2.7810836,2.550537003,0],
    [1.465489372,2.362125076,0],
    [3.396561688,4.400293529,0],
    [1.38807019,1.850220317,0],
    [3.06407232,3.005305973,0],
    [7.627531214,2.759262235,1],
    [5.332441248,2.088626775,1],
    [6.922596716,1.77106367,1],
    [8.675418651,-0.242068655,1],
    [7.673756466,3.508563011,1]]

for row in dataset:
    distance = euclidean_distance(dataset[0], row)
    print(distance)

0.0
1.3290173915275787
1.9494646655653247
1.5591439385540549
0.5356280721938492
4.850940186986411
2.592833759950511
4.214227042632867
6.522409988228337
4.985585382449795


# Get Nearest Neighbors

In [13]:
def get_neighbors(train, test_row, num_neighbors): 
    # create a list to store distances
    distances = list()
    # run every element in training set
    for train_row in train: 
        # calculate distance for every points in training set with test one
        dist = euclidean_distance(test_row, train_row)
        # append distance and its name to the distances list in form of tuple
        distances.append((train_row, dist))
        # sort based on the distance
        distances.sort(key = lambda tup : tup[1])
        # create the list of nearest neighbor
        neighbors = list()
        # append num_neighbors neighbors with its name
    for i in range(num_neighbors): 
        neighbors.append(distances[i][0])
    return neighbors

### Test

In [14]:
dataset = [[2.7810836,2.550537003,0],
    [1.465489372,2.362125076,0],
    [3.396561688,4.400293529,0],
    [1.38807019,1.850220317,0],
    [3.06407232,3.005305973,0],
    [7.627531214,2.759262235,1],
    [5.332441248,2.088626775,1],
    [6.922596716,1.77106367,1],
    [8.675418651,-0.242068655,1],
    [7.673756466,3.508563011,1]]

neighbors = get_neighbors(dataset, dataset[0], 3)
for neighbor in neighbors:
    print(neighbor)

[2.7810836, 2.550537003, 0]
[3.06407232, 3.005305973, 0]
[1.465489372, 2.362125076, 0]


# Prediction

In [15]:
def predict_class(train, test_row, num_neighbors): 
    # create a list of neighbors
    neighbors = get_neighbors(train, test_row, num_neighbors)
    # create a list of output value
    output_value = [row[-1] for row in neighbors]
    # make prediction based on output values
    prediction = max(set(output_value), key = output_value.count)
    return prediction

### Test

In [16]:
dataset = [[2.7810836,2.550537003,0],
    [1.465489372,2.362125076,0],
    [3.396561688,4.400293529,0],
    [1.38807019,1.850220317,0],
    [3.06407232,3.005305973,0],
    [7.627531214,2.759262235,1],
    [5.332441248,2.088626775,1],
    [6.922596716,1.77106367,1],
    [8.675418651,-0.242068655,1],
    [7.673756466,3.508563011,1]]

prediction = predict_class(dataset, dataset[0], 3)
print('Expected %d, Got %d.' % (dataset[0][-1], prediction))

Expected 0, Got 0.


# Implementation

### Create values

In [17]:
test_value = iris_list[2]
num_neighbors = 4
dataset = iris_list

### Run prediction

In [18]:
prediction = predict_class(dataset, test_value, num_neighbors)
print('Expected %d, Got %d.' % (dataset[0][-1], prediction))

Expected 0, Got 0.
