In [4]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

## Importing dataset

In [6]:
data = pd.read_csv("iris.csv", index_col=0)
data.head()
scaler = MinMaxScaler()
df = data.drop('Species', axis=1) 
data_rows = scaler.fit_transform(df.to_numpy())
data_rows = pd.DataFrame(data_rows, columns=[
  'sepal_length', 'sepal_width', 'petal_length', 'petal_width'], index=df.index)
data_rows = pd.concat((data_rows, data["Species"]), axis=1)
data_rows.head()

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width,Species
Id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,0.222222,0.625,0.067797,0.041667,Iris-setosa
2,0.166667,0.416667,0.067797,0.041667,Iris-setosa
3,0.111111,0.5,0.050847,0.041667,Iris-setosa
4,0.083333,0.458333,0.084746,0.041667,Iris-setosa
5,0.194444,0.666667,0.067797,0.041667,Iris-setosa


## Prepare the kNN
$$
Euclidean Distance = sqrt(sum\ i\ to\ N\ (x^1_i – x^2_i)^2)
$$

In [8]:
from math import sqrt

class kNNClassifier:
    def __init__(self, data_rows, k):
        self.data_rows = data_rows
        self.k = k
        
    def euclidean_distance(self, row_1, row_2, isLastOutput=True):
        distance = 0
        num_features = len(row_2) - 1 if isLastOutput else len(row_2)  # Ignore the last column if it is the output
        if isLastOutput is True:
            pass
        for i in range(num_features):
            distance += (row_1.iloc[i] - row_2.iloc[i]) ** 2
        return sqrt(distance)

    def get_neighbors(self, query_row):
        distances = []
        for i, row in self.data_rows.iterrows():
            distance = self.euclidean_distance(row, query_row)
            distances.append((row, distance))
        distances.sort(key=lambda x: x[1])
        neighbors = []
        for i in range(self.k):
            neighbors.append(distances[i][0])
        return neighbors
    
    def predict(self, query_row):
        neighbors = self.get_neighbors(query_row)
        output_values = [row.iloc[-1] for row in neighbors]
        prediction = max(set(output_values), key=output_values.count)
        return prediction
    def validate(self, validation_rows):
        pass


In [10]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Suppose X is your feature set and y is your target variable
X = data_rows.drop('Species', axis=1)  # Features
y = data_rows['Species']  # Target variable

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

training_data = pd.concat((X_train, y_train), axis=1)

classifier = kNNClassifier(training_data, 7)
predictions = []
for i, row in X_test.iterrows():
    predictions.append(classifier.predict(row))

accuracy = accuracy_score(y_test, predictions)
print(accuracy)


0.9666666666666667
