In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib as plt
from sklearn import preprocessing 
from sklearn.model_selection import train_test_split as tts
import scipy.spatial.distance as dist_measure
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

In [2]:
raw_data= pd.read_csv("iris.csv",names=['sepal_length','sepal_width','petal_length','petal_width','labels'])
raw_data.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,labels
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,1.9,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica
149,5.9,3.0,5.1,1.8,Iris-virginica


In [3]:
#encoding the the class labels
labelencoder= preprocessing.LabelEncoder()
raw_data['labels']= labelencoder.fit_transform(raw_data['labels'])
raw_data.tail()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,labels
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2
149,5.9,3.0,5.1,1.8,2


In [4]:
#splitting the data into training set and test set using stratify sampling
X_train, X_test, y_train, y_test = tts(raw_data, raw_data['labels'], test_size=0.2,stratify=raw_data['labels'])
print (X_train.shape, y_train.shape)

(120, 5) (120,)


In [9]:
# the function which returns the result of the predictions 
#parameters: train set, test set, type of distance used for measurement, k which the number of the nearest neighbors


def k_nearest_neighbor(train_set,test_set,dist_type,k): 
    #calculate the distance between the points for each instance in test to the points in the training set
    Y=dist_measure.cdist(X_train.iloc[:,:4], X_test.iloc[:,:4],dist_type) 
    #converting to dataframe with columns as the test data index and indices as train set index
    #each entry in the dataframe represents the distance between [train_set instance][test_set_instance].
    Y=pd.DataFrame(Y)
    #creating the dictionary with the predicted value.
    answer={}
    #looping through each of the column(test set index) and getting the k nearest indices of the training set
    # and gettting the class label from the train set and using mode to predict the encoded class label.
    for test_point in Y.columns:
        indices=Y[test_point].nsmallest(k).index.to_list()
        predicted_label=[train_set['labels'].iloc[x] for x in indices ]
        answer[test_point]=predicted_label
    return pd.DataFrame(answer).mode().dropna().T.rename(columns={0:"predicted_label_value"}).astype({"predicted_label_value": 'int32'})


def test_prediction_results(predicted_value, actual_value,encoder,k):
    predicted_value['Actual_label_value']=list(actual_value)
    predicted_value['predicted_label']=encoder.inverse_transform(predicted_value['predicted_label_value'])
    print(predicted_value)
    accuracy=accuracy_score(predicted_value['predicted_label_value'],predicted_value['Actual_label_value'] )
    print("Accuracy of this KNN classifer for k=%d is %f"%(k,accuracy))
neigh = KNeighborsClassifier(n_neighbors=20)
neigh.fit(X_train,y_train)
print(neigh.predict(X_test))


predicted_labels=k_nearest_neighbor(X_train,X_test,'euclidean',3)
test_prediction_results(predicted_labels,y_test,labelencoder,3)

predicted_labels=k_nearest_neighbor(X_train,X_test,'euclidean',20)
test_prediction_results(predicted_labels,y_test,labelencoder,20)


[1 2 0 0 2 0 0 1 0 2 1 1 2 1 0 1 1 0 2 2 1 0 1 2 1 2 2 0 2 0]
    predicted_label_value  Actual_label_value  predicted_label
0                       1                   1  Iris-versicolor
1                       2                   2   Iris-virginica
2                       0                   0      Iris-setosa
3                       0                   0      Iris-setosa
4                       2                   2   Iris-virginica
5                       0                   0      Iris-setosa
6                       0                   0      Iris-setosa
7                       1                   1  Iris-versicolor
8                       0                   0      Iris-setosa
9                       2                   2   Iris-virginica
10                      1                   1  Iris-versicolor
11                      1                   1  Iris-versicolor
12                      2                   2   Iris-virginica
13                      2                   1   Iris-vir