In [1]:
import numpy as np
import pandas as pd

In [2]:
df = pd.read_csv('iris.data', header=None)

In [3]:
df.head()

Unnamed: 0,0,1,2,3,4
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa


In [4]:
df.columns = ['sepal length in cm', 'sepal width in cm', 'petal length in cm', 'petal width in cm', 'class']

In [5]:
df['class'].unique()

array(['Iris-setosa', 'Iris-versicolor', 'Iris-virginica'], dtype=object)

In [6]:
class_label_mapping = {'Iris-setosa' : 0, 'Iris-versicolor' : 1, 'Iris-virginica' : 2}

In [7]:
df['class'] = df['class'].map(class_label_mapping)

In [8]:
y = df['class'].values

In [9]:
df.drop(axis=1, inplace=True, labels='class')

In [10]:
X = df.values

In [11]:
print(X.shape)
print(y.shape)

(150, 4)
(150,)


In [12]:
y[0:5]

array([0, 0, 0, 0, 0], dtype=int64)

In [13]:
X[0:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [14]:
from sklearn.model_selection import train_test_split

In [15]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.9, random_state=42, shuffle=True, stratify=y)

# Standardization

In [16]:
from sklearn.preprocessing import StandardScaler

In [17]:
std = StandardScaler()

In [18]:
X_train_std = std.fit_transform(X_train)
X_test_std = std.transform(X_test)

In [19]:
X_train_std[0:5]

array([[ 1.19452367,  0.30570892,  0.62281135,  0.49431153],
       [-0.10859306, -0.15285446,  0.24038333,  0.08238526],
       [-1.62889591, -0.15285446, -1.45322649, -1.42801109],
       [-0.10859306, -0.61141784,  0.24038333,  0.21969401],
       [-0.76015142,  2.13996244, -1.28932877, -1.15339358]])

In [20]:
X_test_std[0:5]

array([[-1.30311673,  0.30570892, -1.23469619, -1.29070234],
       [-0.76015142,  2.13996244, -1.28932877, -1.29070234],
       [-0.76015142,  3.51565258, -1.18006362, -1.29070234],
       [-1.30311673,  0.7642723 , -1.28932877, -1.29070234],
       [-0.10859306, -1.06998122,  0.40428105,  0.21969401]])

# Euclidean Distance

In [21]:
def euclidean_distance(X, line):
    X_  = (X - line) ** 2
    return np.sqrt(np.sum(X_, axis=1))

In [22]:
df['class'] = y

In [23]:
df.head()

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [24]:
X[0:1,]

array([[5.1, 3.5, 1.4, 0.2]])

In [25]:
X_line = X[0:1,]

In [26]:
df['euclidean_distance'] = euclidean_distance(X, X_line)

In [27]:
df.sort_values(by='euclidean_distance')[0:25]

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class,euclidean_distance
0,5.1,3.5,1.4,0.2,0,0.0
17,5.1,3.5,1.4,0.3,0,0.1
4,5.0,3.6,1.4,0.2,0,0.141421
39,5.1,3.4,1.5,0.2,0,0.141421
27,5.2,3.5,1.5,0.2,0,0.141421
28,5.2,3.4,1.4,0.2,0,0.141421
40,5.0,3.5,1.3,0.3,0,0.173205
7,5.0,3.4,1.5,0.2,0,0.173205
49,5.0,3.3,1.4,0.2,0,0.223607
21,5.1,3.7,1.5,0.4,0,0.3


# Manhattan Distance

In [28]:
def manhattan_distance(X, line):
    X_ = abs((X - line))
    return np.sum(X_, axis=1)

In [29]:
df['manhattan_distance'] = manhattan_distance(X, X_line)

In [30]:
df.sort_values(by='manhattan_distance')[0:25]

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class,euclidean_distance,manhattan_distance
0,5.1,3.5,1.4,0.2,0,0.0,0.0
17,5.1,3.5,1.4,0.3,0,0.1,0.1
4,5.0,3.6,1.4,0.2,0,0.141421,0.2
39,5.1,3.4,1.5,0.2,0,0.141421,0.2
28,5.2,3.4,1.4,0.2,0,0.141421,0.2
27,5.2,3.5,1.5,0.2,0,0.141421,0.2
40,5.0,3.5,1.3,0.3,0,0.173205,0.3
7,5.0,3.4,1.5,0.2,0,0.173205,0.3
49,5.0,3.3,1.4,0.2,0,0.223607,0.3
19,5.1,3.8,1.5,0.3,0,0.331662,0.5


# Chebyshev Distance

In [31]:
def chebyshev_distance(X, line):
    X_ = abs((X - line))
    return np.max(X_, axis=1)

In [32]:
df['chebyshev_distance'] = chebyshev_distance(X, X_line)

In [33]:
df.sort_values(by='chebyshev_distance')[0:25]

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class,euclidean_distance,manhattan_distance,chebyshev_distance
0,5.1,3.5,1.4,0.2,0,0.0,0.0,0.0
17,5.1,3.5,1.4,0.3,0,0.1,0.1,0.1
40,5.0,3.5,1.3,0.3,0,0.173205,0.3,0.1
39,5.1,3.4,1.5,0.2,0,0.141421,0.2,0.1
4,5.0,3.6,1.4,0.2,0,0.141421,0.2,0.1
7,5.0,3.4,1.5,0.2,0,0.173205,0.3,0.1
27,5.2,3.5,1.5,0.2,0,0.141421,0.2,0.1
28,5.2,3.4,1.4,0.2,0,0.141421,0.2,0.1
26,5.0,3.4,1.6,0.4,0,0.316228,0.6,0.2
48,5.3,3.7,1.5,0.2,0,0.3,0.5,0.2


# Minkowski Distance

In [34]:
def minkowski_distance(X, line, p):
    X_ = (abs((X - line)) ** p) ** (1 / p)
    return np.sum(X_, axis=1)

In [35]:
p = 3

In [36]:
df['minkowski_distance'] = minkowski_distance(X, X_line, p)

In [37]:
df.sort_values(by='minkowski_distance')[0:25]

Unnamed: 0,sepal length in cm,sepal width in cm,petal length in cm,petal width in cm,class,euclidean_distance,manhattan_distance,chebyshev_distance,minkowski_distance
0,5.1,3.5,1.4,0.2,0,0.0,0.0,0.0,0.0
17,5.1,3.5,1.4,0.3,0,0.1,0.1,0.1,0.1
4,5.0,3.6,1.4,0.2,0,0.141421,0.2,0.1,0.2
39,5.1,3.4,1.5,0.2,0,0.141421,0.2,0.1,0.2
28,5.2,3.4,1.4,0.2,0,0.141421,0.2,0.1,0.2
27,5.2,3.5,1.5,0.2,0,0.141421,0.2,0.1,0.2
40,5.0,3.5,1.3,0.3,0,0.173205,0.3,0.1,0.3
7,5.0,3.4,1.5,0.2,0,0.173205,0.3,0.1,0.3
49,5.0,3.3,1.4,0.2,0,0.223607,0.3,0.2,0.3
19,5.1,3.8,1.5,0.3,0,0.331662,0.5,0.3,0.5


# KNN Brute Force

In [38]:
def idx_knn(X, line, k):
    ed = euclidean_distance(X, line)
    idx_sort = np.argsort(ed)
    return idx_sort[0:k]

# KNN Classifier

In [39]:
def knn_classifier(X, y, line, k):
    idx_knn_ = idx_knn(X, line, k=k)
    count = np.bincount(y[idx_knn_])
    return np.argmax(count)

In [40]:
X_train_std[0:1,]

array([[1.19452367, 0.30570892, 0.62281135, 0.49431153]])

In [41]:
y_train[:,]

array([1, 1, 0, 1, 0, 2, 0, 2, 1, 0, 1, 0, 2, 2, 2], dtype=int64)

In [42]:
line = X_train_std[0:1,]

In [43]:
k = 1
knn_classifier(X_train_std, y_train, line, k)

1

In [44]:
k = 3
knn_classifier(X_train_std, y_train, line, k)

2

In [45]:
k = 5
knn_classifier(X_train_std, y_train, line, k)

2

In [46]:
from sklearn.neighbors import KNeighborsClassifier

In [47]:
# k = 1
clf = KNeighborsClassifier(n_neighbors=1)
clf.fit(X_train_std, y_train)
clf.predict(line)

array([1], dtype=int64)

In [48]:
# k = 3
clf = KNeighborsClassifier(n_neighbors=3)
clf.fit(X_train_std, y_train)
clf.predict(line)

array([2], dtype=int64)

In [49]:
# k = 5
clf = KNeighborsClassifier(n_neighbors=5)
clf.fit(X_train_std, y_train)
clf.predict(line)

array([2], dtype=int64)

# KNN Regressor

In [50]:
def knn_regressor(X, y, line, k):
    idx_knn_ = idx_knn(X, line, k=k)
    return np.mean(y[idx_knn_])

In [51]:
k = 1
knn_regressor(X_train_std, y_train, line, k)

1.0

In [52]:
k = 3
knn_regressor(X_train_std, y_train, line, k)

1.6666666666666667

In [53]:
k = 5
knn_regressor(X_train_std, y_train, line, k)

1.6

In [54]:
from sklearn.neighbors import KNeighborsRegressor

In [55]:
# k = 1
knr = KNeighborsRegressor(n_neighbors=1)
knr.fit(X_train_std, y_train)
knr.predict(line)

array([1.])

In [56]:
# k = 3
knr = KNeighborsRegressor(n_neighbors=3)
knr.fit(X_train_std, y_train)
knr.predict(line)

array([1.66666667])

In [57]:
# k = 5
knr = KNeighborsRegressor(n_neighbors=5)
knr.fit(X_train_std, y_train)
knr.predict(line)

array([1.6])