In [1]:
import pandas as pd

In [2]:
from sklearn.preprocessing import MinMaxScaler

In [3]:
from sklearn.model_selection import train_test_split

In [4]:
from sklearn.model_selection import GridSearchCV

In [5]:
from sklearn.metrics import classification_report

In [6]:
from sklearn.neighbors import KNeighborsClassifier

In [7]:
df = pd.read_csv("data/iris.csv")
df.head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [8]:
features = df.columns[:-1].tolist()
x = df[features]
y = df["Species"]

In [30]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
x_test.set_index([x for x in range(0,len(x_test))])
x_train.reindex([x for x in range(0,len(x_train))])
y_test.reindex([x for x in range(0,len(y_test))])
y_train.reindex([x for x in range(0,len(y_train))])
print(x_test)

KeyError: 'None of [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44] are in the columns'

In [10]:
mms = MinMaxScaler().fit(x_train)
x_train_n = pd.DataFrame(mms.transform(x_train), columns=features)
x_test_n = pd.DataFrame(mms.transform(x_test), columns=features)

In [11]:
x_train_n.head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width
0,0.176471,0.166667,0.403509,0.375
1,0.705882,0.458333,0.649123,0.583333
2,0.617647,0.5,0.754386,0.916667
3,0.705882,0.458333,0.807018,0.958333
4,0.411765,0.75,0.122807,0.083333


In [12]:
x_test_n.head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width
0,0.529412,0.333333,0.649123,0.458333
1,0.205882,0.583333,0.105263,0.125
2,0.235294,0.75,0.087719,0.083333
3,0.235294,0.583333,0.087719,0.041667
4,0.176471,0.458333,0.087719,0.041667


In [13]:
parameters = [{'n_neighbors': [3, 5, 6, 7, 9],
              'p': [1, 2],
              'weights': ['uniform', 'distance']}]
clf_knn = GridSearchCV(KNeighborsClassifier(), param_grid=parameters, cv=5)

In [14]:
clf_knn.fit(x_train_n, y_train)
clf_knn.best_params_

{'n_neighbors': 6, 'p': 2, 'weights': 'distance'}

In [15]:
print(classification_report(y_train, clf_knn.predict(x_train_n)))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        35
  versicolor       1.00      1.00      1.00        36
   virginica       1.00      1.00      1.00        34

    accuracy                           1.00       105
   macro avg       1.00      1.00      1.00       105
weighted avg       1.00      1.00      1.00       105



In [16]:
print(classification_report(y_test, clf_knn.predict(x_test_n)))

              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        15
  versicolor       0.88      1.00      0.93        14
   virginica       1.00      0.88      0.93        16

    accuracy                           0.96        45
   macro avg       0.96      0.96      0.96        45
weighted avg       0.96      0.96      0.96        45



In [17]:
distances, indices = clf_knn.best_estimator_.kneighbors(x_test_n)
print(distances)
print(indices)

[[0.0931695  0.09623859 0.1191498  0.12964602 0.12968298 0.13948947]
 [0.06815469 0.07462186 0.07900826 0.08516003 0.0931695  0.09955754]
 [0.04520949 0.05892557 0.08326121 0.11175347 0.12622514 0.13287921]
 [0.02941176 0.05393468 0.06148178 0.07462186 0.08996252 0.09495586]
 [0.04166667 0.04520949 0.05393468 0.06815469 0.06815469 0.07328879]
 [0.05393468 0.10285708 0.11086777 0.11086777 0.1127389  0.12643215]
 [0.02941176 0.0950823  0.09914316 0.11175347 0.12136685 0.12296356]
 [0.10610226 0.10888995 0.12983123 0.13635343 0.14083834 0.14353366]
 [0.04578428 0.05263158 0.05393468 0.08508946 0.08837135 0.10369535]
 [0.10700766 0.11157307 0.13766759 0.14253545 0.157053   0.16228814]
 [0.15427441 0.21789022 0.24941142 0.28973944 0.31087424 0.39046104]
 [0.06138399 0.08017148 0.08333333 0.08823529 0.10381115 0.11157307]
 [0.05882353 0.07893219 0.08508946 0.11086777 0.1184389  0.12019222]
 [0.07418966 0.0985623  0.11320973 0.11353851 0.15682333 0.16861029]
 [0.12296356 0.13130226 0.16970485

In [24]:
y_test_pred = clf_knn.predict(x_test_n)
for i, row in x_test.iterrows():
    print(i)
    print("Test instanca -----------------")
    print(row)
    print("Prava klasa -------------------")
    print(y_test.iloc[i])
    print("Dodeljena klasa ---------------")
    print(y_test_pred[i])
    print("Susedi ------------------------")
    for j, d in zip(indices[i], distances[i]):
        print(j,d)
        print(y_train[j], d)
    print("-------------------------------")
    
    print()

127
Test instanca -----------------
Sepal_Length    6.1
Sepal_Width     3.0
Petal_Length    4.9
Petal_Width     1.8
Name: 127, dtype: float64
Prava klasa -------------------


IndexError: single positional indexer is out-of-bounds