# KNN klasifikacija

## Biblioteke

In [6]:
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

from sklearn.preprocessing import StandardScaler

from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import confusion_matrix

*Standard scales* - Klasa koja transofrmise numericke atriute tako da budu iz normalne raspodele

## Podaci

In [8]:
df = pd.read_csv("data/iris.csv")

In [9]:
df.head()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width,Species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [10]:
df.describe()

Unnamed: 0,Sepal_Length,Sepal_Width,Petal_Length,Petal_Width
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [12]:
features = df.columns[:-1]
features

Index(['Sepal_Length', 'Sepal_Width', 'Petal_Length', 'Petal_Width'], dtype='object')

In [14]:
X = df[features]
y = df["Species"]
print(X.shape)
print(y.shape)

(150, 4)
(150,)


## Preprocesiranje

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=89, stratify=y)

In [17]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(105, 4)
(45, 4)
(105,)
(45,)


### Standardizacija

In [18]:
scaler = StandardScaler()
scaler.fit(X_train)


StandardScaler()

In [19]:
print(scaler.mean_)
print(scaler.var_)
print(scaler.scale_) # Standardna decijacija

[5.83619048 3.02952381 3.76190476 1.19238095]
[0.71583311 0.18855692 3.20692971 0.58146576]
[0.84606921 0.43423141 1.79079025 0.76253902]


In [20]:
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Treniranje modela

In [26]:
model = KNeighborsClassifier(n_neighbors=4)
model.fit(X_train, y_train)

KNeighborsClassifier(n_neighbors=4)

In [27]:
classes = model.classes_

## Performanse modela nad skupom za treniranje

In [28]:
y_train_pred = model.predict(X_train)
pd.DataFrame(confusion_matrix(y_train, y_train_pred), columns=classes, index=classes)

Unnamed: 0,setosa,versicolor,virginica
setosa,35,0,0
versicolor,0,33,2
virginica,0,3,32


## Performanse modela nad skupom za testiranje

In [29]:
y_test_pred = model.predict(X_test)
pd.DataFrame(confusion_matrix(y_test, y_test_pred), columns=classes, index=classes)

Unnamed: 0,setosa,versicolor,virginica
setosa,15,0,0
versicolor,0,15,0
virginica,0,2,13


## Automatsko testiranje raznih parametara

In [57]:
param_grid = {'n_neighbors': range(2,7),
              'weight': ['uniform', 'distance']}
GridSearchCV(KNeighborsClassifier(), param_grid)
# Pogledaj prethodni cas

GridSearchCV(estimator=KNeighborsClassifier(),
             param_grid={'n_neighbors': range(2, 7),
                         'weight': ['uniform', 'distance']})

Nedovrseno...