# <font color='blue'>K-Nearest Neighbors Classifier</font>

# 0. Dependências

In [1]:
import joblib
import qgrid
import sweetviz as sv

import numpy as np
import pandas as pd

from sklearn.datasets import load_iris
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# 1. Introdução 

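O KNN (K-Nearest Neighbors) é um dos algoritmos mais utilizados em Machine Learning e também um dos mais simples. Seu método de aprendizagem é baseado em instâncias: ele assume que exemplos de uma mesma classe tendem a estar concentrados em uma mesma região do espaço de entrada e, por isso, classifica um novo ponto de acordo com a classe mais frequente entre os seus K vizinhos mais próximos no conjunto de treino.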

In [2]:
# Load the iris dataset bundled with scikit-learn.
iris = load_iris()
# Display the integer class labels; the output shows three classes encoded as 0, 1 and 2.
iris.target

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
       2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2])

# 2. Dados

In [3]:
# Put the measurements into a DataFrame, one column per feature name.
df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
# Translate each integer target into its class name via an index -> name lookup.
class_names = dict(enumerate(iris.target_names))
df['class'] = pd.Series(iris.target).map(class_names)
df.head(5)

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [4]:
# Setup commands that were needed for the qgrid widget:
#   jupyter nbextension enable --py --sys-prefix qgrid
#   jupyter labextension install @jupyter-widgets/jupyterlab-manager
# It only started working after Node.js was installed.
# The grid widget is created and immediately asked for its DataFrame, so the
# value displayed by this cell is that DataFrame rather than the widget itself.
qgrid.show_grid(df, show_toolbar=True).get_changed_df()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,virginica
146,6.3,2.5,5.0,1.9,virginica
147,6.5,3.0,5.2,2.0,virginica
148,6.2,3.4,5.4,2.3,virginica


In [5]:
# Exploratory data analysis with sweetviz: analyse the DataFrame and
# write the resulting report to 'result.html'.
analise = sv.analyze(df)
analise.show_html('result.html')

                                             |                                             | [  0%]   00:00 ->…

Report result.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


In [6]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numeric columns.
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
count,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333
std,0.828066,0.435866,1.765298,0.762238
min,4.3,2.0,1.0,0.1
25%,5.1,2.8,1.6,0.3
50%,5.8,3.0,4.35,1.3
75%,6.4,3.3,5.1,1.8
max,7.9,4.4,6.9,2.5


In [7]:
# Check which container type holds the raw feature matrix.
type(iris.data)

numpy.ndarray

In [8]:
# Features are used as-is; labels gain a trailing axis so they form a column vector.
x = iris.data
y = iris.target[:, np.newaxis]

print(x.shape, y.shape)

(150, 4) (150, 1)


In [9]:
# Hold out 30% of the samples for testing, stratified on the labels;
# the fixed random_state makes the partition repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

# Report the shapes of the training pair first, then the test pair.
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

(105, 4) (105, 1)
(45, 4) (45, 1)


# 3. Implementação

### Métricas de Distância

In [10]:
# Manhattan (L1) distance
def l1_distance(a, b):
    """Return the L1 distance between ``b`` and every row of ``a``.

    The absolute element-wise differences of ``a - b`` are summed along
    axis 1, so the difference must have at least two dimensions.
    """
    deltas = np.abs(a - b)
    return deltas.sum(axis=1)

# Euclidean (L2) distance
def l2_distance(a, b):
    """Return the L2 distance between ``b`` and every row of ``a``.

    The squared element-wise differences of ``a - b`` are summed along
    axis 1 and the square root of each sum is returned.
    """
    squared = (a - b) ** 2
    return np.sqrt(squared.sum(axis=1))

### Classificador

In [11]:
class kNearestNeighbor(object):
    """Brute-force k-nearest-neighbours classifier with majority voting.

    ``fit`` only stores the training data. ``predict`` computes the distance
    from each query row to every stored training row, keeps the
    ``n_neighbors`` closest ones and returns the most frequent label among
    them. When several labels are equally frequent, the smallest one (in
    sorted order) is chosen.

    Parameters
    ----------
    n_neighbors : int, default 1
        Number of neighbours that vote. It is read at prediction time, so it
        may be changed on the instance between calls. Values larger than the
        training set simply make every training row vote.
    dist_func : callable, default l1_distance
        Called as ``dist_func(x_train, sample)`` with the stored training
        matrix and one query row; must return one distance per training row.
    """

    def __init__(self, n_neighbors=1, dist_func=l1_distance):
        self.n_neighbors = n_neighbors
        self.dist_func = dist_func

    def fit(self, x, y):
        """Store the training samples ``x`` and their labels ``y``.

        Both inputs go through ``np.asarray`` so plain lists are accepted.

        Returns
        -------
        self, so that calls can be chained.

        Raises
        ------
        ValueError
            If ``x`` and ``y`` do not have the same number of rows.
        """
        x = np.asarray(x)
        y = np.asarray(y)
        if x.shape[0] != y.shape[0]:
            raise ValueError(
                'x and y must have the same number of rows, got {} and {}'.format(
                    x.shape[0], y.shape[0]))
        self.x_train = x
        self.y_train = y
        return self

    def predict(self, x):
        """Predict one label per row of ``x``.

        Returns
        -------
        numpy.ndarray of shape ``(len(x), 1)`` with the dtype of the stored
        training labels.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        ValueError
            If ``n_neighbors`` is smaller than 1 or the training set is empty.
        """
        if not hasattr(self, 'x_train') or not hasattr(self, 'y_train'):
            raise AttributeError('kNearestNeighbor.predict() called before fit()')

        k = self.n_neighbors
        # A negative k would silently turn the slice below into "all but the
        # |k| farthest rows", and k == 0 would leave nothing to vote on.
        if k < 1:
            raise ValueError('n_neighbors must be >= 1, got {!r}'.format(k))
        if self.x_train.shape[0] == 0:
            raise ValueError('cannot predict from an empty training set')

        x = np.asarray(x)
        y_pred = np.zeros((x.shape[0], 1), dtype=self.y_train.dtype)

        for i, x_test in enumerate(x):
            distances = self.dist_func(self.x_train, x_test)
            nn_index = np.argsort(distances)
            nn_pred = self.y_train[nn_index[:k]].ravel()
            # np.unique returns the distinct labels sorted, with their counts,
            # so the vote works for any label dtype (not just non-negative
            # integers) and argmax resolves ties towards the smallest label.
            labels, counts = np.unique(nn_pred, return_counts=True)
            y_pred[i] = labels[np.argmax(counts)]

        return y_pred

# 4. Teste

In [12]:
# Baseline run: k = 3 with the constructor's default distance function.
knn = kNearestNeighbor(n_neighbors=3)
knn.fit(x_train, y_train)
y_pred = knn.predict(x_test)

# Accuracy on the held-out split, expressed as a percentage.
acuracia = accuracy_score(y_test, y_pred) * 100
print('Acurácia: {:.2f}%'.format(acuracia))

Acurácia: 93.33%


In [13]:
# Compare both distance functions for k = 1, 3 and 5 on the test split.
knn = kNearestNeighbor()
knn.fit(x_train, y_train)

list_res = []
for distance in (l1_distance, l2_distance):
    # The same fitted instance is reconfigured on every iteration, so after
    # the loops `knn` is left with the last combination tried
    # (l2_distance, k = 5); the cells below keep using that instance.
    knn.dist_func = distance

    for k in range(1, 6, 2):
        knn.n_neighbors = k
        y_pred = knn.predict(x_test)
        acc = accuracy_score(y_test, y_pred)*100
        # The function's own name labels the row, so the label can never
        # drift from the metric actually used.
        list_res.append([k, distance.__name__, acc])

# Kept under its own name so the iris DataFrame `df` built earlier is not
# overwritten (re-running the exploratory cells would otherwise see this table).
resultados = pd.DataFrame(list_res, columns=['k', 'dist. func.', 'acurácia'])
resultados

Unnamed: 0,k,dist. func.,acurácia
0,1,l1_distance,91.111111
1,3,l1_distance,93.333333
2,5,l1_distance,93.333333
3,1,l2_distance,93.333333
4,3,l2_distance,95.555556
5,5,l2_distance,97.777778


In [14]:
# Five hand-written samples to classify, one row per plant with four
# measurements each (presumably in the same feature order as iris.data).
# Rows 2 and 3 are identical.
previcoes = np.array([
    [6.7, 3.1, 4.4, 1.4],
    [4.6, 3.2, 1.4, 0.2],
    [4.6, 3.2, 1.4, 0.2],
    [6.4, 3.1, 5.5, 1.8],
    [6.3, 3.2, 5.6, 1.9],
])
type(previcoes)

numpy.ndarray

In [15]:
# Predict the class of the 5 new plants with k = 3.
# NOTE: `knn` is the instance reconfigured by the comparison loop above, so it
# still carries the last dist_func set there (l2_distance when the notebook
# is run top to bottom).
knn.n_neighbors = 3
result = knn.predict(previcoes)
result

array([[1],
       [0],
       [0],
       [2],
       [2]])

In [16]:
# Predict the class of the 5 new plants with k = 5.
# NOTE: this leaves `knn` configured with k = 5, which is the state the
# following cells see when they persist the model.
knn.n_neighbors = 5
result = knn.predict(previcoes)
result

array([[1],
       [0],
       [0],
       [2],
       [2]])

# 5. Salvando Modelos

In [17]:
# Persist the fitted classifier to disk with joblib (imported in the
# dependencies cell at the top of the notebook).
# NOTE: despite the "KNeighborsClassifier" in the file name, the object stored
# is the custom kNearestNeighbor instance `knn` defined in this notebook.
filename = 'KNeighborsClassifier_Iris_oc.sav'
joblib.dump(knn, filename)

['KNeighborsClassifier_Iris_oc.sav']

In [18]:
# Some hours, or perhaps days, later...
# Load the model back from disk.
# NOTE: joblib.load unpickles the file, so only load files from a trusted
# source. The kNearestNeighbor class and the distance functions must already
# be defined in the session doing the load, since pickle stores them by name.
loaded_model = joblib.load(filename)
y_pred = loaded_model.predict(x_test)
# Accuracy of the reloaded model on the test split, as a percentage.
result = accuracy_score(y_test, y_pred)*100
print(result)

97.77777777777777


# Fim