# K Vecinos Más Cercanos

# Importamos las librerías necesarias

In [None]:
import heapq ##Heap

from collections import Counter #Contador de ocurrencias
import operator #Para operaciones funcionales


from sklearn.datasets import make_classification #Generar datos aleatoriamente

# Uso de numpy

In [None]:
import numpy as np #Algebra Lineal

# Draw a 2x3 matrix of samples from the standard normal distribution.
mat1 = np.random.randn(2, 3)
print(mat1.shape)

print(mat1)

In [None]:
# randn takes each dimension as a separate positional argument, so the
# shape tuple is unpacked with *.
dimensiones = (5, 3, 2, 7)
mat2 = np.random.randn(*dimensiones)

print(mat2.shape)
print(mat2)

# Uso de matplotlib

#### Podemos graficar la matriz de (2, 3) generada anteriormente

In [None]:
import matplotlib.pyplot as plt #Graficos

In [None]:
# Draw the values of mat1 as unconnected circle markers ('o' format string).
plt.plot(mat1, 'o')
plt.show()

# Definimos una función para simplificar el código de la gráfica que necesitaremos

In [None]:
def plot_dataset(X, Y):
    """Scatter-plot the first two columns of X, colouring each point by Y.

    X: 2-D array indexable as X[:, 0] and X[:, 1]; column 0 goes on the
       horizontal axis and column 1 on the vertical axis.
    Y: per-sample values handed to matplotlib as the point colours.

    With matplotlib's default colormap (viridis, unless it has been
    reconfigured) class 0 is drawn in purple and class 1 in yellow.
    """
    first_feature = X[:, 0]
    second_feature = X[:, 1]
    # Circle markers of size 30 with a black outline.
    marker_style = dict(marker='o', s=30, edgecolors='k')
    plt.scatter(first_feature, second_feature, c=Y, **marker_style)

# Generamos un conjunto de datos aleatorio

In [None]:
# Random classification problem with two features in total: one
# informative, none redundant, and a single cluster per class.
X, Y = make_classification(
    n_features=2, 
    n_redundant=0, 
    n_informative=1, 
    n_clusters_per_class=1)

In [None]:
# Visualise the synthetic samples, coloured by class.
plot_dataset(X, Y)

# Cargamos un dataset que está mejor preparado

In [None]:
# Replace the synthetic data with the arrays stored in the .npy files
# (paths are relative to the current working directory).
X = np.load('X_knn.npy')
Y = np.load('Y_knn.npy')

In [None]:
# Visualise the loaded dataset, coloured by class.
plot_dataset(X, Y)

# Generamos un punto aleatorio que será sobre el que haremos la predicción del KNN

In [None]:
# Draw one random 2-D query point; both coordinates are sampled uniformly
# between -1.6 and 2.1.
q = np.random.uniform(low=-1.6, 
                      high=2.1, 
                      size=(2,))

print(q.shape)

# Print the whole point: q[0] alone is only its first coordinate.
print('Punto de consulta:', q)

In [None]:
# Overlay the query point as a red circle ('ro') on top of the dataset.
plot_dataset(X, Y)
plt.plot(q[0], q[1], 'ro')
plt.show()

### Implementamos KNN y la métrica a usar

$$\hat{d} = \sqrt{\sum_{i=1}^{\#dimensiones} (p_i - q_i)^2} $$

In [None]:
def metrica(p1: list, p2: list):
    """Return the Euclidean distance between the points p1 and p2.

    p1, p2: sequences of coordinates (lists, tuples or 1-D arrays).
    Coordinates are paired with zip, so if the lengths differ the extra
    coordinates of the longer point are ignored.
    """
    suma_cuadrados = sum((a - b) ** 2 for a, b in zip(p1, p2))
    # Take the square root so the result is the distance itself and not
    # its square.
    return suma_cuadrados ** 0.5

![alt-text](img/pseudo.png)

In [None]:
def knn(X, Y, query, k):
    """
    Classify `query` by majority vote among its k nearest samples.

    X: matrix of shape (n_samples, n_features).
    Y: vector with the class of each sample.
    query: query point of shape (n_features, ).
    k: number of neighbours considered in the vote; must be >= 1. If k
       is larger than the number of samples, every sample votes.

    Returns the most frequent class among the k nearest neighbours. On a
    tie, the tied class that appears first when walking the neighbours
    from nearest to farthest wins.
    Raises ValueError if k < 1 or if there are no samples to vote.
    """
    if k < 1:
        # A negative k would otherwise act as a slice bound, dropping the
        # farthest samples and voting with the wrong number of neighbours.
        raise ValueError("k must be at least 1, got {}".format(k))

    distancias = ((metrica(x, query), y) for x, y in zip(X, Y))
    # nsmallest yields the same items, in the same order, as
    # sorted(...)[:k], without sorting every sample.
    vecinos = heapq.nsmallest(k, distancias, key=lambda e: e[0])
    if not vecinos:
        raise ValueError("cannot classify without any samples")

    # Counter keeps first-seen order, and most_common breaks ties in that
    # order, so the class seen nearest to the query wins a tie.
    votos = Counter(clase for _, clase in vecinos)
    return votos.most_common(1)[0][0]

# Probamos nuestra implementación

In [None]:
# 0: lila, 1: amarillo
# Class colours in the plot — 0: lilac, 1: yellow
# New random query point, drawn in red over the dataset.
q = np.random.uniform(low=-1.6, high=2.1, 
                      size=(2))

plot_dataset(X, Y)
plt.plot(q[0], q[1], 'ro')
plt.show()

In [None]:
# Append the query point and its predicted class (k = 10) to the data so
# the plot shows it coloured like the class assigned by knn.
X_new = np.vstack((X, q))
pred = knn(X, Y, q, 10)
Y_new = np.hstack((Y, pred))

plot_dataset(X_new, Y_new)

# Usando Sklearn

In [None]:
from sklearn.neighbors import KNeighborsClassifier # k-nearest-neighbours classifier

In [None]:
# Same k as the manual implementation; 'brute' asks scikit-learn for a
# brute-force neighbour search.
model = KNeighborsClassifier(
    n_neighbors=10, 
    algorithm='brute')
model.fit(X, Y)

In [None]:
# Reshape the single query point into one row, (1, n_features), before
# handing it to predict.
reshaped_q = np.reshape(q, (1, -1))
print(reshaped_q.shape)

pred = model.predict(reshaped_q)
print(pred)

# Comparamos nuestras predicciones

In [None]:
# Fresh random query point used to compare both implementations.
q = np.random.uniform(low=-1.6, high=2.1, size=(2))

In [None]:
# Prediction of the hand-written knn with k = 10.
pred_1 = knn(X, Y, q, 10)

print('manual:', pred_1)

In [None]:
# Prediction of the scikit-learn model for the same point; [0] takes the
# first (and only) entry of the returned predictions.
reshaped_q = np.reshape(q, (1, -1))
pred_2 = model.predict(reshaped_q)[0]

print('sklearn: ', pred_2)

# Optimizaciones del KNN

In [None]:
# Same classifier configured with the 'ball_tree' search algorithm; the
# instance is only created here, not fitted.
KNeighborsClassifier(n_neighbors=10, algorithm='ball_tree')

In [None]:
# Same classifier configured with the 'kd_tree' search algorithm; the
# instance is only created here, not fitted.
KNeighborsClassifier(n_neighbors=10, algorithm='kd_tree')

# Resolviendo el dataset del Iris

![alt-text](img/Large53.jpg)
1. sepal length in cm 
2. sepal width in cm 
3. petal length in cm 
4. petal width in cm 
5. class: Iris Setosa, Iris Versicolour, Iris Virginica

In [None]:
from sklearn.datasets import load_iris # Cargamos el dataset

In [None]:
import pandas as pd

In [None]:
# Column names are supplied explicitly: four measurements plus the class
# label (presumably iris.data has no header row — confirm against the file).
data = pd.read_csv('iris.data', 
                   names=['sepal length in cm',
                          'sepal width in cm',
                          'petal length in cm', 
                          'petal width in cm',
                          'class'])

print(data.shape)

In [None]:
# Preview the first rows of the table.
data.head()

# Preparando los datos en sus respectivas matrices NUMÉRICAS

In [None]:
# Feature matrix: every column except 'class', as a NumPy array.
X = data.drop(['class'], axis=1).values

print(X.shape)
print('-'*10)
print(X[:5, :])

In [None]:
# One-hot encode the class names, then take the position of the hot column
# in each row to obtain one integer label per sample.
Y = pd.get_dummies(data['class']).values
Y = np.argmax(Y, axis=1)

print(Y.shape)
print('-'*10)
print(Y[:5])

# Show the one-hot table to see which column index belongs to which class.
pd.get_dummies(data['class']).head()

# Test Train Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 20% of the samples for testing; random_state=1 fixes the
# shuffle so the split is reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=1)

print(X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Aplicamos nuestro modelo

In [None]:
def knn_batch(X, Y, querys, k):
    """Classify every point in `querys` with knn.

    X, Y: samples and their classes, forwarded unchanged to knn.
    querys: iterable of query points.
    k: number of neighbours used for each prediction.

    Returns a list with one predicted class per query, in input order.
    """
    predictions = []
    for punto in querys:
        predictions.append(knn(X, Y, punto, k))
    return predictions

In [None]:
# Classify the held-out samples with the hand-written knn (k = 10).
pred_manual = knn_batch(X_train, Y_train, X_test, 10)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but the confusion matrix is transposed if they are swapped:
# rows must be the true classes and columns the predicted ones.
print(accuracy_score(Y_test, pred_manual))
print(confusion_matrix(Y_test, pred_manual))

# Aplicamos Sklearn

In [None]:
# Fit scikit-learn's classifier on the training split with the same k as
# the manual implementation, using the 'kd_tree' search algorithm.
model = KNeighborsClassifier(n_neighbors=10, algorithm='kd_tree')
model.fit(X_train, Y_train)

pred_sk = model.predict(X_test)

# scikit-learn metrics take (y_true, y_pred) in that order. Accuracy is
# symmetric, but the confusion matrix is transposed if they are swapped:
# rows must be the true classes and columns the predicted ones.
print(accuracy_score(Y_test, pred_sk))
print(confusion_matrix(Y_test, pred_sk))