# k-Nearest Neighbors 

# Similarity and Distance Measures

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import numpy as np
import pandas as pd

# Toy corpus: three short sentences that share part of their vocabulary.
sentence1, sentence2, sentence3 = (
    "This is the first sentence in English.",
    "Another sentence also in English.",
    "This is not a sentence.",
)
documents = [sentence1, sentence2, sentence3]

# Bag-of-words vectorizer: counts how often each vocabulary word appears
# in each document.
cV = CountVectorizer()

# Learn the vocabulary and count the words in one pass, then expand the
# result to a dense matrix (one row per document) so it can be displayed.
term_counts = cV.fit_transform(documents)
doc_term_matrix = term_counts.todense()

In [2]:
# Display the dense document-term matrix: one row per document,
# one column per vocabulary word, each entry a word count.
doc_term_matrix

matrix([[0, 0, 1, 1, 1, 1, 0, 1, 1, 1],
        [1, 1, 1, 0, 1, 0, 0, 1, 0, 0],
        [0, 0, 0, 0, 0, 1, 1, 1, 0, 1]], dtype=int64)

In [3]:
from sklearn.metrics.pairwise import cosine_similarity

# Convert the document-term matrix to a plain numpy ndarray.
# np.asarray leaves an input that is already an ndarray unchanged, so this
# cell can safely be re-run.
doc_term_matrix = np.asarray(doc_term_matrix)

# Pairwise cosine similarity between all rows, i.e. between all documents;
# entry [i, j] compares document i with document j.
cosine_similarity(doc_term_matrix)

array([[1.        , 0.50709255, 0.56694671],
       [0.50709255, 1.        , 0.2236068 ],
       [0.56694671, 0.2236068 , 1.        ]])

In [4]:
from sklearn.metrics import pairwise_distances
# Print the full pairwise distance matrix between the documents once per
# metric: first manhattan (L1), then euclidean (L2).
for m in ('manhattan', 'euclidean'):
    print(m)
    distance_matrix = pairwise_distances(doc_term_matrix, metric=m)
    print(distance_matrix)

manhattan
[[0. 6. 5.]
 [6. 0. 7.]
 [5. 7. 0.]]
euclidean
[[0.         2.44948974 2.23606798]
 [2.44948974 0.         2.64575131]
 [2.23606798 2.64575131 0.        ]]


# kNN for Penguins

In [5]:
from sklearn import neighbors
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
    
# Number of neighbours that vote on each prediction.
n_neighbors = 15

# Load the penguin measurements and discard every row that has a missing value.
penguins = pd.read_csv('penguins_size.csv').dropna()

# Use only the two culmen measurements as inputs; restricting the model to
# two features makes the classification task more challenging.
feature_columns = ['culmen_length_mm', 'culmen_depth_mm']
X = penguins[feature_columns].to_numpy()

# Encode the species names as integer class labels.
le = LabelEncoder()
y = le.fit_transform(penguins['species'])

# Optional: standardise the features. kNN is distance based, so features on
# different scales would otherwise contribute unequally.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Build the k-nearest-neighbours classifier (every neighbour's vote counts
# equally) and fit it on the complete data set.
clf = neighbors.KNeighborsClassifier(n_neighbors=n_neighbors, weights='uniform')
clf.fit(X, y)

# NOTE: the predictions are made for the same samples the model was fitted
# on, so the value below is the training accuracy, not a held-out estimate.
y_pred = clf.predict(X)
accuracy_score(y, y_pred)

0.9700598802395209

In [6]:
# --->>> Your Turn <<<---
# Experiment with different settings for the parameters:
#  - n_neighbors
#  - weights 
#  - algorithm
#  - leaf_size
# (see: https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
# What do you observe?

In [None]:
# --->>> Your Turn <<<---
# Use radius-neighbors instead of k-nearest-neighbors

In [None]:
# --->>> Your Turn <<<---
# What do you think about the generalization ability of the k-nearest-neighbors algorithm?

# It memorizes, but does not generalize
# kNN does not have a training step (only storing of information)!
# kNN has a big problem with the curse of dimensionality!