In [3]:
!pip install faiss-cpu

Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (5.1 kB)
Downloading faiss_cpu-1.12.0-cp312-cp312-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl (31.4 MB)
   ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 31.4/31.4 MB 61.3 MB/s eta 0:00:00
Installing collected packages: faiss-cpu
Successfully installed faiss-cpu-1.12.0


In [6]:
import faiss
import numpy as np

from sklearn.datasets import load_iris
from sklearn.neighbors import NearestNeighbors,KNeighborsClassifier,RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline,FeatureUnion
from sklearn.model_selection import GridSearchCV

15.1 : Finding an Observation's Nearest Neighbors

In [7]:
# Load the iris measurements and put every feature on a common scale.
iris = load_iris()
features = iris.data
standardizer = StandardScaler()
features_standardized = standardizer.fit_transform(features)

# Build a 2-nearest-neighbour index over the standardized observations.
nearest_neightbors = NearestNeighbors(n_neighbors=2)
nearest_neightbors.fit(features_standardized)

# Query the index with a single new observation (wrapped in a list so it is
# passed as one row) and display the rows of its two nearest neighbours.
new_observation = [1, 1, 1, 1]
distances, indices = nearest_neightbors.kneighbors([new_observation])
features_standardized[indices]

array([[[1.03800476, 0.55861082, 1.10378283, 1.18556721],
        [0.79566902, 0.32841405, 0.76275827, 1.05393502]]])

Discussion

In [8]:
# Fit a 2-nearest-neighbour index with an explicitly Euclidean metric, then
# query it so that the distances displayed come from this model instead of
# being a value left over from a previously executed cell.
nearestneighbors_euclidean = NearestNeighbors(n_neighbors=2,metric='euclidean').fit(features_standardized)
distances,indices = nearestneighbors_euclidean.kneighbors([new_observation])
distances

array([[0.49140089, 0.74294782]])

In [9]:
# Fit a 3-nearest-neighbour index using Euclidean distance.
nearestneighbors_euclidean = NearestNeighbors(n_neighbors=3, metric='euclidean')
nearestneighbors_euclidean.fit(features_standardized)

# Dense adjacency matrix: entry (i, j) is 1 when observation j is one of the
# neighbours returned for observation i.
nearest_neighbors_with_self = nearestneighbors_euclidean.kneighbors_graph(
    features_standardized
).toarray()

# Clear the diagonal so no observation is marked as its own neighbour.
np.fill_diagonal(nearest_neighbors_with_self, 0)

# Neighbour indicator row for the first observation.
nearest_neighbors_with_self[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

15.2 : Creating a K-Nearest Neighbors Classifier

In [10]:
# Load features and class labels.
iris = load_iris()
X = iris.data
y = iris.target

# Standardize the features before fitting the distance-based classifier.
standardizer = StandardScaler()
X_std = standardizer.fit_transform(X)

# 5-nearest-neighbour classifier; n_jobs=-1 is passed through to scikit-learn.
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)
knn.fit(X_std, y)

# Two new observations to classify.
new_observations = [
    [0.75, 0.75, 0.75, 0.75],
    [1, 1, 1, 1],
]
knn.predict(new_observations)

array([1, 2])

Discussion

In [11]:
# Per-class probabilities for each new observation (one row per observation,
# one column per class).
knn.predict_proba(new_observations)

array([[0. , 0.6, 0.4],
       [0. , 0. , 1. ]])

In [12]:
# Predicted class label for each new observation.
knn.predict(new_observations)

array([1, 2])

15.3 : Identifying the Best Neighborhood Size

In [13]:
# Load the raw features and labels. Scaling is deliberately NOT applied here:
# the pipeline below owns the StandardScaler, so it is re-fit on the training
# portion of every cross-validation split.
iris = load_iris()
features = iris.data
target = iris.target

# Pipeline: standardize, then classify with k-nearest neighbours.
standardizer = StandardScaler()
knn = KNeighborsClassifier(n_neighbors=5,n_jobs=-1)
pipe = Pipeline([("standardizer",standardizer),("knn",knn)])

# Candidate neighbourhood sizes to evaluate.
search_space = [{"knn__n_neighbors":[1,2,3,4,5,6,7,8,9,10]}]

# 5-fold grid search, fit on the raw features loaded in this cell so that the
# cell does not depend on a pre-scaled array created elsewhere in the notebook.
classifier = GridSearchCV(pipe,search_space,cv=5,verbose=0).fit(features,target)

Discussion

In [14]:
# Neighbourhood size selected by the grid search.
classifier.best_params_["knn__n_neighbors"]

6

15.4 : Creating a Radius-Based Nearest Neighbors Classifier

In [17]:
# Load features and class labels.
iris = load_iris()
features = iris.data
target = iris.target

# Standardize the features so the radius is measured on a common scale.
standardizer = StandardScaler()
features_standardized = standardizer.fit_transform(features)

# Radius-based classifier: configured with radius 0.5 around the query point.
rnn = RadiusNeighborsClassifier(radius=.5, n_jobs=-1)
rnn.fit(features_standardized, target)

# Classify one new observation.
new_observations = [[1, 1, 1, 1]]
rnn.predict(new_observations)

array([2])

15.5 : Finding Approximate Nearest Neighbors

In [18]:
# Load and standardize the iris features.
iris = load_iris()
features = iris.data
standardizer = StandardScaler()
features_standardized = standardizer.fit_transform(features)

# Separate float32, C-contiguous copy for faiss; the float64 array is left
# untouched for display and for any other cell that uses it.
features_faiss = np.ascontiguousarray(features_standardized, dtype=np.float32)

n_features = features_standardized.shape[1]
nlist = 3  # number of inverted-list cells (clusters)
k = 2      # number of neighbours to retrieve

# The IVF index is built with the L2 metric, so the coarse quantizer that
# assigns vectors to cells uses L2 as well (previously inner product).
quantizer = faiss.IndexFlatL2(n_features)
index = faiss.IndexIVFFlat(quantizer, n_features, nlist, faiss.METRIC_L2)
index.train(features_faiss)
index.add(features_faiss)

# Query with a single float32 observation and show the matched rows.
new_observation = np.array([[1, 1, 1, 1]], dtype=np.float32)
distances, indices = index.search(new_observation, k)
features_standardized[indices[0]]

array([[1.03800476, 0.55861082, 1.10378283, 1.18556721],
       [0.79566902, 0.32841405, 0.76275827, 1.05393502]])

15.6 : Evaluating Approximate Nearest Neighbors

In [19]:
# Number of neighbours compared between the exact and approximate searches.
k = 10

# Load and standardize the iris features.
iris = load_iris()
features = iris.data
standardizer = StandardScaler()
features_standardized = standardizer.fit_transform(features)

# Exact k-nearest-neighbour baseline from scikit-learn.
nearest_neightbors = NearestNeighbors(n_neighbors=k).fit(features_standardized)

# Separate float32, C-contiguous copy for faiss; the float64 array is left
# untouched for the scikit-learn baseline.
features_faiss = np.ascontiguousarray(features_standardized, dtype=np.float32)

n_features = features_standardized.shape[1]
nlist = 3  # number of inverted-list cells (clusters)

# The IVF index is built with the L2 metric, so the coarse quantizer that
# assigns vectors to cells uses L2 as well (previously inner product).
quantizer = faiss.IndexFlatL2(n_features)
index = faiss.IndexIVFFlat(quantizer, n_features, nlist, faiss.METRIC_L2)
index.train(features_faiss)
index.add(features_faiss)
index.nprobe = 1  # number of cells visited per query

# Run the same query through both searchers; faiss gets a float32 copy.
new_observation = np.array([[1,1,1,1]])
knn_distances,knn_indices = nearest_neightbors.kneighbors(new_observation)
ivf_distances,ivf_indices = index.search(new_observation.astype(np.float32),k)

# Recall@k: share of the exact neighbours that the approximate search returned.
recall_items = set(knn_indices[0]) & set(ivf_indices[0])
print(f"Recall @k ={k} : {len(recall_items)/k*100}%")

Recall @k =10 : 100.0%
