In [4]:
#15.1 Finding an Observation’s Nearest Neighbors

# Load libraries
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Load data
iris = datasets.load_iris()
features= iris.data

# Create standardizer
standardizer = StandardScaler()

# Standardize features
features_standardized = standardizer.fit_transform(features)

# Two nearest neighbors
nearest_neighbors = NearestNeighbors(n_neighbors=2).fit(features,features_standardized)

# Create an observation
new_observation = [1, 1, 1, 1]

# Find distances and indices of the observation's nearest neighbors
distances, indices = nearest_neighbors.kneighbors([new_observation])

# View the nearest neighbors
features_standardized[indices]


array([[[-1.62768839, -1.74335684, -1.39706395, -1.18381211],
        [-1.87002413, -0.13197948, -1.51073881, -1.44707648]]])

In [6]:
# Find two nearest neighbors based on Euclidean distance
nearestneighbors_euclidean  = NearestNeighbors(
    n_neighbors=2,metric='euclidean'
).fit(features_standardized)

# View distances
distances

array([[3.81051178, 3.96358424]])

In [11]:
# Find each observation's three nearest neighbors
# based on Euclidean distance (including itself)
nearestneighbors_euclidean = NearestNeighbors(
    n_neighbors=3, metric='euclidean'
).fit(features_standardized)

# List of lists indicating each observation's three nearest neighbors
# (including itself)
nearest_neighbors_with_self = nearestneighbors_euclidean.kneighbors_graph(
    features_standardized
).toarray()

# Remove 1s marking an observation is a nearest neighbor to itself
for i, x in enumerate(nearest_neighbors_with_self):
    x[i] = 0

# View first observation's two nearest neighbors
nearest_neighbors_with_self[0]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.])

In [13]:
#15.2 Creating a K-Nearest Neighbors Classifier

# Load libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

# Load data
iris = datasets.load_iris()
x = iris.data
y = iris.target

# Create standardizer
standardizer = StandardScaler()

# Standardize features
x_std = standardizer.fit_transform(x)

# Train a KNN classifier with 5 neighbors
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1,).fit(x_std, y)

# Create two observations
new_observation = [[0.75, 0.75, 0.75, 0.75],
                   [1, 1, 1 ,1]]

# Predict the class of two observations
knn.predict(new_observation)

array([1, 2])

In [None]:
# View probability that each observation is one of three classes
knn.predict_proba(new_observation) # کلاس با بیشترین احتمال به عنوان کلاس پیش‌بینی شده انتخاب می‌شود

array([[0. , 0.6, 0.4],
       [0. , 0. , 1. ]])

In [15]:
knn.predict(new_observation)

array([1, 2])

In [None]:
#15.3 Identifying the Best Neighborhood Size

# Load libraries
from sklearn.neighbors import KNeighborsClassifier
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import GridSearchCV

# Load data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create standardizer
standardizer = StandardScaler()

# Create a KNN classifier
knn = KNeighborsClassifier(n_neighbors=5, n_jobs=-1)

# Create a pipeline
pipe = Pipeline([('standardizer', standardizer),('knn', knn)])
# تضمین می‌کند که هر بار که داده‌ها وارد مدل می‌شوند، ابتدا استانداردسازی اعمال شود

# Create space of candidate values
search_space = [{'knn__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9 ,10]}]

# Create grid search
classifier = GridSearchCV(
    pipe, search_space, cv=5, verbose=0
).fit(features_standardized, target)

# Best neighborhood size (k)
classifier.best_estimator_.get_params()['knn__n_neighbors']

6

In [17]:
#15.4 Creating a Radius-Based Nearest Neighbors Classifier

# Load libraries
from sklearn.neighbors import RadiusNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn import datasets

# Load data
iris = datasets.load_iris()
features = iris.data
target = iris.target

# Create standardizer
standardizer = StandardScaler()

# Standardize features
features_standardized = standardizer.fit_transform(features)

# Train a radius neighbors classifier
rnn = RadiusNeighborsClassifier(
    radius=.5, n_jobs=-1
).fit(features_standardized, target)

# Create two observations
new_observation = [[1, 1, 1, 1]]

# Predict the class of two observations
rnn.predict(new_observation)
# کلاس این مشاهده بر اساس تمام مشاهدات داخل شعاع مشخص شده پیش‌بینی می‌شود

array([2])

In [None]:
#15.5 Finding Approximate Nearest Neighbors

import faiss
import numpy as np
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Load data
iris = datasets.load_iris()
features = iris.data

# Create standardizer
standardizer = StandardScaler()

# Standardize features
features_standardized = standardizer.fit_transform(features)

# Set faiss parameters
n_features = features_standardized.shape[1]
nlist = 3  # تعداد خوشه‌ها
k = 2  # تعداد همسایه‌های نزدیک که می‌خواهیم پیدا کنیم

# Create an IVF index
quantizer = faiss.IndexFlatIP(n_features) # محاسبه فاصله داخلی بین بردارها
index = faiss.IndexIVFFlat(quantizer, n_features, nlist) # شاخصی که با خوشه بندی فضا جستجو سرعت را بالا میبره

# Train the index and add feature vectors
index.train(features_standardized)
index.add(features_standardized)

# Create an observation
new_observation = np.array([[1, 1, 1, 1]])

# Search the index for the 2 nearest neighbors
distances, indices = index.search(new_observation, k)

# Show the feature vectors for the two nearest neighbors
np.array([list(features_standardized[i]) for i in indices[0]])

array([[1.03800476, 0.55861082, 1.10378283, 1.18556721],
       [0.79566902, 0.32841405, 0.76275827, 1.05393502]])

In [None]:
#15.6 Evaluating Approximate Nearest Neighbors

# Load libraries
import faiss
import numpy as np
from sklearn import datasets
from sklearn.neighbors import NearestNeighbors
from sklearn.preprocessing import StandardScaler

# Number of nearest neighbors
k = 10

# Load data
iris = datasets.load_iris()
features = iris.data

# Create standardizer
standardizer = StandardScaler()

# Standardize features
features_standardized = standardizer.fit_transform(features)

# Create KNN with 10 NN
nearest_neighbors = NearestNeighbors(n_neighbors=k).fit(features_standardized)

# Set faiss parameters
n_features = features_standardized.shape[1]
nlist = 3

# Create an IVF index
quantizer = faiss.IndexFlatIP(n_features)
index = faiss.IndexIVFFlat(quantizer, n_features, nlist)

# Train the index and add feature vectors
index.train(features_standardized)
index.add(features_standardized)
index.nprobe = 1 # در زمان جست‌وجو فقط ۱ خوشه کاوش می‌ شود

# Create an observation
new_observation = np.array([[1, 1, 1 ,1]])

# Find distances and indices of the observation's exact nearest neighbors
knn_distances, knn_indices = nearest_neighbors.kneighbors(new_observation)

# Search the index for the two nearest neighbors
ivf_distances, ivf_indices = index.search(new_observation, k)

# Get the set overlap
recalled_items = set(list(knn_distances[0])) & set(list(ivf_distances[0]))

# Print the recall
print(f"Recall @k={k}: {len(recalled_items)/k * 100}%")

Recall @k=10: 0.0%
