# 3. Visual search with k-NN

## Loading the image dataset

In [None]:
import numpy as np

# Load the .npz archive. np.load returns a lazy NpzFile that keeps the
# underlying file open, so everything is read inside a `with` block, which
# closes the handle once the arrays have been pulled into memory.
# After this cell `data` is closed; use the extracted arrays below instead.
with np.load('dataset_features.npz') as data:
    # List all arrays within the .npz file
    print(data.files)

    # Features and labels of the training split
    X_train = data['trainset_features']
    y_train = data['trainset_labels']

    # Features and labels of the validation split
    X_val = data['validset_features']
    y_val = data['validset_labels']

    # Features and labels of the test split
    X_test = data['testset_features']
    y_test = data['testset_labels']

    # Class names stored alongside the features
    class_labels = data['class_labels']


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# k-NN pipeline: standardise the features, then classify by nearest neighbours
knn_pipe = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=7)),
    ]
)

In [None]:
import pandas as pd

# Candidate neighbour counts: 1, 2, ..., 49
k_values = np.arange(1, 50, step=1)

# One record per candidate k, filled in by the loop below
records = []

for k in k_values:
    # Refit the shared pipeline with the current number of neighbours
    knn_pipe.set_params(knn__n_neighbors=k).fit(X_train, y_train)

    # Accuracy on the training and validation splits for this k
    train_acc = knn_pipe.score(X_train, y_train)
    valid_acc = knn_pipe.score(X_val, y_val)
    records.append(
        {"k": k, "train_accuracy": train_acc, "valid_accuracy": valid_acc}
    )

# Tabulate the results and display the rows with the best validation accuracy
gs_results = pd.DataFrame(records)
gs_results.sort_values(by="valid_accuracy", ascending=False).head()

In [None]:
import matplotlib.pyplot as plt

# Validation curves: accuracy on each split as a function of k
for column, curve_label in (
    ("train_accuracy", "train curve"),
    ("valid_accuracy", "validation curve"),
):
    plt.plot(gs_results["k"], gs_results[column], label=curve_label)

plt.xlabel("k")
plt.ylabel("accuracy")
plt.grid(True)
plt.legend()

Low values of k give high training accuracy, but in this region the model is overfitting, as shown by the accuracy gap between the training and validation sets.

For higher values of k, accuracy on both the training and validation sets decreases, since the model is now underfitting (its bias increases).

The tuned classifier, i.e. the one that maximizes accuracy on the validation set, has k=7.

In [None]:
# Pick the k with the highest validation accuracy from the grid-search table
# instead of typing it in by hand, so the tuned model always matches what the
# search actually found. idxmax returns the first maximum, and the table is
# ordered by increasing k, so ties go to the smallest k.
best_k = int(gs_results.loc[gs_results["valid_accuracy"].idxmax(), "k"])
print(f"Selected k = {best_k}")

# Create the tuned k-NN pipeline (same steps as the grid-search pipeline)
knn_pipe_tuned = Pipeline(
    [("scaler", StandardScaler()),
     ("knn", KNeighborsClassifier(n_neighbors=best_k))]
)

# Fit on the training split; the fitted pipeline is the cell's output
knn_pipe_tuned.fit(X_train, y_train)

## Scores of tuned model

In [None]:
# Accuracy of the tuned pipeline on each split
acc_train = knn_pipe_tuned.score(X_train, y_train)
acc_val = knn_pipe_tuned.score(X_val, y_val)
acc_test = knn_pipe_tuned.score(X_test, y_test)

# Report the three scores with three decimals
print('Model Accuracy:')
print(f'On train set: {acc_train:.3f}')
print(f'On valid set: {acc_val:.3f}')
print(f'On test  set: {acc_test:.3f}')

In [None]:
import pandas as pd
import pickle

# One-row summary table holding the k-NN test accuracy
results_accuracy = pd.DataFrame(
    {
        'model': ['k-nn'],
        'test_accuracy': [acc_test],
    }
)

# Serialise the table to disk with pickle
with open('model_accuracy.pickle', 'wb') as out_file:
    pickle.dump(results_accuracy, out_file)

## Classification Report

In [None]:
from sklearn.metrics import classification_report

# Predict the test set with the tuned pipeline
y_test_preds = knn_pipe_tuned.predict(X_test)

# Build the text report, labelling rows with the class names, and print it
report = classification_report(y_test, y_test_preds, target_names=class_labels)
print(report)

Precision: Measures the accuracy of positive predictions for each class. It is the proportion of true positives out of all instances classified as that class. 

Recall: Measures the ability of the classifier to identify all true positives for each class. It is the proportion of true positives out of all actual instances of that class. 

Support: The number of actual instances in each class. Categories like bike (33), car (32), and motorcycle (25) have a higher number of samples, which typically leads to more reliable metrics.

## Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Class names used for the axis ticks.
# NOTE(review): this overwrites the `class_labels` array loaded from the .npz
# file earlier; the order is assumed to match the label encoding -- confirm.
class_labels = ['bike', 'car', 'motorcycle', 'other', 'truck', 'van']

# Reduce each label row to an integer class index (position of its largest entry)
y_test_decoded = np.argmax(y_test, axis=1)
y_pred_decoded = np.argmax(y_test_preds, axis=1)

# Compute the confusion matrix. The full index range is passed as `labels` so
# the matrix always has one row/column per class, even when a class never
# appears in the labels or predictions; otherwise the tick labels below could
# end up attached to the wrong rows/columns.
cm = confusion_matrix(
    y_test_decoded,
    y_pred_decoded,
    labels=np.arange(len(class_labels)),
)

# Plot confusion matrix as heatmap.
# fmt="d" prints the integer counts as-is; the default ".2g" format would show
# counts of 100 or more in scientific notation.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel("Predicted Label")
plt.ylabel("True Label")
plt.title("Confusion Matrix of Test Dataset")
plt.show()

## Correct classifications and nearest neighbours

In [None]:
import numpy as np

# Positions in the test set where the decoded prediction equals the decoded label
is_correct = y_test_decoded == y_pred_decoded
(correct_indices,) = np.where(is_correct)
print(f'Number of correctly classified images: {len(correct_indices)}')

In [None]:
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Target image size and batch size shared by both directory iterators
image_size = (224, 224)
batch_size = 32

# Generator that rescales pixel values by 1/255
datagen = ImageDataGenerator(rescale=1./255)  # Adjust rescaling if needed

# Same options for both splits; shuffling is disabled for both iterators
flow_options = dict(target_size=image_size, batch_size=batch_size, shuffle=False)
test_images = datagen.flow_from_directory('./test', **flow_options)
train_images = datagen.flow_from_directory('./train', **flow_options)

Select an image index

In [None]:
from util import get_image_by_index

# Pick one of the correctly classified test images by its position in the list
idx = 10
chosen_index = correct_indices[idx]

# Fetch that image and its label from the test iterator, then display it
image, label = get_image_by_index(test_images, chosen_index)
plt.imshow(image)

# Summary of the selection
print('Selection of correctly classified image')
print(f'Index: {chosen_index}')
print(f'Label: {class_labels[label]}')

Let's now find the training images nearest to the selected test image.

In [None]:
# Pull the two fitted steps out of the tuned pipeline
scaler, knn = (knn_pipe_tuned.named_steps[step] for step in ('scaler', 'knn'))

# Scaler-only pipeline, used to transform the test features without
# running the classifier step
knn_scale = Pipeline(steps=[("scaler", scaler)])

X_test_transf = knn_scale.transform(X_test)

In [None]:
from sklearn.neighbors import NearestNeighbors

# Query the fitted k-NN step for the 10 neighbours of the chosen test image.
# The scaled feature row is passed as a one-row 2-D array.
query = X_test_transf[chosen_index, :].reshape(1, -1)

# kneighbors returns one row of results per query row; keep the only row
nearest_distances, nearest_indices = (
    result[0] for result in knn.kneighbors(query, n_neighbors=10)
)

print(f'Nearest distances: {nearest_distances}')
print(f'Nearest indices  : {nearest_indices}')

In [None]:
# One row of 11 panels: the test image followed by its 10 neighbours
fig, axes = plt.subplots(1, 11, figsize=(20, 2.5))

# Leftmost panel: the chosen test image
chosen_image, chosen_label = get_image_by_index(test_images, chosen_index)
query_ax = axes[0]
query_ax.imshow(chosen_image)
query_ax.set_title("Test Image")
query_ax.axis("off")

# Remaining panels: neighbours fetched from the training iterator,
# each titled with its distance rounded to zero decimals
for ax, nearest_index, distance in zip(axes[1:], nearest_indices, nearest_distances):
    nearest_image, nearest_label = get_image_by_index(train_images, nearest_index)
    ax.imshow(nearest_image)
    ax.set_title(f"d: {distance:.0f}")
    ax.axis("off")

plt.suptitle("10 Nearest Neighbors of a Correctly Classified Test Image")
plt.show()


## Incorrect classifications and nearest neighbours

In [None]:
# Positions in the test set where the decoded prediction differs from the decoded label
is_incorrect = y_test_decoded != y_pred_decoded
(incorrect_indices,) = np.where(is_incorrect)
print(f'Number of incorrectly classified images: {len(incorrect_indices)}')

Select an image index

In [None]:
# Pick one of the misclassified test images by its position in the list
idx = 0
chosen_index = incorrect_indices[idx]

# Fetch that image and its label from the test iterator, then display it
image, label = get_image_by_index(test_images, chosen_index)
plt.imshow(image)

# The image comes from `incorrect_indices`, so the message says "incorrectly"
print('Selection of incorrectly classified image')
print(f'Index: {chosen_index}')
print(f'Label: {class_labels[label]}')

Let's now find the training images nearest to the selected test image.

In [None]:
from sklearn.neighbors import NearestNeighbors

# Query the fitted k-NN step for the 10 neighbours of the chosen test image.
# The scaled feature row is passed as a one-row 2-D array.
query_row = X_test_transf[chosen_index, :].reshape(1, -1)
neighbor_result = knn.kneighbors(query_row, n_neighbors=10)

# Results hold one row per query row; keep the only row of each
nearest_distances = neighbor_result[0][0]
nearest_indices = neighbor_result[1][0]

print(f'Nearest distances: {nearest_distances}')
print(f'Nearest indices  : {nearest_indices}')

In [None]:
# One row of 11 panels: the test image followed by its 10 neighbours
fig, axes = plt.subplots(1, 11, figsize=(20, 2.5))

# Leftmost panel: the chosen (misclassified) test image
chosen_image, chosen_label = get_image_by_index(test_images, chosen_index)
axes[0].imshow(chosen_image)
axes[0].set_title("Test Image")
axes[0].axis("off")

# Plot the 10 nearest neighbors from the training set,
# each titled with its distance rounded to zero decimals
for i, nearest_index in enumerate(nearest_indices, start=1):

    nearest_image, nearest_label = get_image_by_index(train_images, nearest_index)
    axes[i].imshow(nearest_image)
    axes[i].set_title(f"d: {nearest_distances[i-1]:.0f}")
    axes[i].axis("off")

# This figure shows a misclassified image, so the title says "Incorrectly"
plt.suptitle("10 Nearest Neighbors of an Incorrectly Classified Test Image")
plt.show()
