In [2]:
import numpy as np
import pickle
from tqdm import tqdm, tqdm_notebook
import random
import time
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
import PIL
from PIL import Image
from sklearn.neighbors import NearestNeighbors

import glob
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
%matplotlib inline

In [3]:
# Load the image file names, their feature vectors and their class ids.
# NOTE(security): pickle can execute arbitrary code while loading, so these
# files must come from a trusted feature-extraction run.
# Context managers close each file handle as soon as it has been read; the
# bare open(...) calls used before left the handles open.
with open('data/filenames-caltech101.pickle', 'rb') as filenames_file:
    filenames = pickle.load(filenames_file)
with open('data/features-caltech101-mobilenet.pickle', 'rb') as features_file:
    feature_list = pickle.load(features_file)
with open('data/class_ids-caltech101.pickle', 'rb') as class_ids_file:
    class_ids = pickle.load(class_ids_file)

In [4]:
# Dataset size comes from the filename list; the feature dimensionality is
# read off the first feature vector.
num_images, num_features_per_image = len(filenames), len(feature_list[0])
print("Number of images = ", num_images)
print("Number of features per image = ", num_features_per_image)

Number of images =  6411
Number of features per image =  1024


In [5]:
# Pick one image to act as the query in the single-image benchmarks.
# random.randrange excludes the upper bound, so the index is always valid.
# random.randint(0, num_images) includes num_images itself, which would make
# feature_list[random_image_index] raise IndexError.
random_image_index = random.randrange(num_images)

Brute force nearest neighbour on one image

In [10]:
%timeit NearestNeighbors(n_neighbors=5, algorithm='brute', metric='euclidean').fit(feature_list)
neighbors = NearestNeighbors(n_neighbors=5,
                             algorithm='brute',
                             metric='euclidean').fit(feature_list)

31.9 ms ± 4.78 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [11]:
# Time a nearest-neighbour lookup for a single query vector (the image at
# random_image_index) against whichever index is currently bound to
# `neighbors`.
%timeit neighbors.kneighbors([feature_list[random_image_index]])

76.7 ms ± 7.07 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


k-d Tree Algorithm on one image

In [8]:

%timeit NearestNeighbors(n_neighbors=5, algorithm='kd_tree').fit(feature_list)
neighbors = NearestNeighbors(n_neighbors=5,
                             algorithm='kd_tree').fit(feature_list)

2.79 s ± 216 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [9]:
# Time one single-vector query (the image at random_image_index) on the
# index currently held in `neighbors`.
%timeit neighbors.kneighbors([feature_list[random_image_index]])

14.7 ms ± 992 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


Ball Tree Algorithm on one image

In [12]:
%timeit NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(feature_list)
neighbors = NearestNeighbors(n_neighbors=5,
                             algorithm='ball_tree').fit(feature_list)

1.95 s ± 148 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:

# Time a query for the single feature vector at random_image_index; the
# index used is whatever `neighbors` was last assigned to.
%timeit neighbors.kneighbors([feature_list[random_image_index]])

11.4 ms ± 951 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [14]:

# Draw 1000 distinct image indices and gather their feature vectors to serve
# as the query batch for the "set of images" benchmarks.
random_image_indices = random.sample(range(num_images), 1000)
random_feature_list = [feature_list[idx] for idx in random_image_indices]

 Brute Force Algorithm on a set of images

In [13]:
neighbors = NearestNeighbors(n_neighbors=5,
                             algorithm='brute',
                             metric='euclidean').fit(feature_list)
%timeit neighbors.kneighbors(feature_list)

1.77 s ± 25.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


 k-d Tree Algorithm on a set of images

In [16]:
neighbors = NearestNeighbors(n_neighbors=5,
                             algorithm='kd_tree').fit(feature_list)
%timeit neighbors.kneighbors(random_feature_list)

14.3 s ± 368 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


 Ball Tree Algorithm on a set of images

In [None]:
neighbors = NearestNeighbors(n_neighbors=5,
                             algorithm='ball_tree').fit(feature_list)
%timeit neighbors.kneighbors(random_feature_list)

# PCA

In [5]:
# Target dimensionality for PCA: 100 components, capped by the number of
# images and by the length of the original feature vectors.
num_feature_dimensions = min(num_images, 100, len(feature_list[0]))

Train PCA

In [6]:
# Fit PCA on every feature vector, project the same vectors onto the
# retained components, and store the result as nested Python lists.
pca = PCA(n_components=num_feature_dimensions).fit(feature_list)
feature_list_compressed = pca.transform(feature_list).tolist()

In [7]:
# Show the share of variance explained by each of the first 20 components.
print(pca.explained_variance_ratio_[:20])

[0.04808487 0.03902655 0.03520469 0.0229082  0.01972816 0.01739131
 0.01555806 0.01291987 0.01241799 0.01133363 0.01028689 0.00926208
 0.00849703 0.00841846 0.00785455 0.00749922 0.00687655 0.00677042
 0.00629248 0.00604648]


PCA + Brute Force Algorithm on one image

In [20]:
%timeit NearestNeighbors(n_neighbors=5, algorithm='brute', metric='euclidean').fit(feature_list_compressed)
neighbors = NearestNeighbors(n_neighbors=5,
                             algorithm='brute',
                             metric='euclidean').fit(feature_list_compressed)

30.7 ms ± 3.38 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [22]:
# Time a lookup for one PCA-compressed query vector (the image at
# random_image_index) against the index currently bound to `neighbors`.
%timeit neighbors.kneighbors([feature_list_compressed[random_image_index]])


2.88 ms ± 580 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


PCA + k-d Tree Algorithm on one image

In [23]:
%timeit NearestNeighbors(n_neighbors=5, algorithm='kd_tree').fit(feature_list_compressed)
neighbors = NearestNeighbors(n_neighbors=5,
                             algorithm='kd_tree').fit(feature_list_compressed)

194 ms ± 8.82 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [24]:
# Time one single-vector query using the compressed features of the image
# at random_image_index, on the index currently held in `neighbors`.
%timeit neighbors.kneighbors([feature_list_compressed[random_image_index]])


1.81 ms ± 82.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


PCA + Ball Tree Algorithm on one image

In [25]:
%timeit NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(feature_list_compressed)
neighbors = NearestNeighbors(
    n_neighbors=5, algorithm='ball_tree').fit(feature_list_compressed)

117 ms ± 8.84 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [26]:
# Time a query for the single compressed feature vector at
# random_image_index; the index used is whatever `neighbors` was last
# assigned to.
%timeit neighbors.kneighbors([feature_list_compressed[random_image_index]])

1.47 ms ± 84 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [10]:
# Draw a fresh sample of 1000 distinct image indices and gather the matching
# PCA-compressed vectors to use as a query batch.
random_image_indices = random.sample(range(num_images), 1000)
random_feature_list_compressed = [
    feature_list_compressed[idx] for idx in random_image_indices
]

PCA + Brute Force Algorithm on a set of images

In [11]:
neighbors = NearestNeighbors(n_neighbors=5,
                             algorithm='brute',
                             metric='euclidean').fit(feature_list_compressed)
%timeit neighbors.kneighbors(feature_list_compressed)

1.57 s ± 69.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


PCA + k-d Tree Algorithm on a set of images

In [12]:
neighbors = NearestNeighbors(n_neighbors=5,
                             algorithm='kd_tree').fit(feature_list_compressed)
%timeit neighbors.kneighbors(feature_list_compressed)

11.1 s ± 273 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


PCA + Ball Tree Algorithm on a set of images

In [33]:
neighbors = NearestNeighbors(
    n_neighbors=5, algorithm='ball_tree').fit(feature_list_compressed)
%timeit neighbors.kneighbors(random_feature_list_compressed)

1.62 s ± 220 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


# Annoy

In [3]:
from annoy import AnnoyIndex

ModuleNotFoundError: No module named 'annoy'

In [2]:
# Time the indexing for Annoy
# The index dimensionality must equal the length of the vectors passed to
# add_item, so it is derived from the data instead of being hard-coded to
# 2048 (the recorded output reports 1024 features per image).
# 'angular' is Annoy's default metric; it is passed explicitly because
# recent Annoy releases warn when the metric is omitted.
t = AnnoyIndex(len(feature_list[0]), 'angular')
# The measurement covers adding the items only; building the trees below is
# not included.
starttime = time.time()
for i in range(num_images):
    feature = feature_list[i]
    t.add_item(i, feature)
endtime = time.time()
print(endtime - starttime)
t.build(40)  # 40 trees
t.save('data/caltech101index.ann')

NameError: name 'AnnoyIndex' is not defined

Annoy on one image

In [None]:
u = AnnoyIndex(2048)
%timeit u.get_nns_by_vector(feature_list[random_image_index], 5, include_distances=True)
indexes = u.get_nns_by_vector(feature_list[random_image_index],
                              5,
                              include_distances=True)

In [None]:
def calculate_annoy_time():
    """Run the same Annoy query 100 times in a row.

    Every iteration asks the global index ``u`` for the 5 nearest neighbours
    (with distances) of ``feature_list[random_image_index]``. The results are
    discarded and nothing is returned; the function only wraps the loop so
    that it can be timed as a whole.
    """
    remaining = 100
    while remaining > 0:
        u.get_nns_by_vector(feature_list[random_image_index],
                            5,
                            include_distances=True)
        remaining -= 1

Annoy on a set of images

In [None]:
# Report the time taken by one call to calculate_annoy_time(), i.e. by its
# loop of 100 Annoy queries. Which definition runs depends on which
# calculate_annoy_time cell was executed most recently.
%time calculate_annoy_time()

PCA + Annoy

In [None]:
# Build an Annoy index over the PCA-compressed feature vectors and time how
# long adding the items takes (tree building below is not measured).
starttime = time.time()
# Length of item vector that will be indexed. 'angular' is Annoy's default
# metric; it is passed explicitly because recent Annoy releases warn when the
# metric is omitted.
t = AnnoyIndex(num_feature_dimensions, 'angular')

for i in range(num_images):
    feature = feature_list_compressed[i]
    t.add_item(i, feature)
endtime = time.time()
print(endtime - starttime)
t.build(40)  # 40 trees
# NOTE: this is the same path the uncompressed-feature index is saved to, so
# that file is overwritten here.
t.save('data/caltech101index.ann')

PCA + Annoy for one image

In [None]:
u = AnnoyIndex(num_feature_dimensions)
%timeit u.get_nns_by_vector(feature_list_compressed[random_image_index], 5, include_distances=True)
indexes = u.get_nns_by_vector(feature_list_compressed[random_image_index],
                              5,
                              include_distances=True)

In [None]:
def calculate_annoy_time():
    """Run the same Annoy query on compressed features 100 times in a row.

    Every iteration asks the global index ``u`` for the 5 nearest neighbours
    (with distances) of ``feature_list_compressed[random_image_index]``. The
    results are discarded and nothing is returned; the function only wraps
    the loop so that it can be timed as a whole.
    """
    remaining = 100
    while remaining > 0:
        u.get_nns_by_vector(feature_list_compressed[random_image_index],
                            5,
                            include_distances=True)
        remaining -= 1

PCA + Annoy on a set of images

In [4]:
# Report the time taken by one call to calculate_annoy_time(). The function
# must already be defined in the running kernel; the recorded NameError below
# shows this cell was run before its definition.
%time calculate_annoy_time()

NameError: name 'calculate_annoy_time' is not defined