### Load the libraries

In [2]:
from pathlib import Path
import numpy as np
import pandas as pd
import faiss

import datasets

### Load the embeddings of the image corpus

In [3]:
# Read the pre-computed embeddings back from disk and turn the 'embeddings'
# column into a float32 matrix (the dtype FAISS works with).
dset = datasets.load_from_disk("../data/processed_embeddings")
embeddings_f32 = np.array(dset['embeddings']).astype('float32')

# Drop exact duplicate rows. NOTE(review): np.unique(axis=0) returns the rows
# in sorted order, so row i of `corpus` is not necessarily row i of `dset`.
corpus = np.unique(embeddings_f32, axis=0)

In [4]:
# (number of unique vectors, embedding width)
corpus.shape

(24958, 512)

In [28]:
# Synthetic stand-in for the image corpus, used to exercise FAISS without the
# real data. NOTE: running this cell replaces the `corpus` loaded from disk;
# skip (or remove) it when working with the actual embeddings.
SEED = 42
dimension = 512


def make_random_corpus(n_vectors, dim, seed=SEED):
    """Return the unique rows of a seeded random float32 matrix.

    Args:
        n_vectors: number of rows to draw.
        dim: width of each row.
        seed: seed for the NumPy generator, so repeated runs give the same data.

    Returns:
        float32 array of shape (<= n_vectors, dim) with values in [0, 1).
        Rows are de-duplicated and, as a consequence of ``np.unique``, sorted.
    """
    rng = np.random.default_rng(seed)
    # Draw float32 directly instead of creating float64 and casting afterwards.
    vectors = rng.random((n_vectors, dim), dtype=np.float32)
    return np.unique(vectors, axis=0)


corpus = make_random_corpus(100_000, dimension)


In [4]:
# Peek at the matrix; NumPy elides the middle rows and columns in the repr.
corpus

array([[-0.8442147 ,  0.1623359 ,  0.3072559 , ...,  0.40091127,
        -0.20468342, -0.11151588],
       [-0.82529676,  0.26502204,  0.05470146, ...,  0.27426067,
        -0.56668615, -0.08001572],
       [-0.8063217 , -0.19098167, -0.22377765, ...,  0.2548957 ,
        -0.12389499,  0.27585742],
       ...,
       [ 0.9416133 ,  0.25078115,  0.20232391, ...,  0.35934162,
        -0.11814432, -0.4162678 ],
       [ 0.9556082 ,  0.44172812,  0.06752564, ...,  0.00269235,
         0.08827013, -0.36235547],
       [ 1.0331686 ,  0.42330503, -0.11260805, ...,  0.16301963,
         0.00906937, -0.25052726]], dtype=float32)

In [5]:
#new_data = np.unique(photo_features, axis=0)


In [None]:
#len(new_data) , len(photo_features)

### Create the <a href="https://github.com/facebookresearch/faiss">FAISS</a> index
<br/>FAISS builds the vector search index over the embeddings corpus.

In [32]:
# `xb` is the database matrix handed to FAISS; `d` is the width of one vector.
xb = corpus
d = corpus.shape[-1]

# Flat L2 index: stores the vectors as-is and searches them exhaustively.
index = faiss.IndexFlatL2(d)


In [33]:
# The flat index reports itself as trained without any train() call.
index.is_trained


True

In [34]:
# Start from an empty index so that re-running this cell cannot store the same
# vectors a second time, then confirm the index holds exactly one entry per row.
index.reset()
index.add(xb)
assert index.ntotal == len(xb)


In [35]:
# Query with the corpus itself: every vector is also stored in the index.
xq = xb


In [36]:
# Number of query vectors.
len(xq)


100000

In [37]:
k = 1  # number of nearest neighbours requested per query

In [38]:
%%timeit
# Latency of a one-row query (the first corpus vector) against the flat index.
D, I = index.search(xq[:1], k)   

73.6 ms ± 62.3 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)


In [39]:
%%time
# Search with every corpus vector as a query (xq is xb), k neighbours each.
D, I = index.search(xq, k)     

CPU times: user 8min 26s, sys: 399 ms, total: 8min 27s
Wall time: 2min 53s


In [17]:
# Distances returned by the search, one row per query and k columns.
D


array([[0.0000000e+00],
       [7.6293945e-05],
       [0.0000000e+00],
       ...,
       [6.1035156e-05],
       [0.0000000e+00],
       [0.0000000e+00]], dtype=float32)

In [18]:
# Ids of the neighbours returned by the search, one row per query and k columns.
I

array([[    0],
       [    1],
       [    2],
       ...,
       [24955],
       [24956],
       [24957]])

In [19]:
# A query is a "hit" when its top-1 neighbour is the vector itself, i.e. the
# returned id equals the query's own row number.
z = I[:, 0] == np.arange(len(xq))

In [20]:
# (self-matches, total queries, misses)
hit_count = z.sum()
hit_count, len(z), len(z) - hit_count

(24957, 24958, 1)

In [21]:
# Row numbers of the queries that retrieved themselves.
np.arange(len(xq))[z]

array([    0,     1,     2, ..., 24955, 24956, 24957])

In [None]:
# Boolean mask: True where the top-1 id equals the query's own row number.
z

In [22]:
# Row numbers of the queries whose top-1 neighbour was a different vector.
np.where(~z)

(array([3766]),)

In [23]:
# Distance reported for query 3766, the single miss in the recorded run.
# NOTE(review): the row number is hard-coded from that run's output, so it is
# only meaningful for the same corpus.
D[3766]

array([0.], dtype=float32)

In [24]:
# Id returned for query 3766; in the recorded run it is the adjacent row 3765
# at distance 0, i.e. the two rows are indistinguishable at this precision.
I[3766]

array([3765])

## FAISS IVF (inverted file) index

In [None]:
# Coarse quantizer: a flat L2 index used by the IVF index to assign vectors
# to clusters.
quantizer = faiss.IndexFlatL2(d)

nlist = 20  # number of clusters (inverted lists)

# NOTE: this rebinds `index`, replacing the flat index built earlier.
index = faiss.IndexIVFFlat(quantizer, d, nlist, faiss.METRIC_L2)

In [None]:
# The freshly built IVF index is untrained; train it on the corpus and check
# that the trained flag flips.
assert not index.is_trained
index.train(xb)
assert index.is_trained

In [None]:
# Every corpus vector doubles as a query.
xq = xb

# Empty the index first so that re-running this cell does not store the corpus
# twice; reset() removes the stored vectors but keeps the trained state.
index.reset()
index.add(xb)
assert index.is_trained
assert index.ntotal == len(xb)


In [None]:
%%timeit


# Latency of a one-row query on the IVF index when a single cluster is probed.
index.nprobe = 1              # default nprobe is 1

D, I = index.search(xq[:1], k)     # actual search

In [None]:
%%time


# Search with every corpus vector as a query, probing a single cluster each.
index.nprobe = 1              # default nprobe is 1

D, I = index.search(xq, k)     # actual search

In [None]:
# Self-match check for the preceding search: a hit is a query whose top-1 id
# equals its own row number. Displays (hits, total queries, misses).
z = I[:, 0] == np.arange(len(xq))
z.sum(), len(z), len(z) - z.sum()

In [None]:
%%time
# Repeat the full search while probing more clusters per query.
index.nprobe = 5              # probe 5 clusters per query (the default nprobe is 1)

D, I = index.search(xq, k)    

In [None]:
# Self-match check for the nprobe = 5 search: a hit is a query whose top-1 id
# equals its own row number. Displays (hits, total queries, misses).
z = I[:, 0] == np.arange(len(xq))
z.sum(), len(z), len(z) - z.sum()