# Neural Information Retrieval Using Nearest Neighbor


Neural information retrieval (IR) uses embeddings to search for similar items in a search space built from the search corpus data.

<br/>
We will use the FAISS library from Facebook for our hands-on presentation, but you can look <a href="http://ann-benchmarks.com/">here</a> to find other solutions and benchmark data.


We will look at `performance` and `recall@1`

## Load the libraries

In [183]:
from pathlib import Path
import numpy as np
import pandas as pd
import faiss
import datasets
from elasticsearch import Elasticsearch
from elasticsearch.helpers import streaming_bulk
import tqdm
import rich
from sentence_transformers import SentenceTransformer
from IPython.display import Image, JSON
from IPython.core.display import HTML
import requests
import ipyplot
import time

## Load the embeddings of the image corpus

In [7]:
dset = datasets.load_from_disk("../data/processed_embeddings")

# These embeddings are used to create the search space.
# They are cast to float32 and de-duplicated row-wise; np.unique also returns
# the rows in sorted order, so row i of `corpus` is not row i of `dset`.
corpus = np.unique(
    np.array(dset['embeddings']).astype('float32'),
    axis=0,
)

In [8]:
corpus.shape

(24954, 512)

In [9]:
corpus

array([[-0.84421384,  0.16233554,  0.30725527, ...,  0.4009103 ,
        -0.20468222, -0.11151451],
       [-0.82529694,  0.26502076,  0.05470029, ...,  0.27426323,
        -0.56668675, -0.08001336],
       [-0.8063227 , -0.1909807 , -0.2237772 , ...,  0.25489452,
        -0.12389579,  0.2758569 ],
       ...,
       [ 0.9416138 ,  0.25078082,  0.20232335, ...,  0.359342  ,
        -0.11814606, -0.41626814],
       [ 0.95560956,  0.44172806,  0.06752466, ...,  0.00269201,
         0.08827078, -0.36235633],
       [ 1.0331681 ,  0.42330468, -0.11260845, ...,  0.16301972,
         0.00906926, -0.25052622]], dtype=float32)

In [10]:
dimension = corpus.shape[-1]
dimension

512

In [115]:
model_name = 'sentence-transformers/clip-ViT-B-32'

In [118]:
model = SentenceTransformer(model_name)

In [13]:
ELASTIC_HOST="localhost"
ELASTIC_INDEX="unsplash"
ELASTIC_PORT=9200

ELASTIC_FULL_URL =f"http://{ELASTIC_HOST}:{ELASTIC_PORT}"

In [14]:

client = Elasticsearch(
    [ELASTIC_FULL_URL]
)

In [92]:
?client.indices.create

[0;31mSignature:[0m
[0mclient[0m[0;34m.[0m[0mindices[0m[0;34m.[0m[0mcreate[0m[0;34m([0m[0;34m[0m
[0;34m[0m    [0;34m*[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mindex[0m[0;34m:[0m [0mstr[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0maliases[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mMapping[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mMapping[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mAny[0m[0;34m][0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0merror_trace[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mbool[0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[0m [0;32mNone[0m[0;34m,[0m[0;34m[0m
[0;34m[0m    [0mfilter_path[0m[0;34m:[0m [0mUnion[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0mList[0m[0;34m[[0m[0mstr[0m[0;34m][0m[0;34m,[0m [0mTuple[0m[0;34m[[0m[0mstr[0m[0;34m,[0m [0;34m...[0m[0;34m][0m[0;34m,[0m [0mNoneType[0m[0;34m][0m [0;34m=[

In [240]:
def create_index_brute(client,index:str, num_shards=3, dims=512):
    """Create (or recreate) an index whose vectors are stored but not ANN-indexed.

    Any existing index with the same name is deleted first, so calling this
    discards all previously indexed documents.

    Args:
        client: Elasticsearch client exposing ``indices.exists``,
            ``indices.delete`` and ``indices.create``.
        index: Name of the index to (re)create.
        num_shards: Value for the ``number_of_shards`` index setting.
        dims: Dimensionality of the ``embeddings`` dense vectors. Defaults to
            512, the width of the embeddings loaded in this notebook.
    """
    # Start from a clean slate: drop any index left over from a previous run.
    if client.indices.exists(index=index):
        client.indices.delete(index=index)

    mappings = {
        "properties": {
            # "index": False keeps the vectors out of the kNN index.
            "embeddings": {"type": "dense_vector", "dims": dims, "index": False},
            "description_final": {"type": "text"},
        }
    }

    client.indices.create(
        index=index,
        settings={"number_of_shards": num_shards},
        mappings=mappings,
    )
    
def create_index_knn(client,index:str, num_shards=3, dims=512, similarity="cosine"):
    """Create (or recreate) an index whose vectors are indexed for kNN search.

    Any existing index with the same name is deleted first, so calling this
    discards all previously indexed documents.

    Args:
        client: Elasticsearch client exposing ``indices.exists``,
            ``indices.delete`` and ``indices.create``.
        index: Name of the index to (re)create.
        num_shards: Value for the ``number_of_shards`` index setting.
        dims: Dimensionality of the ``embeddings`` dense vectors. Defaults to
            512, the width of the embeddings loaded in this notebook.
        similarity: Value for the ``similarity`` option of the ``embeddings``
            field mapping. Defaults to ``"cosine"``.
    """
    # Start from a clean slate: drop any index left over from a previous run.
    if client.indices.exists(index=index):
        client.indices.delete(index=index)

    mappings = {
        "properties": {
            # "index": True together with a similarity enables the `knn` search option.
            "embeddings": {
                "type": "dense_vector",
                "dims": dims,
                "index": True,
                "similarity": similarity,
            },
            "description_final": {"type": "text"},
        }
    }

    client.indices.create(
        index=index,
        settings={"number_of_shards": num_shards},
        mappings=mappings,
    )


def generate_docs(df:pd.DataFrame):
    """
    Lazily turn each row of `df` into a dictionary suitable for bulk indexing.

    Only the `photo_id`, `description_final`, `photo_image_url` and
    `embeddings` columns are kept. Every yielded dictionary additionally
    carries an `_id` key holding the row's `photo_id`.
    """
    wanted_columns = ['photo_id','description_final','photo_image_url' ,'embeddings']

    for _, record in df[wanted_columns].iterrows():
        payload = dict(record)

        # the photo id doubles as the document id
        payload['_id'] = payload["photo_id"]

        yield payload

In [241]:
create_index_brute(client, index= ELASTIC_INDEX, num_shards=1)

In [242]:
df_subset = dset.to_pandas()

In [243]:
df_subset.head()

Unnamed: 0,photo_id,photo_url,photo_image_url,photo_submitted_at,photo_featured,photo_width,photo_height,photo_aspect_ratio,photo_description,photographer_username,...,stats_downloads,ai_description,ai_primary_landmark_name,ai_primary_landmark_latitude,ai_primary_landmark_longitude,ai_primary_landmark_confidence,blur_hash,description_final,image,embeddings
0,XMyPniM9LF0,https://unsplash.com/photos/XMyPniM9LF0,https://images.unsplash.com/uploads/1411949294...,2014-09-29 00:08:38.594364,t,4272,2848,1.5,Woman exploring a forest,michellespencer77,...,6967,woman walking in the middle of forest,,,,,L56bVcRRIWMh.gVunlS4SMbsRRxr,Woman exploring a forest,"{'bytes': None, 'path': '../data/raw/images/XM...","[-0.23112005, 0.67399395, 0.16962554, 0.137191..."
1,rDLBArZUl1c,https://unsplash.com/photos/rDLBArZUl1c,https://images.unsplash.com/photo-141633941111...,2014-11-18 19:36:57.08945,t,3000,4000,0.75,Succulents in a terrarium,ugmonk,...,82141,succulent plants in clear glass terrarium,,,,,LvI$4txu%2s:_4t6WUj]xat7RPoe,Succulents in a terrarium,"{'bytes': None, 'path': '../data/raw/images/rD...","[-0.45541775, 0.42470074, -0.09068765, 0.14057..."
2,cNDGZ2sQ3Bo,https://unsplash.com/photos/cNDGZ2sQ3Bo,https://images.unsplash.com/photo-142014251503...,2015-01-01 20:02:02.097036,t,2564,1710,1.5,Rural winter mountainside,johnprice,...,3428,rocky mountain under gray sky at daytime,,,,,LhMj%NxvM{t7_4t7aeoM%2M{ozj[,Rural winter mountainside,"{'bytes': None, 'path': '../data/raw/images/cN...","[-0.25890213, 0.642249, 0.095262825, 0.1934405..."
3,iuZ_D1eoq9k,https://unsplash.com/photos/iuZ_D1eoq9k,https://images.unsplash.com/photo-141487280988...,2014-11-01 20:15:13.410073,t,2912,4368,0.67,Poppy seeds and flowers,krisatomic,...,33704,red common poppy flower selective focus phography,,,,,LSC7DirZAsX7}Br@GEWWmnoLWCnj,Poppy seeds and flowers,"{'bytes': None, 'path': '../data/raw/images/iu...","[0.024865545, 0.15010555, -0.21668568, -0.3788..."
4,BeD3vjQ8SI0,https://unsplash.com/photos/BeD3vjQ8SI0,https://images.unsplash.com/photo-141700759404...,2014-11-26 13:13:50.134383,t,4896,3264,1.5,Silhouette near dark trees,jonaseriksson,...,49662,trees during night time,,,,,L25|_:V@0hxtI=W;odae0ht6=^NG,Silhouette near dark trees,"{'bytes': None, 'path': '../data/raw/images/Be...","[-0.4102839, 0.0026709028, 0.07259746, 0.11741..."


In [244]:
number_of_docs = len(df_subset)

In [245]:
# Stream every document into the index; the bar advances once per action
# reported back by streaming_bulk.
with tqdm.auto.tqdm(total=number_of_docs , unit="docs" ) as pbar:
    successes = 0

    for ok, action in streaming_bulk(
            client=client, index=ELASTIC_INDEX, actions=generate_docs(df_subset) ,
        ):
        pbar.update(1)
        successes += ok

# Refresh so the documents just written are visible to the searches that follow,
# then report how many actions succeeded (previously counted but never shown).
client.indices.refresh(index=ELASTIC_INDEX)
print(f"Indexed {successes}/{number_of_docs} documents")

  0%|          | 0/24992 [00:00<?, ?docs/s]

In [246]:
rich.print (
    requests.get(f"{ELASTIC_FULL_URL}/{ELASTIC_INDEX}/_mapping").json()
    
)

In [250]:
def fetch_results(client:Elasticsearch, query:str,  num_hits=5, fields = ["description_final"], explain=False, use_brute_force=True, index=None):
    """
    Encode `query` with the notebook-level `model` and return the Elasticsearch
    response for a vector search over the `embeddings` field.

    Two search modes are supported:

    * `use_brute_force=True`: a `script_score` query over `match_all` that
      scores every document with `cosineSimilarity(...) + 1.0`.
    * `use_brute_force=False`: the `knn` search option on the `embeddings` field.

    Parameters
    ----------
    client : Elasticsearch client used to run the search.
    query : free-text query; it is printed and then embedded with `model.encode`.
    num_hits : number of hits to return (`size`); also used as the kNN `k`.
    fields : forwarded unchanged as the `fields` argument of `client.search`.
    explain : accepted for backward compatibility; not used by this function.
    use_brute_force : selects the search mode described above.
    index : index to search. Defaults to the notebook-level `ELASTIC_INDEX`.
        Without an explicit index the search would run against every index
        on the cluster.

    Returns
    -------
    The response object returned by `client.search`.
    """

    print (query)
    query_vector = model.encode(query).tolist()

    if index is None:
        index = ELASTIC_INDEX

    payload_knn = None
    payload_brute_force = None

    if not use_brute_force:
        payload_knn = {
                "field": "embeddings",
                "query_vector": query_vector,
                # k follows num_hits so asking for more than 10 hits is honoured
                "k": num_hits,
                # num_candidates must not be smaller than k
                "num_candidates": max(100, num_hits)
        }
    else:
        payload_brute_force = {
            "script_score": {
                      "query": { "match_all": {} },

                      "script": {
                        "source": "cosineSimilarity(params.query_vector, 'embeddings') + 1.0", 
                        "params": {
                          "query_vector": query_vector
                        },

                      }
                    }
        }

    resp = client.search(
        index=index
        , query= payload_brute_force
        , knn=payload_knn
        , fields=fields
        ,size = num_hits
    )

    return resp

def find_results(query:str , k =5, explain=False, use_brute_force=True):
    """
    Run `query` through `fetch_results`, show the returned photos, and print
    how long the call took.

    The measured time covers the whole `fetch_results` call, which includes
    encoding the query as well as the Elasticsearch round trip.

    Parameters
    ----------
    query : free-text query.
    k : number of photos to retrieve and display.
    explain : when True, return the raw response body wrapped in an IPython
        `JSON` display object; otherwise nothing is returned.
    use_brute_force : forwarded to `fetch_results` to pick the search mode.
    """
    from html import escape  # stdlib; imported locally so the cell stays self-contained

    time_start=time.time_ns()
    top_items = fetch_results(client,query=query,num_hits=k, use_brute_force=use_brute_force)
    time_end=time.time_ns()

    elapsed_time_ms = (time_end - time_start) // 1_000_000

    # Escape the query so characters such as "<" or "&" cannot break the heading markup.
    display(HTML(f"<h3>Query: {escape(query)} </h3>"))

    images = []
    labels = []

    # Collect the image URL and a caption for each of the top k hits
    for hit in top_items['hits']['hits']:
        photo_data = hit["_source"]

        images.append(photo_data["photo_image_url"])
        score = "{:.2f}".format(hit['_score'])

        # `_score` is a similarity score (higher means closer), not a distance.
        labels.append (f"""
                     Photo title: {photo_data["description_final"]}   <br/>
                     Score: {score}
            
                     """)

    if images:
        ipyplot.plot_images(images=images, labels=labels, img_width=200)
    else:
        print("No results found")

    # Report the timing before any early return so it is shown with explain=True too.
    print(f"Elapsed tims(ms): {elapsed_time_ms} ")

    if explain:
        return JSON (top_items.body , expanded=False)


In [251]:
find_results("boy playing in the beach", use_brute_force=True)

boy playing in the beach


Elapsed tims(ms): 56 


In [253]:
create_index_knn(client, index= ELASTIC_INDEX, num_shards=1)

In [254]:
# Stream every document into the recreated index; the bar advances once per
# action reported back by streaming_bulk.
with tqdm.auto.tqdm(total=number_of_docs , unit="docs" ) as pbar:
    successes = 0

    for ok, action in streaming_bulk(
            client=client, index=ELASTIC_INDEX, actions=generate_docs(df_subset) ,
        ):
        pbar.update(1)
        successes += ok

# Refresh so the documents just written are visible to the searches that follow,
# then report how many actions succeeded (previously counted but never shown).
client.indices.refresh(index=ELASTIC_INDEX)
print(f"Indexed {successes}/{number_of_docs} documents")

  0%|          | 0/24992 [00:00<?, ?docs/s]

In [255]:
rich.print (
    requests.get(f"{ELASTIC_FULL_URL}/{ELASTIC_INDEX}/_mapping").json()
    
)

In [257]:
find_results("boy playing in the beach", use_brute_force=False)

boy playing in the beach


Elapsed tims(ms): 36 


## Create the <a href ="https://github.com/facebookresearch/faiss">FAISS</a> index. 
<br/>FAISS will be used to create the vector search space using the embeddings corpus.
<br/>We are going to use FlatIndex to store the index, but <a href="https://github.com/facebookresearch/faiss/wiki/Faiss-indexes">other efficient indexes</a> are also available in FAISS.
<br/> FAISS supports both GPU- and CPU-based indexes. GPU index search is <a href="https://github.com/facebookresearch/faiss/wiki/Comparing-GPU-vs-CPU">comparatively faster</a> than CPU search, as long as the complete index fits in memory.

### Flat Index

In [None]:
# The corpus vectors are the database ("xb") that gets indexed.
xb = corpus
dimension = xb.shape[-1]

# Flat L2 index sized to the embedding width (512 in this example).
index = faiss.IndexFlatL2(dimension)


In [None]:
index.is_trained


In [None]:
# add data to the index. This is a CPU based index.
index.add(xb)                


In [None]:
xq = xb


In [None]:
len(xq)


number of vectors / results to retrieve

In [None]:
k =1

#### Index Search
The `search` method returns, for each query vector, the ids (I) of the `k` most similar corpus vectors and their distances (D) from the query vector. Note that `IndexFlatL2` reports squared Euclidean distances.

search for single vector

In [None]:
%%timeit
D, I = index.search(xq[:1], k)   

search for all vectors in corpus

In [None]:
%%time
D, I = index.search(xq, k)     

distance of vector in corpus to query vector

In [None]:
D

id of the top (nearest) vector for each query



In [None]:
I

Because the queries are the corpus itself and ids are assigned sequentially, each vector's nearest neighbor should ideally be itself, i.e. `I[i, 0] == i` for every row `i`.

In [None]:
# True wherever the top-1 hit for query i is vector i itself
z = I[:, 0] == np.arange(len(xq))
z

In [None]:
# recall@1 is the fraction of queries whose top hit is the query itself;
# the raw hit count is reported separately as num_matches.
{
 "recall@1":  z.mean()
 , "num_matches":  z.sum()
 , "num_vectors":  len(z)
 , "mismatch":    len(z) - z.sum()
}


In [None]:
# ids of the queries whose top-1 hit was the query itself
np.arange(len(xq))[z]

In [None]:
z

In [None]:
# positions of the queries whose top-1 hit was a different vector
np.where(~z)

In [None]:
D[3766]

In [None]:
I[3766]

### faiss ivf

<img src="https://d33wubrfki0l68.cloudfront.net/44acb1425f25e30ca058daec92bdb209c6c47ad2/e92fc/images/faiss5.png" width="500"/>

<p> Image from Pinecone Faiss Tutorial </p>
https://www.pinecone.io/learn/faiss-tutorial/


**Parameters**:
- nlist : number of clusters
- nprobe: number of clusters to search

In [None]:
nlist = 20 # number of clusters
# Flat L2 index handed to IndexIVFFlat as its quantizer
# (presumably used to assign vectors to the nlist clusters; confirm in the FAISS docs).
quantizer = faiss.IndexFlatL2(dimension)  # the other index
# IVF index over `dimension`-wide vectors with `nlist` clusters and the L2 metric.
index = faiss.IndexIVFFlat(quantizer, dimension, nlist, faiss.METRIC_L2)

In [None]:
assert not index.is_trained
index.train(xb)
assert index.is_trained

In [None]:
# the queries are the corpus vectors themselves
xq = xb
# add the corpus vectors to the trained IVFFlat index
index.add(xb)         


search for single vector

In [None]:
%%timeit


index.nprobe = 1              # default nprobe is 1

D, I = index.search(xq[:1], k)     # actual search

search for entire corpus

In [None]:
%%time


index.nprobe = 1              

D, I = index.search(xq, k)     # actual search

In [None]:
# True wherever the top-1 hit for query i is vector i itself
z = I[:, 0] == np.arange(len(xq))

# recall@1 is the fraction of queries whose top hit is the query itself;
# the raw hit count is reported separately as num_matches.
{
 "recall@1":  z.mean()
 , "num_matches":  z.sum()
 , "num_vectors":  len(z)
 , "mismatch":    len(z) - z.sum()
}


increase the number of cells that are probed

In [None]:
%%time
index.nprobe = 5              # probe 5 clusters per query instead of the default of 1

# search with every corpus vector as a query, keeping the top k hits for each
D, I = index.search(xq, k)    

In [None]:
# True wherever the top-1 hit for query i is vector i itself
z = I[:, 0] == np.arange(len(xq))

# recall@1 is the fraction of queries whose top hit is the query itself;
# the raw hit count is reported separately as num_matches.
{
 "recall@1":  z.mean()
 , "num_matches":  z.sum()
 , "num_vectors":  len(z)
 , "mismatch":    len(z) - z.sum()
}
