## Semantic Search

### Imports

In [18]:
from sentence_transformers import SentenceTransformer, util
import pickle
import pandas as pd 
import torch
import time
import faiss
import numpy as np

### Embedding using MPNet v2 (`all-mpnet-base-v2`)

In [3]:
# Load the Kaggle articles dataset; nrows=None reads every row.
df = pd.read_csv('../data/train.csv', sep=',', nrows=None)
df.head()

Unnamed: 0,ID,TITLE,ABSTRACT,Computer Science,Physics,Mathematics,Statistics,Quantitative Biology,Quantitative Finance
0,1,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...,1,0,0,0,0,0
1,2,Rotation Invariance Neural Network,Rotation invariance and translation invarian...,1,0,0,0,0,0
2,3,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...,0,0,1,0,0,0
3,4,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...,0,0,1,0,0,0
4,5,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...,1,0,0,1,0,0


In [4]:
# Only abstract search is of interest here, so the classification columns are
# dropped and just the title and abstract are kept.
df = df.loc[:, ['TITLE', 'ABSTRACT']]
df.head()

Unnamed: 0,TITLE,ABSTRACT
0,Reconstructing Subject-Specific Effect Maps,Predictive models allow subject-specific inf...
1,Rotation Invariance Neural Network,Rotation invariance and translation invarian...
2,Spherical polyharmonics and Poisson kernels fo...,We introduce and develop the notion of spher...
3,A finite element approximation for the stochas...,The stochastic Landau--Lifshitz--Gilbert (LL...
4,Comparative study of Discrete Wavelet Transfor...,Fourier-transform infra-red (FTIR) spectra o...


In [5]:
# Show (rows, columns) of the frame; each row is one article.
df.shape
# 20972 articles

(20972, 2)

In [6]:
# Load the pretrained sentence-embedding model and encode every abstract.
# The result is kept as a tensor (convert_to_tensor=True) for the similarity
# search further down.
embedder = SentenceTransformer('all-mpnet-base-v2')
corpus = df['ABSTRACT'].tolist()
corpus_embeddings = embedder.encode(
    corpus,
    batch_size=128,
    convert_to_tensor=True,
    show_progress_bar=True,
)

Downloading (…)a8e1d/.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

Downloading (…)0bca8e1d/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

Downloading (…)e1d/data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

Downloading (…)a8e1d/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

Downloading (…)8e1d/train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

Downloading (…)b20bca8e1d/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)bca8e1d/modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

Batches:   0%|          | 0/164 [00:00<?, ?it/s]

In [7]:
# Persist the abstracts together with their embeddings in a single pickle file.
embedding_cache = {'sentences': corpus, 'embeddings': corpus_embeddings}
with open('../embbeding/embed_articles.pkl', 'wb') as cache_file:
    pickle.dump(embedding_cache, cache_file)

## Load the embedding file

In [11]:
# Re-create the encoder (needed to embed queries) and restore the cached corpus.
embedder = SentenceTransformer('all-mpnet-base-v2')

# NOTE: pickle can execute arbitrary code on load -- only open files that were
# produced by this notebook.
with open('../embbeding/embed_articles.pkl', 'rb') as fIn:
    cache_data = pickle.load(fIn)

corpus_sentences = cache_data['sentences']
corpus_embeddings = cache_data['embeddings']

### Try a normal (brute-force) search

In [12]:
queries = ['Spherical polyharmonics']
start_time = time.perf_counter()  # monotonic clock: safer than time.time() for measuring durations

# Find the closest 60 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(60, len(corpus_sentences))

# Ranked hits per query: {query: {rank: (sentence, cosine score)}}.
# The ranking is built inside the query loop so every query gets its own
# result set instead of only the last one being ranked.
results_by_query = {}
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    # We use cosine-similarity and torch.topk to find the highest scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    # .tolist() yields plain Python floats/ints, so the list lookup does not
    # depend on indexing with 0-dim tensors.
    top_scores = top_results[0].tolist()
    top_indices = top_results[1].tolist()
    results_by_query[query] = {
        rank: (corpus_sentences[corpus_idx], score)
        for rank, (score, corpus_idx) in enumerate(zip(top_scores, top_indices), start=1)
    }

# `results` keeps its original shape, {rank: (sentence, score)}, and holds the
# ranking of the last query (empty when there are no queries).
results = results_by_query[queries[-1]] if queries else dict()

end_time = time.perf_counter()
frst_method = end_time - start_time
print(frst_method)

0.26569509506225586


In [13]:
# Display the ranked hits as {rank: (abstract text, cosine score)}.
results

{1: ('  We introduce and develop the notion of spherical polyharmonics, which are a\nnatural generalisation of spherical harmonics. In particular we study the\ntheory of zonal polyharmonics, which allows us, analogously to zonal harmonics,\nto construct Poisson kernels for polyharmonic functions on the union of rotated\nballs. We find the representation of Poisson kernels and zonal polyharmonics in\nterms of the Gegenbauer polynomials. We show the connection between the\nclassical Poisson kernel for harmonic functions on the ball, Poisson kernels\nfor polyharmonic functions on the union of rotated balls, and the Cauchy-Hua\nkernel for holomorphic functions on the Lie ball.\n',
  0.7102088332176208),
 2: ('  We present a family of mutually orthogonal polynomials on the unit ball with\nrespect to an inner product which includes a mass uniformly distributed on the\nsphere. First, connection formulas relating these multivariate orthogonal\npolynomials and the classical ball polynomials are

#### The search returned 60 results in 0.26 seconds, which is fast, and the first (highest-scoring) article is the most relevant one.

## Indexed search with FAISS

In [34]:
# Dimensionality of one embedding vector (second axis of the corpus matrix).
corpus_embedding_size = corpus_embeddings.shape[1]

# Number of IVF clusters (nlist). The FAISS guideline is roughly
# 4*sqrt(N) to 16*sqrt(N) clusters for N vectors.
# NOTE(review): 27 is far below that range for ~21k vectors -- confirm this
# value is intentional before reusing it on another corpus.
N_CLUSTERS = 27

# nprobe is the number of clusters visited per query; it trades speed against
# accuracy. Setting nprobe = nlist gives the same result as the brute-force
# search (but slower). The best value can be found at design time by comparing
# index results against direct search on multiple sample queries.
N_PROBE = 3

# Inner product is used as the metric because, on unit-length vectors, the
# dot product equals the cosine similarity used by the direct search.
quantizer = faiss.IndexFlatIP(corpus_embedding_size)
index = faiss.IndexIVFFlat(quantizer, corpus_embedding_size, N_CLUSTERS, faiss.METRIC_INNER_PRODUCT)
index.nprobe = N_PROBE

# FAISS works on contiguous float32 NumPy arrays, so convert the tensor
# explicitly (moving it off the GPU if needed) instead of relying on implicit
# tensor -> ndarray coercion, which fails for CUDA tensors.
corpus_embeddings_np = np.ascontiguousarray(
    corpus_embeddings.detach().cpu().numpy(), dtype=np.float32
)

# Normalize every embedding to unit length.
corpus_embeddings_norm = corpus_embeddings_np / np.linalg.norm(
    corpus_embeddings_np, axis=1, keepdims=True
)

# Train the index (learns the cluster centroids), then add all embeddings to it.
index.train(corpus_embeddings_norm)
index.add(corpus_embeddings_norm)

In [40]:
top_k_hits = 60
start_time = time.perf_counter()  # monotonic clock: safer than time.time() for measuring durations

# Hits of the most recent query; stays empty when `queries` is empty.
hits = []
for query in queries:
    query_embedding_ind = embedder.encode(query, convert_to_tensor=True)
    # FAISS needs a float32 NumPy array of shape (n_queries, dim). Convert
    # explicitly so this also works when the encoder returns a CUDA tensor.
    query_vector = query_embedding_ind.detach().cpu().numpy().astype(np.float32)
    query_embedding_norm = (query_vector / np.linalg.norm(query_vector))[np.newaxis, :]
    distances, corpus_ids = index.search(query_embedding_norm, top_k_hits)
    # FAISS pads the id array with -1 when the probed clusters hold fewer than
    # top_k_hits vectors; skip those, otherwise -1 would silently select the
    # last corpus sentence.
    hits = [
        {'corpus_id': corpus_id, 'score': score}
        for corpus_id, score in zip(corpus_ids[0], distances[0])
        if corpus_id != -1
    ]

# Build the rank -> (sentence, score) mapping and the tabular view from the
# hits of the last query.
results = dict()
dt = []
for rank, hit in enumerate(hits, start=1):
    sentence = corpus_sentences[hit['corpus_id']]
    results[rank] = (sentence, hit['score'])
    dt.append({'id': rank, 'name': sentence, 'cos_sim': hit['score']})
# Explicit columns keep the frame's schema stable even when there are no hits.
results_df = pd.DataFrame(dt, columns=['id', 'name', 'cos_sim'])

end_time = time.perf_counter()
scnd_method = end_time - start_time
print(scnd_method)

0.04423093795776367


In [41]:
# Preview the top-ranked index hits (columns: id, name, cos_sim).
results_df.head()

Unnamed: 0,id,name,cos_sim
0,1,We introduce and develop the notion of spher...,0.710209
1,2,We present a family of mutually orthogonal p...,0.514587
2,3,In this paper we analyze the capacitary pote...,0.496128
3,4,"In this article, a novel analytical approach...",0.48357
4,5,Stationary stellar systems with radially elo...,0.475785


In [43]:
# (rows, columns): one row per returned hit, three columns.
results_df.shape

(60, 3)

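#### The index returned 60 results within 0.045 seconds, roughly six times faster than the brute-force search, and the top-ranked articles and scores match those of the slower search. Note that an IVF index searched with `nprobe` < `nlist` is approximate, so identical results are not guaranteed for every query.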