In [1]:
from sentence_transformers import SentenceTransformer, util
import torch
import pickle
import faiss
import numpy as np
import time
import tracemalloc
import pandas as pd
from Levenshtein import distance as lev
from Levenshtein import jaro_winkler
from scipy.spatial.distance import jaccard, euclidean, correlation
from scipy.stats import pearsonr, spearmanr

In [2]:
embedder = SentenceTransformer('all-mpnet-base-v2')
with open('D:\\my work\\python\\similarity\\embedding\\embed_file_all.pkl','rb') as fIn:
    cache_data = pickle.load(fIn)
    corpus_sentences = cache_data['sentences']
    corpus_embeddings = cache_data['embeddings']

In [11]:
# all-mpnet-base-v2
# Query sentences:
# queries = ['ابراهيم على']
# ['شيخ محمد علي']
# ['بشار حسين']
queries = ['شيخ محمد علي']

### normal search (slow)

In [None]:

start_time = time.time()

# Find the closest 5 sentences of the corpus for each query sentence based on cosine similarity
top_k = min(60, len(corpus_sentences))
results = dict()
# results_list = list()
for query in queries:
    query_embedding = embedder.encode(query, convert_to_tensor=True)
    # We use cosine-similarity and torch.topk to find the highest 5 scores
    cos_scores = util.cos_sim(query_embedding, corpus_embeddings)[0]
    top_results = torch.topk(cos_scores, k=top_k)
    i = 0
for score, idx in zip(top_results[0], top_results[1]):
    i = i+1
    # print(corpus_sentences[idx], "(Score: {:.4f})".format(score))
    results[i] = (corpus_sentences[idx],score.item()) 
    '''
    results_list.append({'id': i, 'name':corpus_sentences[idx] ,'score':score.item(),
                          'lev_distance' : lev(query, corpus_sentences[idx])/9, 
                          'pearsonr':  pearsonr(query_embedding, corpus_embeddings[idx])[0],
                          'euclidean': 1-euclidean(query_embedding, corpus_embeddings[idx]) 
                        })
results_df = pd.DataFrame(results_list)
'''
#util.community_detection
#util.
end_time = time.time() 
frst_method = end_time-start_time
print(frst_method)  

In [None]:
results

In [None]:
'''
community = util.community_detection(corpus_embeddings, 
                                     threshold = .96,
                                     min_community_size=3, 
                                     batch_size=128)
'''                                     

In [7]:
def name_intersection(name1, name2):
    intersection = len(set(name1.split(' ')) & set(name2.split(' ')))
    union = len(list(set(name1.split(' ')) | set(name1.split(' '))))
    # print(intersection , (len(name1.split(' '))+ len(name2.split(' '))))
    return intersection/(len(name1.split(' '))+ len(name2.split(' ')))

### Index Search faster

In [4]:
corpus_embedding_size  = corpus_embeddings.size()[1]
# why i chose this method is because of dot product #### important
quantizer = faiss.IndexFlatIP( corpus_embedding_size)

#n_clusters a value 4*sqrt(N) to 16*sqrt(N)
index = faiss.IndexIVFFlat(quantizer, corpus_embedding_size, 27, faiss.METRIC_INNER_PRODUCT)
# The nprobe parameter is always a way of adjusting the tradeoff between speed and accuracy of the result. 
# Setting nprobe = nlist gives the same result as the brute-force search (but slower).
# we can compare results on design time between searches from index VS direct search
# on multiple samples to determine the best value
index.nprobe = 3
# normalize embeddings legnth
corpus_embeddings_norm = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1)[:, None]

In [5]:
## this probably takes time 
# release some memory
del corpus_embeddings
# Train index
index.train(corpus_embeddings_norm)
# add all embeddings to the index
index.add(corpus_embeddings_norm)


### start search

In [12]:
top_k_hits = 60
start_time = time.time()
for query in queries:
    query_embedding_ind = embedder.encode(query, convert_to_tensor=True)
    query_embedding_norm =  query_embedding_ind / np.linalg.norm(query_embedding_ind)
    query_embedding_norm = np.expand_dims(query_embedding_norm, axis=0)
    distances, corpus_ids = index.search(query_embedding_norm, top_k_hits)
hits = [{'corpus_id': id, 'score': score} for id, score in zip(corpus_ids[0], distances[0])]
#hits = sorted(hits, key=lambda x: x['score'], reverse=True)
results = dict()
dt = []
for i in range(0, top_k_hits):
    i = i+1
    results[i] = (corpus_sentences[corpus_ids[0][i-1]], distances[0][i-1])
    row = {'id': i, 'name':corpus_sentences[corpus_ids[0][i-1]] ,'cos_sim':distances[0][i-1],
                          'lev_distance' : lev(query, corpus_sentences[corpus_ids[0][i-1]], weights=(1,5,1)),
           'name_intersection':name_intersection(query, corpus_sentences[corpus_ids[0][i-1]])
                        }
    # print(query, '---',   corpus_sentences[corpus_ids[0][i-1]], name_intersection(query, corpus_sentences[corpus_ids[0][i-1]]))
    dt.append(row)
results_df = pd.DataFrame(dt)
end_time = time.time()
scnd_method = end_time-start_time
print(end_time-start_time)    

0.20945954322814941


In [13]:
## results

results_df['lev_dist_norm'] = np.abs((results_df['lev_distance'] - results_df['lev_distance'].max()) / (results_df['lev_distance'].max() - results_df['lev_distance'].min()))
results_df['wavg'] = (results_df['lev_dist_norm']*.1 + results_df['cos_sim']*.1+ results_df['name_intersection']*.8 )/3
results_df['score2'] = np.abs(results_df['wavg'] - .2 / .2 - 0)

results_df = results_df.sort_values(['score2'], ascending=True)
results_df = results_df.head(15)

In [14]:
results_df

Unnamed: 0,id,name,cos_sim,lev_distance,name_intersection,lev_dist_norm,wavg,score2
0,1,شيخ محمد علي,1.0,0,0.5,1.0,0.2,0.8
1,2,شيخ محمد علي,1.0,0,0.5,1.0,0.2,0.8
2,3,شيخ محمد علي,1.0,0,0.5,1.0,0.2,0.8
3,4,شيخ محمد علي,1.0,0,0.5,1.0,0.2,0.8
4,5,شيخ علي محمد,0.993637,8,0.5,0.111111,0.170158,0.829842
5,6,شيخ علي محمد,0.993637,8,0.5,0.111111,0.170158,0.829842
6,7,شيخ علي محمد,0.993637,8,0.5,0.111111,0.170158,0.829842
21,22,شيخ علي محمد علي,0.983719,4,0.428571,0.555556,0.165595,0.834405
27,28,شيخ محمد علي احمد,0.981863,5,0.428571,0.444444,0.161829,0.838171
10,11,شيخ احمد علي,0.987614,1,0.333333,0.888889,0.151439,0.848561
