In [1]:
!pip install sentence-transformers

Collecting sentence-transformers
  Downloading sentence-transformers-2.2.0.tar.gz (79 kB)
[K     |████████████████████████████████| 79 kB 3.7 MB/s 
[?25hCollecting transformers<5.0.0,>=4.6.0
  Downloading transformers-4.17.0-py3-none-any.whl (3.8 MB)
[K     |████████████████████████████████| 3.8 MB 29.3 MB/s 
Collecting sentencepiece
  Downloading sentencepiece-0.1.96-cp37-cp37m-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.2 MB)
[K     |████████████████████████████████| 1.2 MB 39.5 MB/s 
[?25hCollecting huggingface-hub
  Downloading huggingface_hub-0.4.0-py3-none-any.whl (67 kB)
[K     |████████████████████████████████| 67 kB 4.8 MB/s 
Collecting tokenizers!=0.11.3,>=0.11.1
  Downloading tokenizers-0.11.6-cp37-cp37m-manylinux_2_12_x86_64.manylinux2010_x86_64.whl (6.5 MB)
[K     |████████████████████████████████| 6.5 MB 31.5 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.47-py2.py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 39.6 MB/s 


In [2]:
import os
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the pretrained sentence-embedding model by its hub identifier; the first
# call downloads the model files (see the progress bars in the recorded output).
model = SentenceTransformer("sentence-transformers/distilbert-base-nli-stsb-mean-tokens")

Downloading:   0%|          | 0.00/345 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/4.01k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/555 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/229 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/265M [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/505 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [6]:
# Read the sentence datastore from the Excel workbook and preview the first rows.
# NOTE(review): "/content/..." is an absolute, Colab-style path — the workbook
# has to exist there before this cell runs; consider a configurable data directory.
df = pd.read_excel("/content/Marvel Datastore.xlsx")
df.head()

Unnamed: 0,Sentences
0,"Thor Odinson is the Asgardian God of Thunder, ..."
1,"Upon being welcomed back to Asgard as a hero, ..."
2,Thor returned to Asgard having defeated his br...
3,Loki Laufeyson was the biological son of Laufe...
4,"Transported by the wormhole to Sanctuary, Loki..."


### similarity function

In [107]:
def find_similarity(sentnece,allsentence,k=1):
  """Return the indices of the rows of ``allsentence`` most similar to the query.

  Similarity is the cosine similarity between the first row of ``sentnece``
  and every row of ``allsentence``.

  Args:
    sentnece: 2-D array-like holding the query embedding in its first row
      (only row 0 is used). The misspelt name is kept so existing keyword
      callers keep working.
    allsentence: 2-D array-like with one candidate embedding per row.
    k: Number of matches to return. ``None`` ranks every candidate, ``0``
      yields an empty result, and a value larger than the number of
      candidates returns all of them.

  Returns:
    For ``k == 1`` a one-element list holding the best index; otherwise a
    NumPy integer array of indices ordered from most to least similar.

  Raises:
    ValueError: If ``k`` is negative.
  """
  similarity = cosine_similarity(sentnece,allsentence)[0]
  if k == 1:
    # Kept as a list so callers that relied on this return type are unaffected.
    return [np.argmax(similarity)]
  # Full ranking, best match first.
  ranked = np.flip(similarity.argsort())
  if k is None:
    return ranked
  if k < 0:
    raise ValueError("k must be a non-negative integer or None, got {!r}".format(k))
  # Slicing from the front avoids the `[-0:]` pitfall, where k == 0 used to
  # select every index instead of none.
  return ranked[:k]

### embedding

In [108]:
# Take the first column of the datastore by position and embed every entry.
# .values passes the model the underlying array rather than the pandas Series.
paragraph = df.iloc[:,0]
embedding_distillbert = model.encode(paragraph.values)

In [109]:
# Number of embedding vectors returned by model.encode.
len(embedding_distillbert)

14

In [110]:
# Shape of a single embedding vector.
embedding_distillbert[0].shape

(768,)

### find similarities between vectors

In [111]:
# Embed the query. It is wrapped in a list so model.encode returns a batch
# whose first (and only) element is the query vector.
search_string = "thor's weapon"
search_vector = model.encode([search_string])

In [112]:
# Shape of the query embedding (first item of the encoded batch).
search_vector[0].shape

(768,)

In [113]:
# Retrieve the positions of the k stored embeddings closest to the query.
k=3
distilbert_similar_indexes = find_similarity(search_vector,embedding_distillbert,k)

In [114]:
# Positions returned by find_similarity, ordered from most to least similar.
distilbert_similar_indexes

array([7, 6, 1])

In [115]:
# Map the ranked positions back to their sentences. The embeddings were built
# from paragraph.values, so the indices are positional: .iloc keeps the lookup
# correct even if the Series index is not the default 0..n-1 labels.
output_data = [paragraph.iloc[index] for index in distilbert_similar_indexes]


In [117]:
# Sentence at the top-ranked position for the query.
output_data[0]

'Stormbreaker is an enchanted axe used by Thor. It was forged from Uru on Nidavellir, and can summon the Bifrost.'

In [4]:
# Two parallel example lists: entry i of the first list is later compared
# with entry i of the second.
sentences1 = [
    'The cat sits outside',
    'A man is playing guitar',
    'The new movie is awesome',
]
sentences2 = [
    'The dog plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
]

In [5]:
# Encode both example lists the same way, as tensors, in their original order.
embeddings1, embeddings2 = (
    model.encode(batch, convert_to_tensor=True)
    for batch in (sentences1, sentences2)
)

In [6]:
from sentence_transformers import SentenceTransformer,util

In [7]:
# Compute the pairwise cosine-similarity matrix: entry [i][j] compares
# sentences1[i] with sentences2[j].
cosine_score = util.cos_sim(embeddings1,embeddings2)

In [8]:
# Display the similarity matrix (rows: sentences1, columns: sentences2).
cosine_score 

tensor([[ 0.2166,  0.1837, -0.0393],
        [-0.2488, -0.0146,  0.0590],
        [-0.1100,  0.0797,  0.9816]], device='cuda:0')

In [10]:
# Same computation with the arguments swapped; the recorded output is the
# transpose of cosine_score.
cosine_score2 = util.cos_sim(embeddings2,embeddings1)

In [11]:
# Display the swapped-argument matrix (rows: sentences2, columns: sentences1).
cosine_score2

tensor([[ 0.2166, -0.2488, -0.1100],
        [ 0.1837, -0.0146,  0.0797],
        [-0.0393,  0.0590,  0.9816]], device='cuda:0')

In [17]:
# Report each aligned pair (same position in both lists) with its score,
# i.e. the diagonal of the similarity matrix.
for i, first in enumerate(sentences1):
  second = sentences2[i]
  print("{} \t\t {} score: {}".format(first, second, cosine_score[i][i]))

The cat sits outside 		 The dog plays in the garden score: 0.21662883460521698
A man is playing guitar 		 A woman watches TV score: -0.014647542499005795
The new movie is awesome 		 The new movie is so great score: 0.9816268086433411


In [27]:
# One combined list of sentences, compared against itself further down.
sentences = [
    'The cat sits outside',
    'A man is playing guitar',
    'I love pasta',
    'The new movie is awesome',
    'The cat plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
    'Do you like pizza?',
]

In [28]:
# Embed all sentences in one batch, returned as a tensor.
embedding = model.encode(sentences, convert_to_tensor=True)

In [23]:
# Similarity of every sentence with every sentence, itself included.
# NOTE(review): this rebinds cosine_score, replacing the earlier
# sentences1-vs-sentences2 matrix, and its recorded execution count (23)
# predates the cell that builds `embedding` (28) — re-run top to bottom to
# make sure the matrix matches the current embeddings.
cosine_score = util.cos_sim(embedding,embedding)

In [29]:
# Display the self-similarity matrix; the diagonal is each sentence against itself.
cosine_score

tensor([[ 1.0000, -0.0530,  0.0681, -0.0270,  0.6247,  0.1837, -0.0393,  0.0523],
        [-0.0530,  1.0000, -0.1532,  0.0652, -0.2105, -0.0146,  0.0590,  0.1080],
        [ 0.0681, -0.1532,  1.0000,  0.2526,  0.2455,  0.0563,  0.2605,  0.1997],
        [-0.0270,  0.0652,  0.2526,  1.0000, -0.0458,  0.0797,  0.9816,  0.0829],
        [ 0.6247, -0.2105,  0.2455, -0.0458,  1.0000,  0.1760, -0.0591, -0.0733],
        [ 0.1837, -0.0146,  0.0563,  0.0797,  0.1760,  1.0000,  0.1008, -0.0636],
        [-0.0393,  0.0590,  0.2605,  0.9816, -0.0591,  0.1008,  1.0000,  0.0516],
        [ 0.0523,  0.1080,  0.1997,  0.0829, -0.0733, -0.0636,  0.0516,  1.0000]],
       device='cuda:0')

In [41]:
# Find the pairs with the highest cosine similarity scores: start by
# collecting every unordered pair (i < j) together with its score, which
# skips the diagonal and the mirrored lower triangle.
pairs = [
    {"index": [i, j], "score": cosine_score[i][j]}
    for i in range(len(cosine_score) - 1)
    for j in range(i + 1, len(cosine_score))
]