# Semantic Search

In [21]:
from sentence_transformers import SentenceTransformer
sentences = ["This is an example sentence", "Each sentence is converted"]

model_p = SentenceTransformer('sentence-transformers/paraphrase-MiniLM-L6-v2')
embeddings = model.encode(sentences)
print(embeddings)

[[ 6.76568672e-02  6.34959191e-02  4.87131290e-02  7.93049410e-02
   3.74480300e-02  2.65280716e-03  3.93749066e-02 -7.09849177e-03
   5.93614280e-02  3.15370336e-02  6.00980483e-02 -5.29051758e-02
   4.06067334e-02 -2.59308424e-02  2.98428312e-02  1.12686504e-03
   7.35149309e-02 -5.03819361e-02 -1.22386619e-01  2.37028319e-02
   2.97265071e-02  4.24768291e-02  2.56337821e-02  1.99520611e-03
  -5.69190979e-02 -2.71598138e-02 -3.29035372e-02  6.60248473e-02
   1.19007125e-01 -4.58791070e-02 -7.26214647e-02 -3.25839594e-02
   5.23413345e-02  4.50552590e-02  8.25298857e-03  3.67023908e-02
  -1.39415460e-02  6.53918535e-02 -2.64272355e-02  2.06386510e-04
  -1.36643331e-02 -3.62810269e-02 -1.95043962e-02 -2.89738290e-02
   3.94270495e-02 -8.84091035e-02  2.62431940e-03  1.36713348e-02
   4.83063124e-02 -3.11565641e-02 -1.17329188e-01 -5.11690043e-02
  -8.85287896e-02 -2.18962729e-02  1.42986253e-02  4.44167741e-02
  -1.34815080e-02  7.43392631e-02  2.66382564e-02 -1.98762212e-02
   1.79190

In [22]:
query_embedding = model.encode("That is a happy person")
passage_embedding = model.encode([
            "That is a happy dog",
            "That is a very happy person",
            "Today is a sunny day"
        ])


### Similarity functions

In [23]:
# Dot score
from sentence_transformers import  util
print("Similarity:", util.dot_score(query_embedding, passage_embedding))

Similarity: tensor([[0.6946, 0.9429, 0.2569]])


In [24]:
# Cosine score
print("Similarity:", util.pytorch_cos_sim(query_embedding, passage_embedding))

Similarity: tensor([[0.6946, 0.9429, 0.2569]])


## Semantic Textual Similarity

In [25]:
from sentence_transformers import SentenceTransformer, util
model_sts = SentenceTransformer('all-MiniLM-L6-v2')


In [26]:
# Two lists of sentences
sentences1 = ['The cat sits outside',
             'A man is playing guitar',
             'The new movie is awesome']

sentences2 = ['The dog plays in the garden',
              'A woman watches TV',
              'The new movie is so great']

#Compute embedding for both lists
embeddings1 = model_sts.encode(sentences1, convert_to_tensor=True)
embeddings2 = model_sts.encode(sentences2, convert_to_tensor=True)

#Compute cosine-similarits
cosine_scores = util.pytorch_cos_sim(embeddings1, embeddings2)


In [27]:
#Output the pairs with their score
for i in range(len(sentences1)):
    print("{} \t\t {} \t\t Score: {:.4f}".format(sentences1[i], sentences2[i], cosine_scores[i][i]))


The cat sits outside 		 The dog plays in the garden 		 Score: 0.2838
A man is playing guitar 		 A woman watches TV 		 Score: -0.0327
The new movie is awesome 		 The new movie is so great 		 Score: 0.8939


In [28]:
cosine_scores

tensor([[ 0.2838,  0.1310, -0.0029],
        [ 0.2277, -0.0327, -0.0136],
        [ 0.0543, -0.0502,  0.8939]], device='cuda:0')