# SentenceTransformers SBERT

https://www.sbert.net/index.html


## Getting started

In [1]:
from sentence_transformers import SentenceTransformer
from sentence_transformers import SentenceTransformer, util



In [2]:
# Instantiate the SentenceTransformer identified by this model name; the
# resulting `model` is what the following cells call encode() on.
model = SentenceTransformer("distilbert-base-nli-stsb-mean-tokens")

In [4]:
# Example inputs: a plain Python list holding one string per sentence.
sentences = [
    "This framework generates embeddings for each input sentence",
    "Sentences are passed as a list of string.",
    "The quick brown fox jumps over the lazy dog.",
]

# model.encode() takes the whole list at once. The bare name on the last
# line makes the notebook display the returned embeddings.
sentence_embeddings = model.encode(sentences)
sentence_embeddings

array([[-0.21486197,  0.39572313,  0.46908724, ..., -0.23118995,
        -0.49579158,  0.42366365],
       [-0.44001684, -0.28488484,  0.23363829, ...,  0.11956105,
        -0.16530281, -0.08625162],
       [-0.29504797, -0.24928916, -0.02407106, ...,  0.11944598,
         0.00626621,  1.0400685 ]], dtype=float32)

In [4]:
# Inspect the dimensions of the embeddings array computed above.
sentence_embeddings.shape

(3, 768)

In [5]:
# Embed two individual sentences; each encode() call receives a single string.
emb1, emb2 = (
    model.encode(text)
    for text in (
        "This is a red cat with a hat.",
        "Have you seen my red cat?",
    )
)

# Compare the two embeddings with the cosine-similarity helper from `util`
# and print the resulting tensor.
cos_sim = util.pytorch_cos_sim(emb1, emb2)
print("Cosine-Similarity:", cos_sim)

Cosine-Similarity: tensor([[0.6686]])


## Paraphrase mining

In [9]:

# Single list of sentences to mine for paraphrases - possibly tens of
# thousands of sentences in a real corpus.
#
# The list has its own name on purpose: a later cell calls
# modelML.encode(sentences), and its recorded output has three rows, i.e. it
# was produced from the three-sentence `sentences` list defined earlier.
# Rebinding `sentences` here would make that cell encode this corpus instead
# when the notebook is run top to bottom.
paraphrase_sentences = [
    'The cat sits outside',
    'A man is playing guitar',
    'I love pasta',
    'The new movie is awesome',
    'The cat plays in the garden',
    'A woman watches TV',
    'The new movie is so great',
    'Do you like pizza?',
]

# Each result is unpacked below as (score, i, j), where i and j are used as
# indices into the input list. The recorded output lists the pairs from the
# highest score to the lowest.
paraphrases = util.paraphrase_mining(model, paraphrase_sentences)

# Print at most the first ten pairs together with their score.
for score, i, j in paraphrases[:10]:
    print("{} \t\t {} \t\t Score: {:.4f}".format(
        paraphrase_sentences[i], paraphrase_sentences[j], score))

The new movie is awesome 		 The new movie is so great 		 Score: 0.9816
The cat sits outside 		 The cat plays in the garden 		 Score: 0.6247
The new movie is so great 		 I love pasta 		 Score: 0.2605
I love pasta 		 The new movie is awesome 		 Score: 0.2526
I love pasta 		 The cat plays in the garden 		 Score: 0.2455
I love pasta 		 Do you like pizza? 		 Score: 0.1997
The cat sits outside 		 A woman watches TV 		 Score: 0.1837
The cat plays in the garden 		 A woman watches TV 		 Score: 0.1760
A man is playing guitar 		 Do you like pizza? 		 Score: 0.1080
A woman watches TV 		 The new movie is so great 		 Score: 0.1008


## Multilingual models

In [6]:
# Second SentenceTransformer, used for the multi-language example. In the
# recorded run, creating it triggered a download of about 1 GB (see the
# progress bar in the output).
modelML = SentenceTransformer("xlm-r-100langs-bert-base-nli-mean-tokens")

100%|██████████| 1.01G/1.01G [00:53<00:00, 19.0MB/s]


In [7]:
# Encode the current `sentences` list with the second model (modelML).
# NOTE(review): the recorded output of `tmp` has three rows, so it was produced
# from a three-element `sentences` list - confirm that `sentences` holds the
# intended list when this cell runs.
tmp = modelML.encode(sentences)

In [8]:
# Display the embeddings returned by modelML.encode().
tmp

array([[ 0.11688558,  0.45716816,  1.312417  , ..., -0.48559204,
        -0.7056822 ,  0.43961552],
       [ 0.02169984, -0.06871273,  0.9814437 , ...,  0.05387843,
        -0.21435006,  0.6513395 ],
       [-0.5385539 ,  0.62562305, -0.70758367, ...,  0.19784595,
         0.9747098 ,  0.13501744]], dtype=float32)