#### This notebook covers:
- Sentence Embedding
- Sentence Similarity
- Semantic Search
- Clustering

In [None]:
# Install the sentence-transformers package, or upgrade it if already present (-U).
%pip install -U sentence-transformers

# sentence-transformers (SBERT) is a Python library, built on PyTorch, for computing sentence embeddings (sentence vectors).

**Generate Embeddings**

In [None]:
# SentenceTransformer loads the embedding model; util supplies the similarity and search helpers used in later cells.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('all-MiniLM-L6-v2')
# all-MiniLM-L6-v2 is a compact sentence-embedding model for English text, built on a 6-layer MiniLM (a smaller variant than the 12-layer one).
# NOTE(review): an earlier version of this note said the model was trained on AllNLI and the STS benchmark and is the best performer on STSb.
# The published model card instead describes fine-tuning on a very large (1B+) collection of sentence pairs and 384-dimensional output vectors — confirm against the model card before relying on either description.

In [None]:
# Sample sentences to embed.
sentences = [
    'This framework generates embeddings for each input sentence',
    'Sentences are passed as a list of string.',
    'The quick brown fox jumps over the lazy dog.',
]

# Encode the whole list in a single call; convert_to_tensor=True requests
# tensor output instead of the encoder's default return type.
embeddings = model.encode(sentences, convert_to_tensor=True)

# Show every sentence next to its embedding, followed by one blank line.
for sentence, embedding in zip(sentences, embeddings):
    print("Sentence:", sentence)
    print("Embedding:", embedding, end="\n\n")

In [None]:
# Two short passages on related topics (AI and NLP). They are bound to
# text1/text2 so the same strings can be inspected or reused later, instead
# of leaving those names as empty placeholders and inlining the text in the
# encode() calls.
text1 = (
    "Artificial intelligence is the simulation of human intelligence processes by machines, especially computer systems. "
    "Specific applications of AI include expert systems, natural language processing, speech recognition and machine vision."
)
text2 = (
    "Natural language processing (NLP) is a branch of artificial intelligence (AI) that enables computers to comprehend, generate, and manipulate human language. "
    "Natural language processing has the ability to interrogate the data with natural language text or voice."
)

# encode() is called on a single string each time and without
# convert_to_tensor, so the encoder's default output type is used.
emd1 = model.encode(text1)
emd2 = model.encode(text2)

# To see how the score changes for an unrelated sentence, swap in:
# emd2 = model.encode("Tom deserves unbiased judgement")


**Cosine Similarity**

In [None]:
# Cosine similarity between the two passage embeddings computed above.
# NOTE(review): per the sentence-transformers docs, cos_sim returns a matrix of
# pairwise scores, so two single embeddings should give a 1x1 tensor — confirm.
cos_similarity = util.cos_sim(emd1, emd2)
# Bare expression on the last line: the notebook shows its value as the cell output.
cos_similarity

**Compute Cosine Similarity Between All Pairs**

In [None]:
# Corpus for the all-pairs comparison. This rebinds `sentences`, replacing
# the list used in the embedding example earlier in the notebook.
sentences = [
    "A man is eating food.",
    "A man is eating a piece of bread.",
    "A man is riding a horse.",
    "The girl is carrying a baby.",
    "A woman is playing violin.",
    "two men pushed carts through the woods.",
    "A man is riding a white horse on an enclosed ground.",
    "A monkey is playing drums.",
    "A cheetah is running behind its prey.",
]

In [None]:
# Encode the sentences to get their embeddings (as a tensor).
embeddings = model.encode(sentences, convert_to_tensor=True)

# Compute cosine similarity between all pairs by comparing the embeddings
# with themselves: entry [i][j] scores sentence i against sentence j.
# The helper is `util.cos_sim`; the previous `util.cos_sin` does not exist
# and raised AttributeError.
cosine_scores = util.cos_sim(embeddings, embeddings)

In [None]:
# Gather every unordered pair (i < j) as [score, i, j]. Starting j at i + 1
# skips both the diagonal (a sentence against itself) and the mirrored
# duplicates of the score matrix.
n_rows = len(cosine_scores)
all_sentence_combinations = [
    [cosine_scores[i][j], i, j]
    for i in range(n_rows - 1)
    for j in range(i + 1, n_rows)
]

# Order the pairs from most to least similar; the first element of each
# entry (the score) is the sort key.
all_sentence_combinations.sort(key=lambda entry: entry[0], reverse=True)

In [None]:
# Report the five highest-scoring pairs; the list is already sorted in
# decreasing order of similarity. The score is looked up again in
# cosine_scores rather than taken from the stored entry.
print("Top-5 most similar pairs:")
for _score, left, right in all_sentence_combinations[:5]:
    row = f"{sentences[left]} \t\t {sentences[right]} \t\t {cosine_scores[left][right]:.4f}"
    print(row)

#### Semantic Search

In [None]:
# Switch to the 'clips/mfaq' checkpoint for the question/answer cells below.
# The import repeats the one from the embedding section (presumably so this
# part can be run on its own). Note that `model` is rebound here, so any
# earlier cell re-run after this one will use 'clips/mfaq' instead of
# 'all-MiniLM-L6-v2'.
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('clips/mfaq')

In [None]:
# One FAQ-style query and three candidate answers. The query carries a <Q>
# prefix and each answer an <A> prefix.
# NOTE(review): the query also has a closing </Q> tag while the answers have
# no closing tag; the clips/mfaq model card appears to use opening tags
# only — confirm whether the closing tag is intended.
question = "<Q>How many models can I host on Hugging Face?</Q>"
answer_1, answer_2, answer_3 = (
    "<A>All plans come with unlimited private models and datasets.",
    "<A>AutoNLP is an automatic way to train and deploy state-of-the-art NLP models, seamlessly integrated with the Hugging Face ecosystem.",
    "<A>Based on how much training data and model variants are created, we send you a compute cost and payment link - as low as $10 per job.",
)

In [None]:
# Embed the candidate answers and the query with the same model.
corpus_embeddings = model.encode([answer_1, answer_2, answer_3])
query_embedding = model.encode(question)

# Rank the answers against the query. The result is only stored here, not
# displayed.
# NOTE(review): per the sentence-transformers docs, each query gets a list of
# hits (corpus_id + score) in decreasing score order — confirm before
# indexing into `results`.
results = util.semantic_search(
    query_embeddings=query_embedding,
    corpus_embeddings=corpus_embeddings,
)

In [None]:
from transformers import pipeline