In [None]:
!pip install -U sentence-transformers

In [16]:
from sentence_transformers import SentenceTransformer, util

# Load the pretrained "all-MiniLM-L6-v2" checkpoint by name; this object is
# reused by the encoding cells that follow.
model = SentenceTransformer("all-MiniLM-L6-v2")

# Bare expression so the notebook displays the model's repr (its module stack).
model

SentenceTransformer(
  (0): Transformer({'max_seq_length': 256, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 384, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

In [None]:
# Example inputs: encode() is given a plain list of strings.
sentences = ['This framework generates embeddings for each input sentence',
             'sentences are passed as a list of string']

# Encode the whole list in one call; the result is iterated row by row below,
# pairing each row with the sentence at the same position.
embeddings = model.encode(sentences)

# Show each sentence next to its own vector. Print the per-sentence
# `embedding` from the loop, not the full `embeddings` array, which would
# repeat every vector on every iteration.
for sentence, embedding in zip(sentences, embeddings):
  print("Sentence: ", sentence)
  print("Embedding: ", embedding)

In [None]:
# Dump the full embeddings array produced by the previous encode() call.
print(embeddings)

In [25]:
# Encode two single sentences separately (a bare string instead of a list).
emb1 = model.encode("I am eating Apple")
emb2 = model.encode("I like fruits")

# Cosine similarity between the two embeddings; the recorded output shows the
# result comes back as a 1x1 tensor rather than a plain float.
cos_sim = util.cos_sim(emb1, emb2)
print("Cosine-Similarity: ", cos_sim)

Cosine-Similarity:  tensor([[0.5398]])


In [30]:
# Small corpus for the pairwise-similarity demo. Note this rebinds the
# `sentences` name used by the earlier encoding example.
sentences = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'The girl is carrying a baby.',
          'A man is riding a horse.',
          'A woman is playing violin.',
          'Two men pushed carts through the woods.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.'
          ]

# Encode all sentences in one call.
embeddings = model.encode(sentences)

# Compare the embeddings against themselves: entry [i][j] is the similarity
# between sentences[i] and sentences[j]. This rebinds `cos_sim` from the
# earlier single-pair score to a full matrix.
cos_sim = util.cos_sim(embeddings, embeddings)

In [31]:
# Display the full pairwise similarity matrix.
cos_sim

tensor([[ 1.0000,  0.7553, -0.1050,  0.2474, -0.0704, -0.0333,  0.1707,  0.0476,
          0.0630],
        [ 0.7553,  1.0000, -0.0610,  0.1442, -0.0809, -0.0216,  0.1157,  0.0362,
          0.0216],
        [-0.1050, -0.0610,  1.0000, -0.1088,  0.0217, -0.0413, -0.0928,  0.0231,
          0.0247],
        [ 0.2474,  0.1442, -0.1088,  1.0000, -0.0348,  0.0362,  0.7369,  0.0821,
          0.1389],
        [-0.0704, -0.0809,  0.0217, -0.0348,  1.0000, -0.1654, -0.0592,  0.1961,
          0.2564],
        [-0.0333, -0.0216, -0.0413,  0.0362, -0.1654,  1.0000,  0.0769, -0.0380,
         -0.0895],
        [ 0.1707,  0.1157, -0.0928,  0.7369, -0.0592,  0.0769,  1.0000,  0.0495,
          0.1191],
        [ 0.0476,  0.0362,  0.0231,  0.0821,  0.1961, -0.0380,  0.0495,  1.0000,
          0.6433],
        [ 0.0630,  0.0216,  0.0247,  0.1389,  0.2564, -0.0895,  0.1191,  0.6433,
          1.0000]])

In [35]:
# Number of rows in the similarity matrix (one per sentence).
len(cos_sim)

9

In [39]:
# Scratch check of the row indices 0 .. len(cos_sim)-2, i.e. every row except
# the last one.
x = range(len(cos_sim)-1)
for n in x:
  print(n)

0
1
2
3
4
5
6
7


In [34]:
# Collect every unordered pair (i < j) exactly once as (score, i, j); only the
# upper triangle of the matrix is read, so self-pairs and mirrored duplicates
# are skipped.
n_sentences = len(cos_sim)
all_sentences_combinations = [
  (cos_sim[i][j], i, j)
  for i in range(n_sentences - 1)
  for j in range(i + 1, n_sentences)
]

all_sentences_combinations

[(tensor(0.7553), 0, 1),
 (tensor(-0.1050), 0, 2),
 (tensor(0.2474), 0, 3),
 (tensor(-0.0704), 0, 4),
 (tensor(-0.0333), 0, 5),
 (tensor(0.1707), 0, 6),
 (tensor(0.0476), 0, 7),
 (tensor(0.0630), 0, 8),
 (tensor(-0.0610), 1, 2),
 (tensor(0.1442), 1, 3),
 (tensor(-0.0809), 1, 4),
 (tensor(-0.0216), 1, 5),
 (tensor(0.1157), 1, 6),
 (tensor(0.0362), 1, 7),
 (tensor(0.0216), 1, 8),
 (tensor(-0.1088), 2, 3),
 (tensor(0.0217), 2, 4),
 (tensor(-0.0413), 2, 5),
 (tensor(-0.0928), 2, 6),
 (tensor(0.0231), 2, 7),
 (tensor(0.0247), 2, 8),
 (tensor(-0.0348), 3, 4),
 (tensor(0.0362), 3, 5),
 (tensor(0.7369), 3, 6),
 (tensor(0.0821), 3, 7),
 (tensor(0.1389), 3, 8),
 (tensor(-0.1654), 4, 5),
 (tensor(-0.0592), 4, 6),
 (tensor(0.1961), 4, 7),
 (tensor(0.2564), 4, 8),
 (tensor(0.0769), 5, 6),
 (tensor(-0.0380), 5, 7),
 (tensor(-0.0895), 5, 8),
 (tensor(0.0495), 6, 7),
 (tensor(0.1191), 6, 8),
 (tensor(0.6433), 7, 8)]

In [42]:
# Sort the (score, i, j) tuples by score, highest similarity first.
all_sentences_combinations = sorted(all_sentences_combinations, key=lambda x: x[0], reverse=True)

print("Top 5 most similar pairs: ")
# Each tuple already carries its score, so use it directly instead of looking
# the same value up again in cos_sim[i][j].
for score, i, j in all_sentences_combinations[0:5]:
  print("{} \t {} \t {:.4f}".format(sentences[i], sentences[j], score))

Top 5 most similar pairs: 
A man is eating food. 	 A man is eating a piece of bread. 	 0.7553
A man is riding a horse. 	 A man is riding a white horse on an enclosed ground. 	 0.7369
A monkey is playing drums. 	 Someone in a gorilla costume is playing a set of drums. 	 0.6433
A woman is playing violin. 	 Someone in a gorilla costume is playing a set of drums. 	 0.2564
A man is eating food. 	 A man is riding a horse. 	 0.2474


### Semantic Search

In [43]:
# One query and three candidate answers to rank against it.
question = "How many models can I host on HuggingFace?"
answer_1 = "All plans come with unlimited private models and datasets."
answer_2 = "AutoNLP is an automatic way to train and deploy state-of-the-art NLP models, seamlessly integrated with the Hugging Face ecosystem."
answer_3 = "Based on how much training data and model variants are created, we send you a compute cost and payment link - as low as $10 per job."

# Embed the query and the candidates with the same model so the vectors are
# comparable.
query_embedding = model.encode(question)
corpus_embeddings = model.encode([answer_1, answer_2, answer_3])

# The recorded output shows one list per query holding
# {'corpus_id', 'score'} dicts, ordered from highest to lowest score;
# corpus_id is the position in the list passed to encode() above.
print(util.semantic_search(query_embedding, corpus_embeddings))

[[{'corpus_id': 0, 'score': 0.35359811782836914}, {'corpus_id': 1, 'score': 0.3143519163131714}, {'corpus_id': 2, 'score': 0.24975530803203583}]]


In [44]:
from transformers import pipeline

In [46]:
# Pin the checkpoint and revision explicitly instead of relying on the
# task default: the default can change between library versions, and the
# library warns against using an unpinned pipeline. The values below are the
# ones the default resolved to in the recorded run of this cell.
qa_model = pipeline(
  "question-answering",
  model="distilbert-base-cased-distilled-squad",
  revision="626af31",
)
question = "How many models can I host on HuggingFace?"
context = "All plans come with unlimited private models and datasets"

# Extractive QA: the answer is a span of `context`; the result reports the
# span text together with its score and start/end character offsets.
qa_model(question=question, context=context)

No model was supplied, defaulted to distilbert-base-cased-distilled-squad and revision 626af31 (https://huggingface.co/distilbert-base-cased-distilled-squad).
Using a pipeline without specifying a model name and revision in production is not recommended.


{'score': 0.7239824533462524, 'start': 20, 'end': 29, 'answer': 'unlimited'}

In [48]:
from sklearn.cluster import KMeans
import numpy as np

# Loads the same "all-MiniLM-L6-v2" checkpoint that `model` was created from
# earlier in the notebook, under a second name.
embedder = SentenceTransformer("all-MiniLM-L6-v2")

# Corpus with example sentences
corpus = ['A man is eating food.',
          'A man is eating a piece of bread.',
          'Horse is eating grass.',
          'A man is eating pasta.',
          'A Woman is eating Biryani.',
          'The girl is carrying a baby.',
          'The baby is carried by the woman',
          'A man is riding a horse.',
          'A man is riding a white horse on an enclosed ground.',
          'A monkey is playing drums.',
          'Someone in a gorilla costume is playing a set of drums.',
          'A cheetah is running behind its prey.',
          'A cheetah chases prey on across a field.',
          'The cheetah is chasing a man who is riding the horse.',
          'man and women with their baby are watching cheetah in zoo'
          ]

# Encode the corpus; this rebinds `corpus_embeddings` from the semantic-search
# cell above.
corpus_embeddings = embedder.encode(corpus)

# Scale every row to unit L2 norm (keepdims=True keeps the norms as a column
# so the division broadcasts per row).
# NOTE(review): the model repr printed earlier lists a Normalize() stage, so
# the embeddings may already be unit length and this step redundant - confirm.
corpus_embeddings = corpus_embeddings / np.linalg.norm(corpus_embeddings, axis=1, keepdims=True)

In [None]:
# Inspect the first row of the normalised corpus embeddings.
corpus_embeddings[0]

In [65]:
# K-means starts from random centroids, so without a fixed seed the cluster
# ids (and sometimes the grouping itself) change from run to run. Fix
# random_state for reproducible output, and set n_init explicitly because its
# default differs between scikit-learn versions.
clustering_model = KMeans(n_clusters=3, n_init=10, random_state=42)
clustering_model.fit(corpus_embeddings)

# labels_ holds one cluster id per corpus sentence, in corpus order.
cluster_assignment = clustering_model.labels_
print(cluster_assignment)

[1 1 0 1 1 2 2 0 0 0 0 0 0 0 0]




In [70]:
# Report the cluster id chosen for each sentence, one line per sentence, in
# corpus order.
line_template = "Sentence ID: {} Cluster ID: {}"
for sentence_id in range(len(cluster_assignment)):
  print(line_template.format(sentence_id, cluster_assignment[sentence_id]))

Sentence ID: 0 Cluster ID: 1
Sentence ID: 1 Cluster ID: 1
Sentence ID: 2 Cluster ID: 0
Sentence ID: 3 Cluster ID: 1
Sentence ID: 4 Cluster ID: 1
Sentence ID: 5 Cluster ID: 2
Sentence ID: 6 Cluster ID: 2
Sentence ID: 7 Cluster ID: 0
Sentence ID: 8 Cluster ID: 0
Sentence ID: 9 Cluster ID: 0
Sentence ID: 10 Cluster ID: 0
Sentence ID: 11 Cluster ID: 0
Sentence ID: 12 Cluster ID: 0
Sentence ID: 13 Cluster ID: 0
Sentence ID: 14 Cluster ID: 0


In [66]:
# Group the corpus sentences by cluster id: {cluster_id: [sentence, ...]}.
# setdefault creates the list the first time a cluster id is seen, so clusters
# appear in the dict in order of first occurrence.
clustered_sentences = {}
for sentence_id, cluster_id in enumerate(cluster_assignment):
  clustered_sentences.setdefault(cluster_id, []).append(corpus[sentence_id])

clustered_sentences

{1: ['A man is eating food.',
  'A man is eating a piece of bread.',
  'A man is eating pasta.',
  'A Woman is eating Biryani.'],
 0: ['Horse is eating grass.',
  'A man is riding a horse.',
  'A man is riding a white horse on an enclosed ground.',
  'A monkey is playing drums.',
  'Someone in a gorilla costume is playing a set of drums.',
  'A cheetah is running behind its prey.',
  'A cheetah chases prey on across a field.',
  'The cheetah is chasing a man who is riding the horse.',
  'man and women with their baby are watching cheetah in zoo'],
 2: ['The girl is carrying a baby.', 'The baby is carried by the woman']}