In [2]:
import warnings
warnings.filterwarnings("ignore")

In [3]:
import numpy as np 
import torch
import seaborn as ns
import scipy

from datasets import load_dataset 

from transformers import BertTokenizer, BertModel
from sklearn.metrics.pairwise import cosine_similarity



## GloVe word embedding 
GloVe embeddings are static: each token has a single vector that does not depend on context, so a word appearing in two different sentences gets the same token embedding.

In [4]:
import gensim.downloader as api
# Identifier of the pre-trained GloVe vectors to fetch through gensim's downloader.
glove_model_name = 'glove-wiki-gigaword-100'
word_vectors = api.load(glove_model_name)

In [10]:
# Word-analogy query: the two vectors closest to (doctor - male + female).
word_vectors.most_similar(
    positive=["doctor", "female"],
    negative=["male"],
    topn=2,
)

[('nurse', 0.7539128661155701), ('physician', 0.739092230796814)]

### BERT Model
BERT token embeddings are contextual: the encoder's attention layers let each token's vector depend on the other tokens in the sentence.

model path: https://huggingface.co/google-bert/bert-base-uncased

In [13]:
# One checkpoint name drives both loads, so the tokenizer and the encoder
# always come from the same pre-trained model.
model_name = "bert-base-uncased"

# Loaded in order: tokenizer first, then the encoder weights.
tokenizer, model = (
    loader.from_pretrained(model_name) for loader in (BertTokenizer, BertModel)
)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

In [None]:
def get_bert_embeddings(sentence, word):
    """Return the contextual BERT embedding of ``word`` as used in ``sentence``.

    Relies on the notebook-level ``tokenizer`` and ``model`` objects.

    Args:
        sentence: Text to encode.
        word: Word to look up. It is tokenized with the same tokenizer, so
            casing is normalised exactly as it is for the sentence, and a word
            that is split into several WordPieces is still found.

    Returns:
        A 1-D tensor taken from the last hidden state. For a word that maps to
        one token this is that token's vector; for a multi-piece word it is
        the mean of the vectors of its pieces. The first occurrence is used.

    Raises:
        ValueError: If ``word`` produces no tokens or does not occur in
            ``sentence``.
    """
    inputs = tokenizer(sentence, return_tensors="pt")

    # Inference only: skip building the autograd graph.
    with torch.no_grad():
        outputs = model(**inputs)
    last_hidden_states = outputs.last_hidden_state

    # Read the tokens back from the encoded ids so that positions line up with
    # the hidden states directly (special tokens included) instead of
    # re-tokenizing the sentence and compensating with a hard-coded offset.
    sentence_tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0].tolist())

    word_pieces = tokenizer.tokenize(word)
    if not word_pieces:
        raise ValueError(f"{word!r} does not produce any tokens")

    span = len(word_pieces)
    for start in range(len(sentence_tokens) - span + 1):
        if sentence_tokens[start:start + span] == word_pieces:
            # Mean over a single row is that row, so single-token words give
            # the same vector as plain indexing would.
            return last_hidden_states[0, start:start + span, :].mean(dim=0)

    raise ValueError(f"{word!r} was not found in the sentence {sentence!r}")

In [66]:
# The same surface word used in two different senses.
word = "bank"
sentence1 = "The bank is closed on Sundays."
sentence2 = "We sat on the river bank and had a nice meal."

# One contextual BERT vector per sentence, converted to NumPy for scikit-learn.
bert_embedding1, bert_embedding2 = (
    get_bert_embeddings(text, word).detach().numpy()
    for text in (sentence1, sentence2)
)

# GloVe is looked up by the word alone; no sentence is involved.
GloVe_embedding = word_vectors[word]

In [68]:
# Cosine similarity between the two contextual BERT vectors of the word.
bert_similarity = cosine_similarity([bert_embedding1], [bert_embedding2])[0][0]
# GloVe has a single vector for the word, so it is compared with itself here.
glove_similarity = cosine_similarity([GloVe_embedding], [GloVe_embedding])[0][0]

print(f"Bert similarit: {bert_similarity}")
print(f"GloVe embeddings: {glove_similarity}")

Bert similarit: 0.44242483377456665
GloVe embeddings: 1.0


### Cross Encoder 
A cross-encoder provides a good sentence-to-sentence relevance score when you have two sentences. The sentence pair goes through a classifier after the pooler layer, which outputs a relevance score.

In [69]:
from sentence_transformers import CrossEncoder
# Hugging Face model id of the cross-encoder used to score sentence pairs below.
cross_encoder_name = "cross-encoder/ms-marco-MiniLM-L12-v2"
# Instantiating the CrossEncoder fetches the checkpoint files (see the progress bars in the output).
cross_encoder = CrossEncoder(cross_encoder_name)

config.json:   0%|          | 0.00/791 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/133M [00:00<?, ?B/s]

tokenizer_config.json: 0.00B [00:00, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/132 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

In [100]:
question = "what is the best exercise for legs?"

answers = [
    "The best exercise is squat.",
    "I love food.",
    "Deadlift is the best exercise.",
]

# The cross-encoder scores (question, candidate) pairs, one score per pair.
question_answer_pairs = [(question, candidate) for candidate in answers]
scores = cross_encoder.predict(question_answer_pairs)

# Position of the highest score, then the matching candidate as the cell output.
best_index = torch.argmax(torch.tensor(scores)).item()
answers[best_index]

'Deadlift is the best exercise.'