# Tema 3: Embeddings Contextuales

In [1]:
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Reference sentence that every candidate below is compared against.
query = "Dogs are domestic animals."

# Candidate sentences scored against the query in each exercise.
sentences = ["Dogs are pets.", "This is a dog.", "They are free today."]


## Ejercicio 1: ELMo

In [3]:
import numpy as np
import tensorflow as tf
import tensorflow_hub as hub


# TensorFlow Hub handle of the ELMo module (version 3) used in this exercise.
ELMO_MODULE_URL = "https://tfhub.dev/google/elmo/3"
elmo = hub.load(ELMO_MODULE_URL)

In [4]:
def get_elmo_embeddings(text):
    """Run the ELMo module on one text and return its 'elmo' output.

    The text is wrapped in a one-element batch, passed through the module's
    'default' signature, and the 'elmo' entry of the result is converted to
    a NumPy array. The batch axis is kept, so callers index ``[0]`` to get
    the rows for the single input (presumably one row per token -- confirm
    against the module documentation).
    """
    batch = tf.constant([text])
    outputs = elmo.signatures['default'](batch)
    return outputs['elmo'].numpy()


def get_elmo_mean_vector(text):
    """Return a single vector for ``text`` by mean-pooling its ELMo output.

    Takes the first (and only) batch item from ``get_elmo_embeddings`` and
    averages it along axis 0, i.e. across its rows.
    """
    first_item = get_elmo_embeddings(text)[0]
    return first_item.mean(axis=0)


def get_elmo_max_vector(text):
    """Return a single vector for ``text`` by max-pooling its ELMo output.

    Takes the first (and only) batch item from ``get_elmo_embeddings`` and
    keeps, for every column, the largest value found along axis 0.
    """
    first_item = get_elmo_embeddings(text)[0]
    return first_item.max(axis=0)

In [5]:
# Pool the query once up front; every candidate sentence is compared to it.
query_mean_vector = get_elmo_mean_vector(query)
query_max_vector = get_elmo_max_vector(query)

separator = '-------------------------------------------------------------------'

for sent in sentences:
    sent_mean_vector = get_elmo_mean_vector(sent)
    sent_max_vector = get_elmo_max_vector(sent)

    print(separator, f'Query: "{query}" - Sentence: "{sent}"', separator, sep='\n')
    # Each vector is wrapped in a list so cosine_similarity receives 2-D input.
    for label, query_vec, sent_vec in (
        ("Mean:", query_mean_vector, sent_mean_vector),
        ("Max:", query_max_vector, sent_max_vector),
    ):
        print(label, cosine_similarity([query_vec], [sent_vec]))
    print()

-------------------------------------------------------------------
Query: "Dogs are domestic animals." - Sentence: "Dogs are pets."
-------------------------------------------------------------------
Mean: [[0.8564701]]
Max: [[0.8845436]]

-------------------------------------------------------------------
Query: "Dogs are domestic animals." - Sentence: "This is a dog."
-------------------------------------------------------------------
Mean: [[0.5757736]]
Max: [[0.741337]]

-------------------------------------------------------------------
Query: "Dogs are domestic animals." - Sentence: "They are free today."
-------------------------------------------------------------------
Mean: [[0.43642992]]
Max: [[0.682837]]



## Ejercicio 2: BERT

In [6]:
import torch
from transformers import BertModel, BertTokenizer


# The tokenizer and the encoder are loaded from the same pretrained checkpoint.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

In [None]:
def get_bert_embeddings(text):
    """Encode ``text`` with BERT and return last-layer embeddings as NumPy.

    The text is tokenized with padding and truncation enabled, the model is
    run without gradient tracking, and ``last_hidden_state`` is converted to
    a NumPy array. The first and last sequence positions are then removed;
    the batch axis is kept.

    NOTE(review): for a single sentence the dropped positions are the [CLS]
    and [SEP] tokens. With a padded batch the last position could be padding
    instead -- confirm before passing several texts at once.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")
    # Inference only, so autograd bookkeeping is switched off.
    with torch.no_grad():
        model_output = model(**encoded)
    hidden_states = model_output.last_hidden_state.numpy()
    # Drop the first ([CLS]) and last ([SEP]) position of a single sentence.
    return hidden_states[:, 1:-1, :]


def get_bert_mean_vector(text):
    """Return a single vector for ``text`` by mean-pooling its BERT output.

    Takes the first batch item from ``get_bert_embeddings`` and averages it
    along axis 0, i.e. across the remaining sequence positions.
    """
    first_item = get_bert_embeddings(text)[0]
    return first_item.mean(axis=0)


def get_bert_max_vector(text):
    """Return a single vector for ``text`` by max-pooling its BERT output.

    Takes the first batch item from ``get_bert_embeddings`` and keeps, for
    every column, the largest value found along axis 0.
    """
    first_item = get_bert_embeddings(text)[0]
    return first_item.max(axis=0)

In [8]:
# Pool the query once up front; every candidate sentence is compared to it.
query_mean_vector = get_bert_mean_vector(query)
query_max_vector = get_bert_max_vector(query)

separator = '-------------------------------------------------------------------'

for sent in sentences:
    sent_mean_vector = get_bert_mean_vector(sent)
    sent_max_vector = get_bert_max_vector(sent)

    print(separator, f'Query: "{query}" - Sentence: "{sent}"', separator, sep='\n')
    # Each vector is wrapped in a list so cosine_similarity receives 2-D input.
    for label, query_vec, sent_vec in (
        ("Mean:", query_mean_vector, sent_mean_vector),
        ("Max:", query_max_vector, sent_max_vector),
    ):
        print(label, cosine_similarity([query_vec], [sent_vec]))
    print()

-------------------------------------------------------------------
Query: "Dogs are domestic animals." - Sentence: "Dogs are pets."
-------------------------------------------------------------------
Mean: [[0.8171347]]
Max: [[0.7983242]]

-------------------------------------------------------------------
Query: "Dogs are domestic animals." - Sentence: "This is a dog."
-------------------------------------------------------------------
Mean: [[0.69774485]]
Max: [[0.81108034]]

-------------------------------------------------------------------
Query: "Dogs are domestic animals." - Sentence: "They are free today."
-------------------------------------------------------------------
Mean: [[0.57056797]]
Max: [[0.75801915]]



## Ejercicio 3: SBERT

In [9]:
from sentence_transformers import SentenceTransformer


# Two pretrained Sentence-Transformers checkpoints, compared side by side below.
MINILM_CHECKPOINT = 'all-MiniLM-L6-v2'
MPNET_CHECKPOINT = 'all-mpnet-base-v2'

model_mini = SentenceTransformer(MINILM_CHECKPOINT)
model_mpnet = SentenceTransformer(MPNET_CHECKPOINT)

# Same query and candidates as at the top of the notebook, repeated here so
# this cell does not depend on the earlier definitions.
query = "Dogs are domestic animals."
sentences = ["Dogs are pets.", "This is a dog.", "They are free today."]

# Encode the query once with each model; the loop reuses these vectors.
query_vector_mini = model_mini.encode(query)
query_vector_mpnet = model_mpnet.encode(query)

separator = '-------------------------------------------------------------------'

for sent in sentences:
    sent_vector_mini = model_mini.encode(sent)
    sent_vector_mpnet = model_mpnet.encode(sent)

    print(separator, f'Query: "{query}" - Sentence: "{sent}"', separator, sep='\n')
    # Each vector is wrapped in a list so cosine_similarity receives 2-D input.
    for label, query_vec, sent_vec in (
        ("minilm:", query_vector_mini, sent_vector_mini),
        ("mpnet:", query_vector_mpnet, sent_vector_mpnet),
    ):
        print(label, cosine_similarity([query_vec], [sent_vec]))
    print()

-------------------------------------------------------------------
Query: "Dogs are domestic animals." - Sentence: "Dogs are pets."
-------------------------------------------------------------------
minilm: [[0.8485726]]
mpnet: [[0.8351238]]

-------------------------------------------------------------------
Query: "Dogs are domestic animals." - Sentence: "This is a dog."
-------------------------------------------------------------------
minilm: [[0.55490136]]
mpnet: [[0.4824381]]

-------------------------------------------------------------------
Query: "Dogs are domestic animals." - Sentence: "They are free today."
-------------------------------------------------------------------
minilm: [[0.09792651]]
mpnet: [[0.06827898]]

