# Tema 5: Embeddings contextuales

In [None]:
from sklearn.metrics.pairwise import cosine_similarity

In [None]:
# Reference sentence that every candidate below is compared against.
query = "Dogs are domestic animals."

# Candidate sentences whose similarity to the query is measured.
sentences = ["Dogs are pets.", "This is a dog.", "They are free today."]

## Ejercicio 1
Obtener embeddings contextuales con BERT y calcular la similitud coseno entre la query y cada frase usando las estrategias de mean pooling y max pooling.

### Apartado a
Cargar modelo y tokenizer de BERT.

In [None]:
import torch
import numpy as np
from transformers import BertModel, BertTokenizer


# Tokenizer and model are loaded from the same checkpoint so that the ids the
# tokenizer produces match the vocabulary the model was trained with.
_BERT_CHECKPOINT = 'bert-base-uncased'

tokenizer = BertTokenizer.from_pretrained(_BERT_CHECKPOINT)
model = BertModel.from_pretrained(_BERT_CHECKPOINT)

### Apartado b
Tokenizar la query de ejemplo.

In [None]:
# Encode the example query as PyTorch tensors.
tokens = tokenizer(
    query,
    padding=True,
    truncation=True,
    return_tensors="pt",
)

# Each element of 'input_ids' is the id sequence of one input text; map it
# back to the word-piece strings to see how the sentence was split.
for id_row in tokens['input_ids']:
    print(tokenizer.convert_ids_to_tokens(id_row))

### Apartado c
Definir funciones para obtener embeddings con mean y max pooling.

In [None]:
def get_bert_embeddings(text):
    """Return BERT last-layer token embeddings for ``text`` as a NumPy array.

    The text is encoded with the module-level ``tokenizer`` and run through the
    module-level ``model`` with gradient tracking disabled. The first and last
    positions along the sequence axis are dropped from the result.

    Args:
        text: Input passed straight to ``tokenizer``.

    Returns:
        ``last_hidden_state`` converted to NumPy and sliced ``[:, 1:-1, :]``.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    # Inference only: no gradients are needed.
    with torch.no_grad():
        last_hidden = model(**encoded).last_hidden_state

    token_vectors = last_hidden.numpy()

    # For a single sentence the first position is [CLS] and the last is [SEP];
    # both are special tokens, so they are removed.
    # NOTE(review): with a padded batch the last position of shorter inputs
    # would presumably be padding rather than [SEP] -- confirm before batching.
    return token_vectors[:, 1:-1, :]


def get_bert_mean_vector(text):
    """Return the mean-pooled BERT vector of ``text``.

    Only the first entry returned by ``get_bert_embeddings`` is used; its token
    embeddings are averaged over the token axis.
    """
    first_sequence = get_bert_embeddings(text)[0]
    return first_sequence.mean(axis=0)


def get_bert_max_vector(text):
    """Return the max-pooled BERT vector of ``text``.

    Only the first entry returned by ``get_bert_embeddings`` is used; the
    element-wise maximum is taken over the token axis.
    """
    first_sequence = get_bert_embeddings(text)[0]
    return first_sequence.max(axis=0)

### Apartado d
Calcular similitud entre la query y las frases.

In [None]:
# The query vectors do not depend on the candidate sentence: compute them once.
query_mean_vector = get_bert_mean_vector(query)
query_max_vector = get_bert_max_vector(query)

separator = '-------------------------------------------------------------------'

for sent in sentences:
    sent_mean_vector = get_bert_mean_vector(sent)
    sent_max_vector = get_bert_max_vector(sent)

    # Each vector is wrapped in a list because cosine_similarity is called on
    # collections of vectors.
    mean_similarity = cosine_similarity([query_mean_vector], [sent_mean_vector])
    max_similarity = cosine_similarity([query_max_vector], [sent_max_vector])

    print(separator, f'Query: "{query}" - Sentence: "{sent}"', separator, sep="\n")
    print("Mean:", mean_similarity)
    print("Max:", max_similarity)
    print()

## Ejercicio 2
Obtener embeddings con Sentence-BERT (SBERT) y comparar modelos MiniLM y MPNet.

In [None]:
from sentence_transformers import SentenceTransformer

# Two pretrained Sentence-BERT checkpoints to compare.
model_mini = SentenceTransformer('all-MiniLM-L6-v2')
model_mpnet = SentenceTransformer('all-mpnet-base-v2')

query = "Dogs are domestic animals."
sentences = ["Dogs are pets.", "This is a dog.", "They are free today."]

# The query is encoded once per model and reused for every candidate.
query_vector_mini = model_mini.encode(query)
query_vector_mpnet = model_mpnet.encode(query)

separator = '-------------------------------------------------------------------'

for sent in sentences:
    sent_vector_mini = model_mini.encode(sent)
    sent_vector_mpnet = model_mpnet.encode(sent)

    similarity_mini = cosine_similarity([query_vector_mini], [sent_vector_mini])
    similarity_mpnet = cosine_similarity([query_vector_mpnet], [sent_vector_mpnet])

    print(separator, f'Query: "{query}" - Sentence: "{sent}"', separator, sep="\n")
    print("minilm:", similarity_mini)
    print("mpnet:", similarity_mpnet)
    print()

## Ejercicio 3
Comparar embeddings de Word2Vec, BERT y SBERT para las mismas frases.

### Apartado a
Cargar modelo Word2Vec y definir función para obtener vectores de frase.

In [None]:
import gensim.downloader as api

# Load the pretrained 'word2vec-google-news-300' word vectors through the
# gensim downloader API.
# NOTE(review): presumably downloaded on first use and cached locally -- confirm.
w2v_model = api.load('word2vec-google-news-300')


def get_w2v_sentence_vector(text, model):
    """Return the average word vector of the words in ``text``.

    The text is lower-cased, periods are removed and the result is split on
    whitespace. Each token is looked up as-is; if it is not in the vocabulary,
    punctuation around it (commas, quotes, ``?``, ``!`` ...) is stripped and the
    lookup is retried, so that e.g. ``"john,"`` is found as ``"john"`` instead
    of being silently dropped.

    Args:
        text: Sentence to embed.
        model: Word-vector lookup supporting ``word in model``, ``model[word]``
            and a ``vector_size`` attribute.

    Returns:
        The element-wise mean of the vectors of the words found in ``model``,
        or a zero vector of length ``model.vector_size`` when none is found.
    """
    import string  # local import keeps this notebook cell self-contained

    word_vectors = []
    for raw in text.lower().replace('.', '').split():
        # Prefer the raw token so anything that matched before still matches;
        # only fall back to the punctuation-stripped form on a miss. Stripping
        # the edges only keeps contractions such as "don't" intact.
        word = raw if raw in model else raw.strip(string.punctuation)
        if word and word in model:
            word_vectors.append(model[word])

    if not word_vectors:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)

### Apartado b
Comparar similitud con los tres métodos.

In [None]:
query = "Dogs are domestic animals."
sentences = ["Dogs are pets.", "This is a dog.", "They are free today."]

# One query vector per embedding method, computed once and reused below.
query_w2v = get_w2v_sentence_vector(query, w2v_model)
query_bert = get_bert_mean_vector(query)
query_sbert = model_mini.encode(query)

separator = '-------------------------------------------------------------------'

for sent in sentences:
    sent_w2v = get_w2v_sentence_vector(sent, w2v_model)
    sent_bert = get_bert_mean_vector(sent)
    sent_sbert = model_mini.encode(sent)

    similarity_w2v = cosine_similarity([query_w2v], [sent_w2v])
    similarity_bert = cosine_similarity([query_bert], [sent_bert])
    similarity_sbert = cosine_similarity([query_sbert], [sent_sbert])

    print(separator, f'Query: "{query}" - Sentence: "{sent}"', separator, sep="\n")
    print("Word2Vec:", similarity_w2v)
    print("BERT (mean):", similarity_bert)
    print("SBERT (MiniLM):", similarity_sbert)
    print()

## Ejercicio 4
Medir la similitud de diferentes frases utilizando embeddings diferentes.

In [None]:
def compare_embeddings(query, sentences):
    """Print BERT and SBERT cosine similarities between a query and sentences.

    For every sentence, two similarities against ``query`` are printed: one
    from mean-pooled BERT vectors (``get_bert_mean_vector``) and one from the
    module-level ``model_mini`` sentence encoder.

    Args:
        query: Reference sentence.
        sentences: Iterable of sentences to compare against ``query``.

    Returns:
        None. Results are written to standard output.
    """
    separator = '-------------------------------------------------------------------'

    # The query embeddings are the same for every candidate: compute them once.
    bert_query_vec = get_bert_mean_vector(query)
    sbert_query_vec = model_mini.encode(query)

    for candidate in sentences:
        bert_score = cosine_similarity(
            [bert_query_vec], [get_bert_mean_vector(candidate)]
        )
        sbert_score = cosine_similarity(
            [sbert_query_vec], [model_mini.encode(candidate)]
        )

        print(separator)
        print(f'Query: "{query}" - Sentence: "{candidate}"')
        print(separator)
        print("BERT (mean):", bert_score)
        print("SBERT (MiniLM):", sbert_score)
        print()

### Apartado a
Frases que expresan la misma idea, pero con un orden de palabras diferente.

In [None]:
# Same words in a different order: one candidate changes who is being shouted
# at, the other rephrases the query.
query = "Don't shout at me, John."
sentences = ["Don't shout at John.", "John, stop shouting at me."]

compare_embeddings(query=query, sentences=sentences)

### Apartado b
Frases donde hay palabras que pueden tener más de un significado.

In [None]:
# "rock" appears as a music genre in the query and as a stone in the first
# candidate; the second candidate is about music but shares no key word.
query = "The rolling Stones are rock idols."
sentences = ["Don't throw me a rock.", "Iggy Pop is my favourite artist."]

compare_embeddings(query=query, sentences=sentences)

### Apartado c
Frases similares, pero que realmente no expresan lo mismo.

In [None]:
# Candidates that look close to the query on the surface but differ in the
# place referred to and/or in the sentiment expressed.
query = "I love the capital of Spain."
sentences = [
    "I like Madrid.",
    "I love the capital of Portugal.",
    "I love the capital of Japan.",
    "I hate the capital of Spain.",
    "I hate Japan",
]

compare_embeddings(query=query, sentences=sentences)