In [13]:
import re
from collections import defaultdict
import nltk
from nltk.corpus import stopwords
import torch
from transformers import AutoTokenizer, AutoModel
from tqdm import tqdm
import numpy as np
import os
import pandas as pd

### Load dataset and count words

In [14]:
dataset_path = "../1-dataset/VUAMC_sentences_labeled.csv"
data = pd.read_csv(dataset_path)
sentences = data["sentence"].tolist()

# Turn each "metaphors" cell into a set of lower-cased, stripped entries.
# Cells are ";"-separated strings; any non-string cell (presumably NaN for an
# empty CSV field -- confirm against the dataset) becomes an empty set.
metaphors = []
for raw_cell in data["metaphors"].tolist():
    if isinstance(raw_cell, str):
        metaphors.append({piece.strip().lower() for piece in raw_cell.split(";")})
    else:
        metaphors.append(set())

# Per-word tallies: how often a word occurs in a sentence that lists it as a
# metaphor versus in a sentence that does not.
metaphor_counts = defaultdict(int)
literal_counts = defaultdict(int)

for text, flagged in zip(sentences, metaphors):
    for token in re.findall(r"\w+", text.lower()):
        tally = metaphor_counts if token in flagged else literal_counts
        tally[token] += 1

### Filter words

In [15]:
# Keep words that reach the frequency threshold (40) in BOTH classes, so each
# selected word has enough metaphorical and literal examples.
top_words = {
    candidate
    for candidate, n_metaphor in metaphor_counts.items()
    if n_metaphor >= 40 and literal_counts[candidate] >= 40
}

# Drop English stopwords from the selection.
nltk.download("stopwords")
stopwords_set = set(stopwords.words("english"))
top_words.difference_update(stopwords_set)

print(f"Selected words: {top_words}")

Selected words: {'given', 'long', 'make', 'go', 'take', 'see', 'bit', 'give', 'found', 'put', 'way', 'back', 'high', 'get', 'next', 'came', 'got', 'come'}


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mattia/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [16]:
def get_token_indices(word, all_tokens, tokenizer):
    """Locate the first whole-word occurrence of `word` in a tokenized sentence.

    `word` is tokenized on its own with `tokenizer.tokenize`, and the resulting
    sequence of pieces is searched for as a contiguous run inside `all_tokens`.

    A run is rejected when the token immediately after it starts with "##"
    (the WordPiece continuation marker used by BERT-style tokenizers): in that
    case the run is only the beginning of a longer word, e.g. "got" followed
    by "##cha".

    Args:
        word: The word to look for.
        all_tokens: Token strings of the encoded sentence.
        tokenizer: Any object exposing `tokenize(str)` that returns the token
            strings for `str`.

    Returns:
        A list of consecutive indices into `all_tokens` covering the first
        accepted match, or an empty list when there is none.
    """
    # A single word may be split into several sub-tokens.
    word_tokens = list(tokenizer.tokenize(word))
    n_pieces = len(word_tokens)
    if n_pieces == 0:
        return []

    all_tokens = list(all_tokens)
    n_tokens = len(all_tokens)
    for start in range(n_tokens - n_pieces + 1):
        end = start + n_pieces
        if all_tokens[start:end] != word_tokens:
            continue
        # The match must end at a word boundary, not in the middle of a word.
        if end < n_tokens and all_tokens[end].startswith("##"):
            continue
        return list(range(start, end))
    return []

In [17]:
def embeddings(model_name, out_path="top_words_embeddings.npz"):
    """Extract contextual embeddings of the selected words and save them to disk.

    Every sentence in the notebook-level `sentences` list that contains at
    least one word from `top_words` is encoded with the given model. For each
    selected word present in the sentence, the last-layer hidden states of the
    word's tokens are averaged into a single vector.

    Relies on the notebook-level names `sentences`, `metaphors`, `top_words`
    and `get_token_indices` defined by earlier cells.

    Args:
        model_name: Checkpoint identifier passed to
            `AutoTokenizer.from_pretrained` and `AutoModel.from_pretrained`.
        out_path: Destination `.npz` file. Defaults to the file name this
            function has always written; pass a different path per model to
            avoid overwriting earlier results.

    Side effects:
        Writes `out_path` with three arrays: `words`, `embeddings` and
        `labels` (1 when the word is in the sentence's metaphor set, else 0).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModel.from_pretrained(model_name)
    model.eval()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    top_words_list = []
    embeddings_list = []
    labels_list = []

    for sentence, sentence_metaphors in tqdm(zip(sentences, metaphors), total=len(sentences), desc="Encoding"):
        # Only look up words that actually occur in this sentence (same word
        # definition as the counting step). sorted() makes the row order of
        # the saved arrays reproducible: iteration order of a set of strings
        # can change between interpreter runs.
        present_words = sorted(set(re.findall(r"\w+", sentence.lower())) & top_words)
        if not present_words:
            continue

        encoded = tokenizer(sentence, return_tensors="pt", truncation=True)
        all_tokens = tokenizer.convert_ids_to_tokens(encoded["input_ids"].squeeze(0).tolist())

        with torch.no_grad():
            outputs = model(**encoded.to(device))
        hidden_states = outputs.last_hidden_state.squeeze(0)  # [n_tokens, hidden_dim]

        for word in present_words:
            token_idxs = get_token_indices(word, all_tokens, tokenizer)
            if not token_idxs:
                # No matching tokens in the encoded sentence (presumably the
                # word fell beyond the truncation limit or was tokenized
                # differently in context); skip it.
                continue
            # Average the sub-token vectors into one embedding for the word.
            embedding = hidden_states[token_idxs].mean(dim=0).cpu()
            top_words_list.append(word)
            embeddings_list.append(embedding.numpy())
            labels_list.append(1 if word in sentence_metaphors else 0)

    np.savez(out_path, words=top_words_list, embeddings=embeddings_list, labels=labels_list)

In [18]:
# Checkpoint used for the extraction; the call below encodes the sentences and
# writes the resulting arrays to an .npz file.
model_name = "bert-base-uncased"
embeddings(model_name=model_name)

Encoding: 100%|██████████| 16202/16202 [02:19<00:00, 116.25it/s]
