# Preamble
___

In [52]:
import os
import pandas as pd
import numpy as np
from transformers import AutoTokenizer, AutoModel
import torch
from sklearn.metrics.pairwise import cosine_similarity
from pandarallel import pandarallel
import swifter
from tqdm import tqdm
from torch.utils.data import DataLoader, TensorDataset
from torch.nn.utils.rnn import pad_sequence
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Project root against which every relative path in this notebook is resolved.
# Set the REPLICACION_JACCARD_DIR environment variable to run the notebook on
# another machine; the original local path is kept as the default so existing
# setups behave exactly as before.
directorio = os.environ.get(
    "REPLICACION_JACCARD_DIR",
    "/Users/nicolasdecamino/Documents/Investigacion/Paper Polities/Replicacion Jaccard",
)
os.chdir(directorio)

# BERT tokenization
___

In [10]:
# Read the global dataset. The file is semicolon-delimited and the path is
# relative to the current working directory.
df = pd.read_csv(
    'Data/base-global/base-global.csv',
    delimiter=';',
)

In [4]:
# Fetch the pretrained BERT checkpoint and its tokenizer. Both are loaded from
# the same identifier so the tokenizer vocabulary matches the model weights.
model_name = 'dccuchile/bert-base-spanish-wwm-uncased'
tokenizer = AutoTokenizer.from_pretrained(pretrained_model_name_or_path=model_name)
model = AutoModel.from_pretrained(pretrained_model_name_or_path=model_name)

Some weights of BertModel were not initialized from the model checkpoint at dccuchile/bert-base-spanish-wwm-uncased and are newly initialized: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [53]:
# Length of the longest tokenized description (special tokens included):
def get_length(description):
    """Return how many token ids `tokenizer` produces for `description`,
    counting the special tokens it adds."""
    return len(tokenizer.encode(description, add_special_tokens=True))


# Token count of every row, computed without a progress bar.
lengths = (
    df['lemmatized_description']
    .swifter.progress_bar(False)
    .apply(get_length)
)
max_length = lengths.max()

In [54]:
# Tokenize all lemmatized descriptions in a single batched call, padding every
# row to max_length. This replaces a per-row encode_plus loop followed by
# torch.cat and yields the same two tensors.
#
# truncation=True guarantees a uniform (n_rows, max_length) shape even when
# max_length is stale (e.g. computed on an earlier version of df); when
# max_length is the true maximum no row is actually shortened.
encoded_descriptions = tokenizer(
    df['lemmatized_description'].tolist(),
    add_special_tokens=True,
    max_length=int(max_length),  # plain int rather than a numpy integer
    padding='max_length',
    truncation=True,
    return_attention_mask=True,
    return_tensors='pt'
)

# One row per description, in the same order as the rows of df.
input_ids = encoded_descriptions['input_ids']
attention_masks = encoded_descriptions['attention_mask']

In [55]:
# Create DataLoader:
# shuffle=False keeps the batches in the same order as the rows of df, so the
# embeddings collected batch by batch can be matched back to their
# descriptions. Shuffling only helps training; this loader is used for
# inference, where it would silently scramble the row/embedding alignment.
dataset = TensorDataset(input_ids, attention_masks)
data_loader = DataLoader(dataset, batch_size=16, shuffle=False)

In [47]:
# Select where the model runs: a torch device string ('cpu', 'cuda', 'mps')
# or 'colab' to use a TPU through torch_xla.
machine = 'mps'

if machine == 'colab':
    # 'colab' is not a valid torch.device string, so the XLA device has to be
    # resolved here instead of going through torch.device(machine).
    import torch_xla
    import torch_xla.core.xla_model as xm
    device = xm.xla_device()
else:
    device = torch.device(machine)

# Move the model once, after the device has been resolved.
_ = model.to(device)

In [None]:
# Embed every description and keep three poolings of the model's first output
# (the last hidden state). Each list receives one CPU tensor per batch, in the
# order the batches come out of data_loader.
model.eval()
embeds_CLSs, embeds_mean, embeds_max_over_time = [], [], []
for batch in tqdm(data_loader):
    b_input_ids = batch[0].to(device)
    b_attention_masks = batch[1].to(device)

    with torch.no_grad():

        outputs = model(b_input_ids, token_type_ids=None, attention_mask=b_attention_masks)[0]

        # Boolean mask broadcastable over the hidden dimension:
        # True on real tokens, False on padding positions.
        token_mask = b_attention_masks.unsqueeze(-1).bool()

        # Hidden state of the first token of each sequence.
        CLSs = outputs[:, 0, :]

        # Average over real tokens only. Every row is padded to max_length, so
        # an unmasked mean would mostly average padding positions.
        token_counts = token_mask.sum(dim=1).clamp(min=1)
        means = (outputs * token_mask).sum(dim=1) / token_counts

        # Max over the sequence axis, ignoring padding. Tensor.max(dim=...)
        # returns a (values, indices) pair, so keep only the values tensor.
        max_over_time = outputs.masked_fill(~token_mask, float('-inf')).max(dim=1).values

        # Store on the CPU so accelerator memory does not grow with the
        # number of batches.
        embeds_CLSs.append(CLSs.cpu())
        embeds_mean.append(means.cpu())
        embeds_max_over_time.append(max_over_time.cpu())

# Similarity
___

In [None]:
# Cosine similarity between the mean-pooled BERT embeddings of two texts
def calculate_similarity(text1, text2):
    """Return the cosine similarity between the embeddings of two texts.

    Each text is tokenized (truncated to 512 tokens), passed through `model`,
    and represented by the mean of its last hidden state over the token axis.

    Parameters
    ----------
    text1, text2 : str
        Texts to compare.

    Returns
    -------
    numpy.ndarray
        Output of sklearn's `cosine_similarity` for the two embeddings.
    """
    # The model may have been moved to an accelerator; the inputs must live on
    # the same device or the forward pass raises a device-mismatch error.
    model_device = next(model.parameters()).device

    # Tokenize and encode the texts
    inputs1 = tokenizer(text1, return_tensors='pt', truncation=True, max_length=512).to(model_device)
    inputs2 = tokenizer(text2, return_tensors='pt', truncation=True, max_length=512).to(model_device)

    # Get the embeddings of the texts
    with torch.no_grad():
        embeddings1 = model(**inputs1).last_hidden_state.mean(dim=1)
        embeddings2 = model(**inputs2).last_hidden_state.mean(dim=1)

    # Compute the cosine similarity between the embeddings. scikit-learn works
    # on host arrays, so bring the tensors back to the CPU first.
    similarity = cosine_similarity(embeddings1.cpu().numpy(), embeddings2.cpu().numpy())

    return similarity

In [None]:
def get_positive_positions(v1, v2):
    """Return the indices at which both `v1` and `v2` are non-zero.

    Despite the name, the check is "different from 0.0", so negative entries
    count as well. The range of indices is taken from the vectors themselves
    (up to the shorter one), not from any global DataFrame.

    Parameters
    ----------
    v1, v2 : sequence of numbers
        Vectors to compare position by position.

    Returns
    -------
    list of int
        Positions where neither vector is zero; empty if there are none.
    """
    positions = [
        i for i, (a, b) in enumerate(zip(v1, v2))
        if a != 0.0 and b != 0.0
    ]
    return positions

def _cosine_similarity(v1, v2):
    positions = get_positive_positions(v1, v2)
    if len(positions) == 0:
        return 0
    else:
      dot_product = np.dot(v1, v2)
      norm_vector1 = np.linalg.norm(v1)
      norm_vector2 = np.linalg.norm(v2)

      similarity = dot_product / (norm_vector1 * norm_vector2)
      return similarity