In [1]:
import torch
import gc
import transformers
import pandas as pd
from datasets import Dataset
from transformers import BertTokenizer, BertConfig
from transformers import BertForPreTraining, BertForMaskedLM

In [2]:
# Load gobernadoras_small.csv, reading only the `id` and `tweet` columns.
mujeres = pd.read_csv("gobernadoras_small.csv",usecols=['id','tweet'])

In [3]:
# Load gobernadores_small.csv, reading only the `id` and `tweet` columns.
hombres = pd.read_csv("gobernadores_small.csv",usecols=['id','tweet'])

In [4]:
# Display the row index of `hombres`; its stop value is the number of rows loaded.
hombres.index

RangeIndex(start=0, stop=37962, step=1)

In [5]:
import numpy as np

# Fixed seed so the random down-sampling below picks the same rows on every run.
np.random.seed(10)

# Balance the two corpora so they end up with the same number of rows by
# randomly dropping the surplus from whichever frame is larger.
# A positive `remove_n` means `mujeres` has the extra rows; a negative value
# means `hombres` does. np.random.choice cannot draw a negative number of
# samples, so the second case needs its own branch.
remove_n = len(mujeres) - len(hombres)
if remove_n >= 0:
    drop_indices = np.random.choice(mujeres.index, remove_n, replace=False)
    mujeres = mujeres.drop(drop_indices)
else:
    drop_indices = np.random.choice(hombres.index, -remove_n, replace=False)
    hombres = hombres.drop(drop_indices)

In [6]:
# Identifier of the pretrained checkpoint used for both the tokenizer and the model.
beto = 'dccuchile/bert-base-spanish-wwm-cased'

# NOTE(review): the tokenizer repr shown further down reports is_fast=False;
# offset mappings are normally a fast-tokenizer feature, so confirm whether
# `return_offset_mapping=True` has any effect here.
tokenizer = BertTokenizer.from_pretrained(beto, 
                        return_offset_mapping=True)
# Masked-language-model head loaded from the same checkpoint; the training code
# below reads `.loss` and `.logits` from the object the model returns.
model = BertForMaskedLM.from_pretrained(beto, return_dict=True)



In [7]:
# Sanity check: both frames should have the same number of rows after balancing.
len(hombres),len(mujeres)

(37962, 37962)

In [8]:
# Display the tokenizer configuration (vocabulary size, max length, special tokens).
tokenizer

PreTrainedTokenizer(name_or_path='dccuchile/bert-base-spanish-wwm-cased', vocab_size=31002, model_max_len=512, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'})

In [9]:
def preprocess_function(examples):
    """Tokenize paired tweets into one padded batch.

    Args:
        examples: Mapping with the keys ``"mujeres_tweet"`` and
            ``"hombres_tweet"``, each an iterable of texts.

    Returns:
        Whatever the notebook-level ``tokenizer`` returns when called with the
        ``mujeres_tweet`` texts as the first segment and the ``hombres_tweet``
        texts as ``text_pair``, padded/truncated to 280 tokens and requested as
        PyTorch tensors.
    """
    first_segment = list(examples["mujeres_tweet"])
    second_segment = list(examples["hombres_tweet"])

    # Every pair is padded (or truncated) to the same fixed length so the
    # result can be returned as rectangular tensors.
    tokenizer_options = {
        "text_pair": second_segment,
        "padding": "max_length",
        "truncation": True,
        "max_length": 280,
        "return_tensors": 'pt',
    }
    return tokenizer(first_segment, **tokenizer_options)

In [10]:
def tokenize_sentences(hombres, mujeres):
    """Pair the two corpora row by row, split them and tokenize both splits.

    Args:
        hombres: Frame with a ``tweet`` column.
        mujeres: Frame with a ``tweet`` column and the same number of rows as
            ``hombres`` (rows are paired by position).

    Returns:
        ``(train_set, test_set)``: the encodings produced by
        ``preprocess_function`` for an 80/20 shuffled split (seed 200), each
        extended with a ``'labels'`` entry.

    The labels are a copy of ``input_ids`` in which every padding position
    (``attention_mask == 0``) is replaced by -100, the value the masked-LM loss
    ignores, so padding no longer contributes to the loss.

    NOTE(review): no input token is masked here, so the labels equal the inputs
    on every real token — confirm that this is the intended training objective.
    """
    data = {"mujeres_tweet": list(mujeres.tweet),
            "hombres_tweet": list(hombres.tweet)}
    raw_dataset = pd.DataFrame(data)

    # Keep the positional index as an explicit `id` column before shuffling.
    ds = Dataset.from_pandas(raw_dataset, preserve_index=True)
    ds = ds.rename_column('__index_level_0__', 'id')
    ds = ds.train_test_split(shuffle=True, seed=200, test_size=0.2)

    train_set = preprocess_function(ds["train"])
    test_set = preprocess_function(ds["test"])

    for encoded in (train_set, test_set):
        labels = encoded['input_ids'].detach().clone()
        # Padding carries no information; exclude it from the loss.
        labels[encoded['attention_mask'] == 0] = -100
        encoded['labels'] = labels

    return train_set, test_set

In [11]:
trainingSet, testSet= tokenize_sentences(hombres, mujeres)

In [12]:
#random_tensor = torch.rand(trainingSet['input_ids'].shape)
#masked_tensor = (random_tensor < 0.15)*(trainingSet['input_ids'] != 101)*(trainingSet['input_ids'] != 102)*(trainingSet['input_ids'] != 0)

In [13]:
# Collect, for each row, the indices whose mask flag is True (i.e. the positions selected for masking).
#nonzeros_indices = []
#for i in range(len(masked_tensor)):
    #nonzeros_indices.append(torch.flatten(masked_tensor[i].nonzero()).tolist())

In [14]:
# Set the tokens at those indices to the MASK token (103) in every row of the original input_ids.
#for i in range(len(trainingSet['input_ids'])):
#    trainingSet['input_ids'][i, nonzeros_indices[i]] = 103

In [15]:
# Mask a single token: position 7 of the first training example.
# `input_ids` is the tensor stored inside `trainingSet`, so the assignment
# below edits the training encodings in place.
input_ids = trainingSet["input_ids"]
input_ids[0, 7] = tokenizer.mask_token_id

# Copy of the (now masked) ids where every entry that is not the mask token is -100.
# NOTE(review): this `labels` tensor is only bound to a notebook-level name; it is
# not written back to `trainingSet["labels"]` — confirm whether that is intended.
labels = input_ids.clone()
labels.masked_fill_(labels != tokenizer.mask_token_id, -100)

In [16]:
class TweetsDataset(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of parallel, indexable encodings.

    ``encodings`` must provide the keys ``'input_ids'``, ``'labels'``,
    ``'attention_mask'`` and ``'token_type_ids'``; item ``i`` of the dataset is
    a dict holding element ``i`` of each of them.
    """

    # Keys copied into every item, in the order they appear in the result.
    _FIELDS = ('input_ids', 'labels', 'attention_mask', 'token_type_ids')

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from `input_ids`.
        return len(self.encodings['input_ids'])

    def __getitem__(self, index):
        return {field: self.encodings[field][index] for field in self._FIELDS}

In [17]:
# Wrap the training encodings so a DataLoader can index them example by example.
dataset = TweetsDataset(trainingSet)

In [18]:
# Mask a single token: position 7 of the first test example.
# `input_ids` is the tensor stored inside `testSet`, so the assignment below
# edits the test encodings in place.
input_ids = testSet["input_ids"]
input_ids[0, 7] = tokenizer.mask_token_id

# Copy of the (now masked) ids where every entry that is not the mask token is -100.
# NOTE(review): this `labels` tensor is only bound to a notebook-level name; it is
# not written back to `testSet["labels"]` — confirm whether that is intended.
labels = input_ids.clone()
labels.masked_fill_(labels != tokenizer.mask_token_id, -100)

In [19]:
class TweetsDataset2(torch.utils.data.Dataset):
    """Map-style dataset over a mapping of parallel, indexable encodings.

    Behaves exactly like ``TweetsDataset`` defined earlier in this notebook:
    item ``i`` is a dict with element ``i`` of ``'input_ids'``, ``'labels'``,
    ``'attention_mask'`` and ``'token_type_ids'``.
    """

    def __init__(self, encodings):
        self.encodings = encodings

    def __len__(self):
        # The number of examples is taken from `input_ids`.
        return len(self.encodings['input_ids'])

    def __getitem__(self, index):
        row = {}
        for key in ('input_ids', 'labels', 'attention_mask', 'token_type_ids'):
            row[key] = self.encodings[key][index]
        return row

In [20]:
# Wrap the test encodings so a DataLoader can index them example by example.
test_dataset = TweetsDataset2(testSet)

In [21]:
def get_dataloader(dataset, batch_size=32, shuffle=True, num_workers=5):
    """Build the DataLoader used for training.

    Args:
        dataset: Map-style dataset to draw batches from.
        batch_size: Examples per batch (default 32, the value previously hard-coded).
        shuffle: Whether to reshuffle the data every epoch (default True).
        num_workers: Number of loader worker processes (default 5).

    Returns:
        A ``torch.utils.data.DataLoader`` over ``dataset``.
    """
    return torch.utils.data.DataLoader(
        dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

def get_Testdataloader(test_dataset, batch_size=16, shuffle=True, num_workers=5):
    """Build the DataLoader used for the test phase.

    Args:
        test_dataset: Map-style dataset to draw batches from.
        batch_size: Examples per batch (default 16, the value previously hard-coded).
        shuffle: Whether to reshuffle the data every epoch. Defaults to True to
            keep the previous behaviour; pass False for a fixed evaluation order.
        num_workers: Number of loader worker processes (default 5).

    Returns:
        A ``torch.utils.data.DataLoader`` over ``test_dataset``.
    """
    return torch.utils.data.DataLoader(
        test_dataset,
        batch_size=batch_size,
        shuffle=shuffle,
        num_workers=num_workers,
    )

In [22]:
# Batched iterators over the training and test datasets built above.
Data_loader_train = get_dataloader(dataset)
Data_loader_test = get_Testdataloader(test_dataset)

In [23]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
device

device(type='cuda')

In [24]:
# Move the model to the selected device; the batches are moved to the same device in the loops below.
model.to(device)

BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(31002, 768, padding_idx=1)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=Tr

In [25]:
from tqdm.auto import tqdm
from transformers import AdamW
from transformers import DataCollatorForLanguageModeling

In [26]:
def corr(x, y):
    """Return the Pearson correlation coefficient between ``x`` and ``y``."""
    matrix = np.corrcoef(x, y)
    # Off-diagonal entry of the 2x2 correlation matrix.
    return matrix[0, 1]


def corr_d(eval_pred):
    """Return ``{'pearson': r}`` where ``eval_pred`` unpacks into the two arguments of ``corr``."""
    return dict(pearson=corr(*eval_pred))

In [27]:
from transformers import TrainingArguments, Trainer

In [28]:
# Number of passes over the data made by the epoch loop below.
epochs = 2
# AdamW over all model parameters, with a learning rate of 3e-5.
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5)

In [29]:
# Run a garbage-collection pass before training starts.
gc.collect()

0

In [30]:
# Ask PyTorch to release cached, currently unused GPU memory before training starts.
torch.cuda.empty_cache()

In [31]:
def compute_metrics(pred, target=None):
    """Return a correlation coefficient as a Python float.

    Args:
        pred: Tensor of predictions. It is detached first, so computing the
            metric never extends the autograd graph of the caller.
        target: Optional tensor with the same number of elements as ``pred``.

    Returns:
        * ``target is None`` (previous behaviour): ``torch.corrcoef(pred).item()``.
          For a 1-D ``pred`` this is the correlation of a single variable with
          itself, so it carries no information about prediction quality.
        * otherwise: the Pearson correlation between the flattened ``pred`` and
          the flattened ``target``.
    """
    pred = pred.detach()
    if target is None:
        return torch.corrcoef(pred).item()

    # Stack both series as the two rows (variables) of a 2 x N matrix; the
    # off-diagonal entry of the resulting 2 x 2 matrix is their correlation.
    stacked = torch.stack([
        pred.reshape(-1).float(),
        target.detach().reshape(-1).float(),
    ])
    return torch.corrcoef(stacked)[0, 1].item()

In [32]:
def training(Data_loader_train):
    """Run one optimisation pass over ``Data_loader_train``.

    Uses the notebook-level ``model``, ``optimizer``, ``device`` and
    ``compute_metrics``, and reads the notebook-level ``epoch`` for the
    progress-bar label, so it must be called from inside the epoch loop.

    Args:
        Data_loader_train: Iterable of batches; each batch is a mapping with
            ``'input_ids'``, ``'labels'``, ``'attention_mask'`` and
            ``'token_type_ids'`` tensors.

    Returns:
        The model output of the last batch, or ``None`` when the loader yields
        no batches.
    """
    model.train()

    # Defined up front so an empty loader returns None instead of raising.
    outputs = None
    loop = tqdm(Data_loader_train)
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        outputs = model(input_ids=input_ids,
                        labels=labels,
                        attention_mask=attention_mask,
                        token_type_ids=token_type_ids)
        loss = outputs.loss
        # The metric is for display only: detach the logits so it is computed
        # outside the autograd graph.
        corr = compute_metrics(outputs.logits.detach().view(-1))
        loss.backward()
        optimizer.step()

        loop.set_description("Epoch: {}".format(epoch))
        loop.set_postfix(loss=loss.item(), corr=corr)
    return outputs

def test(Data_loader_test):
    model.eval()
    loop = tqdm(Data_loader_test)
    
    for batch in loop:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        labels = batch['labels'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        token_type_ids = batch["token_type_ids"].to(device)
        outputs = model(input_ids=input_ids,
                        labels=labels,
                        attention_mask = attention_mask,
                        token_type_ids = token_type_ids)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        x = outputs.logits
        corr = compute_metrics(x.view(-1))

        loop.set_description("Epoch: {}".format(epoch))
        loop.set_postfix(loss=loss.item(), corr=corr)
    return outputs

In [33]:
# Each epoch runs one pass over the training loader followed by one pass over the test loader.
# `epoch` is a notebook-level name that training() and test() read for their progress-bar labels.
for epoch in range(epochs):
    # Free Python garbage and cached GPU memory before the training pass.
    gc.collect()
    torch.cuda.empty_cache()
    
    print("Training BETO")
    training(Data_loader_train)
    
    # Same clean-up again before the test pass.
    print("Collect gc")
    gc.collect()
    print("Empty cache")
    torch.cuda.empty_cache()
    print("Test phase")
    
    test(Data_loader_test)

Training BETO


  0%|          | 0/950 [00:00<?, ?it/s]

Collect gc
Empty cache
Test phase


  0%|          | 0/475 [00:00<?, ?it/s]

Training BETO


  0%|          | 0/950 [00:00<?, ?it/s]

Collect gc
Empty cache
Test phase


  0%|          | 0/475 [00:00<?, ?it/s]

In [34]:
# Save the fine-tuned model to ./my_model_checkpoints/.
model.save_pretrained('./my_model_checkpoints/')

In [35]:
# Save the tokenizer files next to the model so the checkpoint directory is self-contained.
tokenizer.save_pretrained('./my_model_checkpoints/')

('./my_model_checkpoints/tokenizer_config.json',
 './my_model_checkpoints/special_tokens_map.json',
 './my_model_checkpoints/vocab.txt',
 './my_model_checkpoints/added_tokens.json')

In [52]:
# Peek at the first tweet of the `mujeres` frame.
mujeres.tweet.to_numpy()[0]

'@MarinadelPilar ¿estrategia?, puro hacer como que trabajan, ya parece usted la "Sheimbaum Cachanilla".'

In [59]:
# Every tweet of the `mujeres` frame as a plain Python list.
lista = list(mujeres.tweet.to_numpy())

# Show only a short preview; printing the whole corpus floods the notebook output.
lista[:10]

['@MarinadelPilar ¿estrategia?, puro hacer como que trabajan, ya parece usted la "Sheimbaum Cachanilla".',
 '@MarinadelPilar  https://t.co/6mKV4rw4Wh',
 '@MarinadelPilar Y lo que venga',
 '@MarinadelPilar Y en delincuencia/crimen organizado, también … a nivel mundial!!!',
 '@MarinadelPilar “No les vamos a fallar” dijo la ‘gobernadora’ de una de las entidades más inseguras del mundo 🤡 aplica la ley, inepta',
 '@MarinadelPilar dándole prioridad a un parque mientras la inseguridad de la baja va en aumento, al putazo tus prioridades, para de mamar y ponte a trabajar, deja de pasearte a lo pndjo !!!',
 '@MarinadelPilar Prioridad deberías darle a la seguridad ciudadana para que podamos salir a las calles sin temor alguno, deja tus parques para después no mms!!',
 '@MarinadelPilar @M_OlgaSCordero ¡Feliz Cumpleaños! Para la ex ministra y senadora @M_OlgaSCordero  🎉🎊 ¡Que tengas un excelente día!',
 '@MarinadelPilar @ManuelBartlett @CFEmx ¿Alcanzarán a concretar estos proyectos  ahora que detuv

In [60]:
def encode(df):
    """Encode every tweet of ``df`` with the tokenizer and L2-normalise the result.

    Args:
        df: Frame with a ``tweet`` column.

    Returns:
        A list with one array per tweet: the output of ``tokenizer.encode``
        divided by its own Euclidean norm.

    NOTE(review): the values come from the tokenizer alone — the fine-tuned
    model is not used here — so these look like normalised token ids rather
    than model embeddings. Confirm that this is what downstream code expects.
    """
    sentences = df.tweet.to_numpy()
    print("Encoding the corpus. This might take a while")

    def _normalise(sentence):
        # Scale the encoded sentence to unit Euclidean length.
        token_ids = tokenizer.encode(sentence)
        return token_ids / np.linalg.norm(token_ids, keepdims=True)

    return [_normalise(sentence) for sentence in sentences]

In [61]:
# Encode every tweet of the `mujeres` frame.
mujeres_embeddings = encode(mujeres)

Encoding the corpus. This might take a while


In [65]:
# Encode every tweet of the `hombres` frame.
hombres_embeddings = encode(hombres)

Encoding the corpus. This might take a while


In [68]:
# Build a frame with one row per group, then transpose it so each group becomes
# a column (0 = mujeres, 1 = hombres) and each cell holds the encoding of one tweet.
datos_encoded = pd.DataFrame([mujeres_embeddings,hombres_embeddings]).T
datos_encoded

Unnamed: 0,0,1
0,"[9.200662022541436e-05, 0.022265602094550276, ...","[7.827947739841682e-05, 0.01894363353041687, 0..."
1,"[4.781730206567746e-05, 0.011571787099893946, ...","[2.8872282299460586e-05, 0.006987092316469462,..."
2,"[0.00011344798253902856, 0.027454411774444912,...","[2.949848419911238e-05, 0.007138633176185196, ..."
3,"[0.00010210152798898534, 0.02470856977333445, ...","[7.657220605912408e-05, 0.018530473866308027, ..."
4,"[6.582413644159152e-05, 0.015929441018865148, ...","[6.370191282126117e-05, 0.015415862902745204, ..."
...,...,...
37957,"[3.7389289487638875e-05, 0.009048208056008608,...","[4.8649710248037387e-05, 0.011773229880025047,..."
37958,"[4.1000463572505524e-05, 0.009922112184546337,...","[7.455806198140986e-05, 0.018043050999501188, ..."
37959,"[3.6477210693255496e-05, 0.008827484987767831,...","[6.878546554227372e-05, 0.016646082661230237, ..."
37960,"[2.8663598269438686e-05, 0.006936590781204162,...","[6.851304842519479e-05, 0.01658015771889714, 0..."


In [73]:
# Give the two positional columns descriptive names.
datos_encoded.columns = ["Mujeres", "Hombres"]

In [75]:
# Persist the encoded tweets to embeddings.csv.
# NOTE(review): each cell holds an array, so the CSV presumably stores its text
# representation — confirm it can be parsed back as needed.
datos_encoded.to_csv("embeddings.csv")