In [1]:
import sys
sys.path.insert(1, '../../libs')
from utils import get_data, temporal_train_test_split

from transformers import AutoTokenizer  # Or BertTokenizer
from transformers import AutoModelForPreTraining  # Or BertForPreTraining for loading pretraining heads
from transformers import AutoModel  # or BertModel, for BERT without pretraining heads
from transformers import BertTokenizer, BertModel
import numpy as np
import pandas as pd
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Both the tokenizer and the encoder come from the same checkpoint, so the
# identifier is written once and reused.
checkpoint_name = 'neuralmind/bert-base-portuguese-cased'

# Cased checkpoint: keep the original casing of the input text.
tokenizer = AutoTokenizer.from_pretrained(checkpoint_name, do_lower_case=False)

# output_hidden_states=True makes the model return the hidden states of every
# layer; get_bert_embeddings reads them from the model output.
model = BertModel.from_pretrained(checkpoint_name, output_hidden_states=True)

Some weights of the model checkpoint at neuralmind/bert-base-portuguese-cased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [3]:
"""
ref: https://towardsdatascience.com/3-types-of-contextualized-word-embeddings-from-bert-using-transfer-learning-81fcefe3fe6d
"""
def bert_text_preparation(text, tokenizer):
    """Turn a raw string into the tensors consumed by ``get_bert_embeddings``.

    The text is wrapped in ``[CLS]`` / ``[SEP]`` markers, split into tokens by
    ``tokenizer.tokenize`` and mapped to ids by
    ``tokenizer.convert_tokens_to_ids``.

    Args:
        text: the string to encode.
        tokenizer: object exposing ``tokenize`` and ``convert_tokens_to_ids``.

    Returns:
        A 3-tuple ``(tokens, ids_tensor, ones_tensor)``:
        the list of tokens (markers included), a tensor holding the token ids
        wrapped in one outer (batch) dimension, and a tensor of the same
        layout filled with ones.
    """
    wrapped = " ".join(("[CLS]", text, "[SEP]"))
    wordpieces = tokenizer.tokenize(wrapped)
    wordpiece_ids = tokenizer.convert_tokens_to_ids(wordpieces)

    # One "1" per token id; wrapped in an extra list to add the batch dimension.
    ones = [1 for _ in wordpiece_ids]

    return wordpieces, torch.tensor([wordpiece_ids]), torch.tensor([ones])

In [4]:
def get_bert_embeddings(tokens_tensor, segments_tensors, model):
    """Return the last-layer hidden state of every token as nested lists.

    Args:
        tokens_tensor: tensor of token ids, forwarded to the model as
            ``input_ids``.
        segments_tensors: the second tensor produced by
            ``bert_text_preparation`` (all ones). It is forwarded as
            ``attention_mask`` -- see the note in the body.
        model: callable accepting ``input_ids`` and ``attention_mask`` keyword
            arguments whose result, at index 2, is the sequence of per-layer
            hidden states (for a Hugging Face ``BertModel`` this requires
            ``output_hidden_states=True``).

    Returns:
        A list with one entry per token; each entry is the list of floats of
        that token's hidden state in the last layer.
    """
    with torch.no_grad():
        # The previous positional call `model(tokens_tensor, segments_tensors)`
        # bound the second tensor to `attention_mask`, because that is the
        # second parameter of Hugging Face `BertModel.forward` (token_type_ids
        # comes third). The keywords make that binding explicit and keep the
        # outputs unchanged: every token is attended to and token_type_ids is
        # left at the model's default.
        outputs = model(input_ids=tokens_tensor, attention_mask=segments_tensors)

    # Index 2 holds the per-layer hidden states; the first entry is skipped
    # and the last remaining one (the final layer) is used.
    hidden_states = outputs[2][1:]
    last_layer = hidden_states[-1]

    # Drop the leading batch dimension (when it has size 1) and convert the
    # whole tensor to nested Python lists in one call.
    return torch.squeeze(last_layer, dim=0).tolist()

In [5]:
# Load the authors table through the project helper.
data = get_data("../../data/authors.csv")

# Keep only rows whose comment is at most 510 characters long (the limit is
# on characters, not tokens).
# NOTE(review): 510 presumably leaves room for the [CLS]/[SEP] markers within
# a 512-position model input -- confirm.
comment_lengths = data["comment"].str.len()
data = data[comment_lengths <= 510]

In [7]:
# Build one output row per comment: the original columns of `data` followed
# by the comment's embedding.
emb_list = []

for _, row in data.iterrows():
    # Look the text up by column name so the loop does not depend on the
    # number or order of the columns in `data`.
    comment = row["comment"]
    _, tokens_tensor, segments_tensors = bert_text_preparation(comment, tokenizer)

    # One vector per token (the [CLS] and [SEP] markers included), averaged
    # over the token axis into a single vector for the whole comment.
    token_embeddings = np.array(get_bert_embeddings(tokens_tensor, segments_tensors, model))
    bert_emb = token_embeddings.mean(axis=0)

    # Original row values first, embedding values after.
    emb_list.append(np.concatenate([row, bert_emb]))

In [8]:
# Column labels for the output table: the original columns of `data` followed
# by one label per embedding dimension (emb_1 ... emb_<hidden size>).
# The width is read from the model configuration instead of being hard-coded
# as 768, so the labels keep matching if a different checkpoint is loaded.
embedding_dim = model.config.hidden_size
columns = np.concatenate([data.columns, [f"emb_{i}" for i in range(1, embedding_dim + 1)]])

In [10]:
# Assemble the collected rows (original columns + embedding values) into a
# single table, labelled with the names built in `columns`.
df_bert = pd.DataFrame(data=emb_list, columns=columns)

In [11]:
# Persist the table next to the input data. With pandas' defaults the row
# index is written as the first, unnamed column of the CSV.
output_csv_path = "../../data/authors_bert.csv"
df_bert.to_csv(output_csv_path)