In [None]:
from transformers import AutoModelForMaskedLM

# Checkpoint name passed to from_pretrained; reused below for the tokenizer
# and to derive the output directory name.
model_checkpoint = "neuralmind/bert-base-portuguese-cased"
# Load the checkpoint through the masked-language-modeling auto class.
model = AutoModelForMaskedLM.from_pretrained(model_checkpoint)

In [None]:
from transformers import AutoTokenizer

# Load the tokenizer from the same checkpoint name as the model.
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint)

In [None]:
# Location of the tweets CSV that load_dataset reads below.
# NOTE(review): absolute, user-specific path — adjust (or make configurable)
# before running this notebook on another machine.
data_path="/home/allan_m_ufms_br/clean_tweets.csv"

In [None]:
from datasets import load_dataset

# Read the CSV at data_path with the "csv" loader; later cells access the
# rows through the "train" split of the returned object.
tweet_dataset = load_dataset("csv", data_files=data_path)

In [None]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import RSLPStemmer

# Fetch the NLTK resources that clean_tweet relies on: the stop-word lists,
# the Punkt tokenizer data (used by nltk.word_tokenize) and the RSLP stemmer rules.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('rslp')

# Emoji / pictograph character class, compiled once at definition time
# instead of on every clean_tweet call.
_EMOJI_PATTERN = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)

# Holds a single (stop_words, stemmer) tuple once it has been built.
_TWEET_RESOURCES = []


def _tweet_resources():
    """Return the (stop_words, stemmer) pair, building it on first use only."""
    if not _TWEET_RESOURCES:
        # The tuple is fully built before being stored, so a failure while
        # loading either resource never leaves a half-initialised cache.
        _TWEET_RESOURCES.append(
            (frozenset(stopwords.words('portuguese')), RSLPStemmer())
        )
    return _TWEET_RESOURCES[0]


def clean_tweet(tweet):
    """Normalise one tweet and return it as a space-joined string of stems.

    Steps, in order: strip ``http...`` links, strip every character that is
    neither a word character nor whitespace, lower-case, strip characters
    matched by the emoji pattern, tokenize with ``nltk.word_tokenize``, drop
    Portuguese stop words, and stem the remaining tokens with the RSLP stemmer.

    The stop-word set and the stemmer are created on the first call and then
    reused, so calling this once per row of a large dataset does not reload
    the NLTK resources every time.

    Args:
        tweet: The tweet text (``str``).

    Returns:
        The cleaned text; an empty string when no token survives.
    """
    # remove links
    tweet = re.sub(r'http(\S)+', '', tweet)
    # remove punctuation
    tweet = re.sub(r'[^\w\s]', '', tweet)
    # convert to lower case
    tweet = tweet.lower()
    tweet = _EMOJI_PATTERN.sub(r'', tweet)
    stop_words, stemmer = _tweet_resources()
    words = nltk.word_tokenize(tweet)
    # drop Portuguese stop words, stem what is left, and join the words again
    return ' '.join(stemmer.stem(word) for word in words if word not in stop_words)

In [None]:
def clean_dataset(dataset):
    """Clean the 'Text' field of an example (or batch) with clean_tweet.

    Works for a single example, where ``dataset['Text']`` is one value, and
    for a batch, where it is a list/tuple of values; each value is cleaned
    on its own instead of the whole list being stringified.

    Missing texts (``None``) become an empty string; passing them through
    ``str`` would feed the literal word "None" into the cleaning pipeline.

    Args:
        dataset: Mapping with a 'Text' key. It is updated in place.

    Returns:
        The same mapping, with 'Text' replaced by the cleaned text(s).
    """
    def _clean(value):
        return '' if value is None else clean_tweet(str(value))

    text = dataset['Text']
    if isinstance(text, (list, tuple)):
        dataset['Text'] = [_clean(value) for value in text]
    else:
        dataset['Text'] = _clean(text)
    return dataset

In [None]:
def tokenize_function(examples):
    """Tokenize the "Text" column of a batch with the notebook's tokenizer.

    When the tokenizer reports itself as fast, a "word_ids" entry is added
    holding the result of ``word_ids(i)`` for every sequence in the batch.

    Args:
        examples: Batch mapping that contains a "Text" entry.

    Returns:
        The tokenizer output, with "word_ids" added for fast tokenizers.
    """
    encoding = tokenizer(examples["Text"])
    if not tokenizer.is_fast:
        return encoding
    n_sequences = len(encoding["input_ids"])
    encoding["word_ids"] = [encoding.word_ids(idx) for idx in range(n_sequences)]
    return encoding

In [None]:
# Tokenize in batches and drop the listed CSV columns, leaving only what
# tokenize_function returns.
# NOTE(review): this maps the raw tweet_dataset; clean_dataset / clean_tweet are
# not applied in any cell shown here — confirm that is intended.
tokenized_datasets = tweet_dataset.map(
    tokenize_function, batched=True, remove_columns=["Datetime", "Likes", "Retweets", "Text"]
)


In [None]:
# Length of each fixed-size chunk produced by group_texts.
chunk_size = 128

In [None]:
def group_texts(examples):
    """Concatenate a batch column-wise and cut it into chunk_size pieces.

    Every column of ``examples`` (a mapping of column name -> list of
    sequences) is flattened into one long list and then sliced into
    consecutive chunks of ``chunk_size`` items. Items left over after the
    last full chunk are dropped. A "labels" column is added as a copy of the
    chunked "input_ids" list.

    Args:
        examples: Batch mapping; must contain an "input_ids" column.

    Returns:
        dict mapping each column name (plus "labels") to its list of chunks.
    """
    # Flatten with a nested comprehension: linear in the number of items,
    # whereas sum(list_of_lists, []) re-copies the accumulator on every step.
    concatenated_examples = {
        k: [item for sequence in examples[k] for item in sequence]
        for k in examples.keys()
    }
    # The length of the first column decides how many full chunks fit.
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    total_length = (total_length // chunk_size) * chunk_size
    result = {
        k: [t[i : i + chunk_size] for i in range(0, total_length, chunk_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [None]:
# Concatenate and re-chunk the tokenized examples, one batch at a time
# (group_texts drops the items left over after the last full chunk of each batch).
lm_datasets = tokenized_datasets.map(group_texts, batched=True)

In [None]:
from transformers import DataCollatorForLanguageModeling

# Collator for masked-language-model batches; mlm_probability is the
# fraction of tokens selected for masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [None]:
# Number of rows to train on, and an evaluation set 10% of that size.
train_size = 161_217
test_size = int(0.1 * train_size)

# Draw both subsets from the "train" split, with a fixed seed for a repeatable split.
# NOTE(review): presumably needs at least train_size + test_size rows in
# lm_datasets["train"] — confirm against the actual dataset size.
downsampled_dataset = lm_datasets["train"].train_test_split(
    train_size=train_size, test_size=test_size, seed=42
)
# Bare expression so the notebook displays the resulting splits.
downsampled_dataset

In [None]:
# A second, smaller split of the same "train" split: 80% / 20% of test_size, same seed.
# NOTE(review): downsampled_dataset_test is not used by any later cell shown
# here — confirm whether it is still needed.
downsampled_dataset_test = lm_datasets["train"].train_test_split(
    train_size=int(test_size*0.8), test_size=int(test_size*0.2), seed=42
)
# Bare expression so the notebook displays the resulting splits.
downsampled_dataset_test

In [None]:
# Per-device batch size, used for both training and evaluation in TrainingArguments.
batch_size = 32

In [None]:
from transformers import TrainingArguments

# Show the training loss with every epoch: one epoch is about
# len(train) // batch_size steps, so log at that interval.
# max(1, ...) keeps the interval valid when the training set is smaller than one batch.
logging_steps = max(1, len(downsampled_dataset["train"]) // batch_size)
# Last path component of the checkpoint name; used to build the output directory.
model_name = model_checkpoint.split("/")[-1]

import torch
# Report whether a CUDA device is visible.
# NOTE(review): fp16=True below generally requires a GPU — confirm before running on CPU.
print(torch.cuda.is_available())

training_args = TrainingArguments(
    output_dir=f"{model_name}-finetuned-imdb",
    overwrite_output_dir=True,
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    push_to_hub=False,
    fp16=True,
    # logging_steps expects a number of steps, not a strategy name such as
    # "epoch"; pass the per-epoch step count computed above.
    logging_steps=logging_steps,
)

In [None]:
from transformers import Trainer

# Build the Trainer from the model, the training arguments and the MLM data
# collator; training uses the "train" part of downsampled_dataset and
# evaluation uses its "test" part.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=downsampled_dataset["train"],
    eval_dataset=downsampled_dataset["test"],
    data_collator=data_collator,
    tokenizer=tokenizer,
)

In [None]:
# Use the GPU when one is available, otherwise the CPU, and move the model there.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
model.to(device)

In [None]:
import math

# Baseline before fine-tuning: evaluate on the eval set and report
# perplexity as exp(eval_loss).
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Run fine-tuning with the settings in training_args.
trainer.train()

In [None]:
# Same evaluation after training, to compare against the perplexity
# printed before trainer.train().
eval_results = trainer.evaluate()
print(f">>> Perplexity: {math.exp(eval_results['eval_loss']):.2f}")

In [None]:
# Persist only the model weights (state_dict) to a file in the working directory.
torch.save(model.state_dict(), "./pytorch_tweet_finetuned.pth")