In [None]:
import re
import torch
import sklearn
import datasets
import transformers
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertForSequenceClassification
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from datasets import Dataset, load_dataset
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Seed the NumPy and PyTorch global generators with one shared value so that
# repeated runs of the notebook start from the same random state.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x7812d67e3e70>

In [None]:
# Render full cell contents in DataFrame previews (no column-width truncation).
pd.options.display.max_colwidth = None

In [None]:
%reload_ext watermark
%watermark -a "Pedro Marcello"

Author: Pedro Marcello



In [None]:
# Download the "mattymchen/mr" dataset from the Hugging Face Hub. The download
# log below reports a single generated split, `test`, with 10,662 examples.
ds = load_dataset("mattymchen/mr")

README.md:   0%|          | 0.00/688 [00:00<?, ?B/s]

(…)-00000-of-00001-1ad570418120a677.parquet:   0%|          | 0.00/884k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/10662 [00:00<?, ? examples/s]

In [None]:
# Materialise the `test` split as a pandas DataFrame for inspection and cleaning.
df_inteiro = pd.DataFrame(ds["test"])

In [None]:
# Keep only the first quarter of the rows (presumably to shorten fine-tuning).
# `.copy()` detaches the subset from `df_inteiro`, so the later column
# assignment on `df` writes to an independent frame rather than to a slice of
# the original (the classic SettingWithCopy situation).
df = df_inteiro.iloc[:len(df_inteiro) // 4].copy()

In [None]:
df.shape

(2665, 2)

In [None]:
df.head()

Unnamed: 0,text,label
0,"with its dogged hollywood naturalism and the inexorable passage of its characters toward sainthood , windtalkers is nothing but a sticky-sweet soap .",0
1,". . . has its moments , but ultimately , its curmudgeon does n't quite make the cut of being placed on any list of favorites .",0
2,leigh 's film is full of memorable performances from top to bottom .,1
3,the code talkers deserved better than a hollow tribute .,0
4,i have to admit that i am baffled by jason x .,0


In [None]:
df.label.value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
0,1346
1,1319


In [None]:
# Noise to delete: URLs, hashtags, @mentions, digit runs, HTML-like tags, and
# any character that is neither a word character nor whitespace (this already
# covers every quote, apostrophe, period and comma).
# The previous pattern ended with `|[''\".,]`, which Python parsed as two
# adjacent string literals, so the intended apostrophe class never existed;
# that alternative was also unreachable because `[^\w\s]` is tried first.
_PADRAO_RUIDO = re.compile(r'http\S+|#\w+|@\w+|\d+|<.*?>|[^\w\s]')
_PADRAO_ESPACOS = re.compile(r'\s+')


def limpar_texto(texto):
    """Normalise a raw review string before tokenization.

    Steps, in order: delete URLs, hashtags, @mentions, digits, HTML-like tags
    and punctuation; collapse whitespace runs to a single space; strip the
    ends; lowercase.

    Punctuation is deleted, not replaced by a space, so hyphenated words are
    fused (e.g. "sticky-sweet" becomes "stickysweet").

    Args:
        texto: The input string.

    Returns:
        The cleaned, lowercased string (possibly empty).
    """
    texto = _PADRAO_RUIDO.sub('', texto)
    texto = _PADRAO_ESPACOS.sub(' ', texto)
    return texto.strip().lower()

In [None]:
# Clean every review and write the result back over the raw text column.
texto_limpo = df['text'].apply(limpar_texto)
df['text'] = texto_limpo

In [None]:
df.head()

Unnamed: 0,text,label
0,with its dogged hollywood naturalism and the inexorable passage of its characters toward sainthood windtalkers is nothing but a stickysweet soap,0
1,has its moments but ultimately its curmudgeon does nt quite make the cut of being placed on any list of favorites,0
2,leigh s film is full of memorable performances from top to bottom,1
3,the code talkers deserved better than a hollow tribute,0
4,i have to admit that i am baffled by jason x,0


In [None]:
# Pretrained uncased BERT with a 2-class sequence-classification head. As the
# load message below states, the classifier weights are newly initialised and
# only become useful after fine-tuning.
modelo = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels = 2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Tokenizer loaded from the same checkpoint name as the model.
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

In [None]:
def tokenizacao(examples):
    """Tokenize a batch of examples for the BERT model.

    Args:
        examples: Batch mapping supplied by `Dataset.map(..., batched=True)`;
            only its "text" entry is read.

    Returns:
        The tokenizer output for the batch, truncated to the tokenizer's
        maximum length and left unpadded.
    """
    # Deliberately no padding here. `padding="max_length"` stretched every
    # short review to the tokenizer's maximum length, so most of the compute
    # was spent on pad tokens. The `DataCollatorWithPadding` passed to the
    # Trainer pads each batch to its own longest sequence instead.
    return tokenizer(examples["text"], truncation=True)

In [None]:
# Convert only the columns the model pipeline needs into a Hugging Face Dataset.
colunas_modelo = ['text', 'label']
dataset = Dataset.from_pandas(df[colunas_modelo])

In [None]:
# Tokenize in batches of 64 examples, spread over 2 worker processes.
tokenized_dataset = dataset.map(
    tokenizacao,
    batched=True,
    batch_size=64,
    num_proc=2,
)

Map (num_proc=2):   0%|          | 0/2665 [00:00<?, ? examples/s]

In [None]:
# Fraction of the examples used for training; the complement is held out for evaluation.
train_test_split_ratio = 0.8

In [None]:
# Hold out the complement of the training ratio for evaluation. The explicit
# seed pins the shuffle, so re-running this cell reproduces the same partition
# instead of depending on whatever random state happens to be current.
split = tokenized_dataset.train_test_split(test_size = 1 - train_test_split_ratio, seed = 42)

In [None]:
# Unpack the two halves of the split into their own names.
dataset_treino, dataset_teste = split['train'], split['test']

In [None]:
def calcula_metricas(p):
    """Compute evaluation metrics from a prediction object.

    Args:
        p: Object exposing `predictions` (per-class scores; the argmax over
            axis 1 is taken as the predicted label) and `label_ids` (the
            reference labels).

    Returns:
        dict with 'accuracy', 'f1', 'precision' and 'recall'; the last three
        are weighted averages over the classes.
    """
    y_true = p.label_ids
    y_pred = np.argmax(p.predictions, axis=1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        y_true, y_pred, average='weighted'
    )
    return {
        'accuracy': accuracy_score(y_true, y_pred),
        'f1': f1,
        'precision': precision,
        'recall': recall,
    }

In [None]:
# Fine-tuning configuration. Gradients are accumulated over 6 mini-batches of
# 36 examples per device (36 * 6 = 216 examples per optimizer step), and
# evaluation runs at the end of every epoch.
training_args = TrainingArguments(
    output_dir='./resultados',
    eval_strategy='epoch',
    learning_rate=2e-5,
    per_device_train_batch_size=36,
    per_device_eval_batch_size=36,
    num_train_epochs=4,
    weight_decay=0.01,
    gradient_accumulation_steps=6,
    logging_steps=2,
)

In [None]:
# Collator that pads the examples of each batch to a common length when batches are assembled.
data_collator = DataCollatorWithPadding(tokenizer = tokenizer)

In [None]:
# Wire model, training configuration, datasets, padding collator and metric
# function into the Trainer.
# `processing_class` is the current name of the argument formerly called
# `tokenizer`; the old keyword is deprecated in the transformers release this
# notebook reports (4.48.2) and only kept as a temporary alias.
treina_modelo = Trainer(
    model=modelo,
    args=training_args,
    train_dataset=dataset_treino,
    eval_dataset=dataset_teste,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=calcula_metricas,
)

In [None]:
# Run a garbage-collection pass and release PyTorch's cached, unused CUDA
# memory before training starts.
gc.collect()
torch.cuda.empty_cache()

In [None]:
treina_modelo.train()

[34m[1mwandb[0m: Currently logged in as: [33mpedromarcello778[0m ([33mpedromarcello778-[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


Epoch,Training Loss,Validation Loss,Accuracy,F1,Precision,Recall
1,0.6448,0.608621,0.731707,0.731333,0.736604,0.731707
2,0.5283,0.50881,0.789869,0.789724,0.793956,0.789869
3,0.4489,0.455906,0.80863,0.807801,0.810874,0.80863
4,0.3791,0.437482,0.825516,0.825576,0.825921,0.825516


TrainOutput(global_step=40, training_loss=0.5241212487220764, metrics={'train_runtime': 818.1424, 'train_samples_per_second': 10.424, 'train_steps_per_second': 0.049, 'total_flos': 2243811080110080.0, 'train_loss': 0.5241212487220764, 'epoch': 4.0})

In [None]:
# Evaluate once more on the Trainer's eval dataset (`dataset_teste`) and print
# the resulting metrics dict.
eval_result = treina_modelo.evaluate()
print(eval_result)

{'eval_loss': 0.43748247623443604, 'eval_accuracy': 0.8255159474671669, 'eval_f1': 0.8255761794459344, 'eval_precision': 0.825921170101825, 'eval_recall': 0.8255159474671669, 'eval_runtime': 16.397, 'eval_samples_per_second': 32.506, 'eval_steps_per_second': 0.915, 'epoch': 4.0}


In [None]:
# Save model and tokenizer files into the same directory.
diretorio_saida = './modelo_final'
modelo.save_pretrained(diretorio_saida)
tokenizer.save_pretrained(diretorio_saida)

('./modelo_final/tokenizer_config.json',
 './modelo_final/special_tokens_map.json',
 './modelo_final/vocab.txt',
 './modelo_final/added_tokens.json')

In [None]:
# Colab-only step: mount Google Drive at /content/drive.
# NOTE(review): no visible cell copies ./modelo_final to the mounted drive —
# confirm whether a copy step is missing.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
%watermark -a "Pedro Marcello"

Author: Pedro Marcello



In [None]:
%watermark -v -m

Python implementation: CPython
Python version       : 3.11.11
IPython version      : 7.34.0

Compiler    : GCC 11.4.0
OS          : Linux
Release     : 6.1.85+
Machine     : x86_64
Processor   : x86_64
CPU cores   : 2
Architecture: 64bit



In [None]:
%watermark --iversions

numpy       : 1.26.4
pandas      : 2.2.2
transformers: 4.48.2
datasets    : 3.2.0
sklearn     : 1.6.1
re          : 2.2.1
torch       : 2.5.1+cu124

