In [None]:
# Mount Google Drive inside the Colab runtime so the project files are reachable.
from google.colab import drive
drive.mount('/content/drive')
# Move into the project folder: the relative paths used later in the notebook
# ('bases/...', 'modelo/...') are resolved from this working directory.
%cd /content/drive/MyDrive/MAFData/projetosAtivos/2303_posdoc_fgv/jobs/a30 - ML e Bens de Candidatos

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
/content/drive/MyDrive/MAFData/projetosAtivos/2303_posdoc_fgv/jobs/a30 - ML e Bens de Candidatos


In [None]:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from torch.amp import autocast, GradScaler
import torch.optim as optim
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup #hugging face
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from tqdm import tqdm
from datetime import datetime
import random

In [None]:
# 1. Custom Dataset used for training/validation
class TextDataset(Dataset):
    """Map-style dataset that tokenizes the ``ds_bem_candidato_2`` column on access.

    Args:
        dataframe: DataFrame with a ``ds_bem_candidato_2`` text column and,
            when ``is_train`` is True, a ``y`` label column.
        tokenizer: Hugging Face style tokenizer; it is called with
            ``return_tensors='pt'`` and must return ``input_ids`` and
            ``attention_mask`` with a leading batch axis of size 1.
        max_len: Every text is truncated/padded to exactly this many tokens.
        is_train: When True each item also carries a ``labels`` tensor.
    """

    def __init__(self, dataframe, tokenizer, max_len=128, is_train=True):
        # Reset the index so the positional ``index`` handed over by the
        # DataLoader can be used directly as a ``.loc`` label.
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_train = is_train

    def __len__(self):
        """Return the number of rows (examples) in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return the tokenized example at ``index``.

        Returns:
            dict with ``input_ids`` and ``attention_mask`` (1-D tensors of
            length ``max_len``) plus ``labels`` (scalar ``torch.long``
            tensor) when ``is_train`` is True.
        """
        text = self.dataframe.loc[index, 'ds_bem_candidato_2']
        # Call the tokenizer directly: ``encode_plus`` is deprecated in favour
        # of ``__call__`` and takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # drop the sequence axis when max_len == 1 and break batching.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }
        if self.is_train:
            label = self.dataframe.loc[index, 'y']
            item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Dataset for inference/application (no labels)
class AppDataset(Dataset):
    """Map-style dataset that tokenizes ``ds_bem_candidato_2`` for unlabeled data.

    Args:
        dataframe: DataFrame with a ``ds_bem_candidato_2`` text column.
        tokenizer: Hugging Face style tokenizer; it is called with
            ``return_tensors='pt'`` and must return ``input_ids`` and
            ``attention_mask`` with a leading batch axis of size 1.
        max_len: Every text is truncated/padded to exactly this many tokens.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        # Reset the index so the positional ``index`` handed over by the
        # DataLoader can be used directly as a ``.loc`` label.
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows (examples) in the dataset."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Return ``input_ids`` and ``attention_mask`` (1-D, length ``max_len``)."""
        text = self.dataframe.loc[index, 'ds_bem_candidato_2']
        # Call the tokenizer directly: ``encode_plus`` is deprecated in favour
        # of ``__call__`` and takes the same keyword arguments.
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        # squeeze(0) removes only the batch axis. A bare squeeze() would also
        # drop the sequence axis when max_len == 1 and break batching.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

In [None]:
# 2. Load the labelled data and draw a reproducible working sample
file_parquet = 'bases/bd01_benscand_treino.parquet'
df = pd.read_parquet(file_parquet)
# Cap the sample size at the number of available rows: DataFrame.sample raises
# ValueError when n exceeds the population (sampling is without replacement).
df = df.sample(n=min(10000, len(df)), random_state=42)
# Labels are cast to int so they can later be turned into torch.long tensors.
df['y'] = df['y'].astype(int)



In [None]:
# Split the data into 75% train / 5% validation / 20% test, stratified on 'y'

# Step 1: hold out 25% of the rows; the remaining 75% is the training set.
train_df, temp_df = train_test_split(
    df,
    stratify=df['y'],
    test_size=0.25,
    random_state=42,
)

# Step 2: split the held-out quarter 20/80, which yields 5% of the full
# sample for validation and 20% for test.
val_df, test_df = train_test_split(
    temp_df,
    stratify=temp_df['y'],
    test_size=0.8,
    random_state=42,
)

In [None]:
# 3. Load the tokenizer and the Portuguese BERT model
# The checkpoint is wrapped with a sequence-classification head sized for
# five classes (num_labels=5); that head is not part of the pretrained
# checkpoint and is newly initialized, so it must be fine-tuned before use.
model_name = "neuralmind/bert-large-portuguese-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=5)
# Alternative: start from the checkpoint written by the training loop instead.
#model = AutoModelForSequenceClassification.from_pretrained("modelo/colab/modelo_bert_treinado/")

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# 4. Select the device (CUDA GPU when available, otherwise CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Move the model parameters to the selected device.
model.to(device)
# Bare expression: shows the chosen device as the cell output.
device

device(type='cuda')

In [None]:
# 5. Build the datasets and data loaders
max_len = 128  # number of tokens every text is truncated/padded to
batch_size = 50

# Datasets built from the dataframes (all three carry labels)
train_dataset = TextDataset(train_df, tokenizer, max_len=max_len, is_train=True)
val_dataset   = TextDataset(val_df, tokenizer, max_len=max_len, is_train=True)
test_dataset  = TextDataset(test_df, tokenizer, max_len=max_len, is_train=True)

# Training batches are reshuffled every epoch; without shuffle=True the model
# would see the exact same batch composition and order in every epoch.
# The seeded generator keeps the shuffle reproducible across runs.
train_loader = DataLoader(
    train_dataset,
    batch_size=batch_size,
    shuffle=True,
    generator=torch.Generator().manual_seed(42),
    num_workers=0,
)
# Validation and test keep a fixed order so metrics are comparable.
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=0)
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=0)


In [None]:
# 6. Optimizer and learning-rate schedule
epochs = 50
# One scheduler step is taken per training batch, so the schedule length is
# (batches per epoch) x (maximum number of epochs).
total_steps = len(train_loader) * epochs
optimizer = optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)
# Linear warm-up over the first 10% of the steps, then linear decay.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=int(0.1 * total_steps),
    num_training_steps=total_steps,
)

In [None]:
print(datetime.now())

# Early-stopping state: training stops after `patience` consecutive epochs
# without an improvement in the validation loss.
patience = 3
best_val_loss = float('inf')
patience_counter = 0
checkpoint_dir = "modelo/colab/modelo_bert_treinado"
checkpoint_saved = False

# Mixed precision is only enabled on CUDA. On CPU both autocast and the
# GradScaler are switched off instead of being hard-wired to "cuda".
use_amp = device.type == "cuda"
scaler = GradScaler(device.type, enabled=use_amp)

# 7. Training loop
for epoch in range(epochs):

    # ---- training phase ----
    model.train()
    train_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Forward pass in reduced precision where possible
        with autocast(device.type, enabled=use_amp):
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss

        # Scale the loss, backpropagate and update the parameters
        scaler.scale(loss).backward()
        scale_before = scaler.get_scale()
        scaler.step(optimizer)
        scaler.update()
        # GradScaler skips optimizer.step() when it finds inf/NaN gradients and
        # then lowers the scale. Only advance the LR schedule when the
        # optimizer actually stepped.
        if scaler.get_scale() >= scale_before:
            scheduler.step()

        batch_loss = loss.item()
        train_loss += batch_loss
        progress_bar.set_postfix(loss=f"{batch_loss:.4f}")

    avg_train_loss = train_loss / len(train_loader)
    print(f"\nEpoch {epoch+1} - Treino Loss: {avg_train_loss:.4f}")

    # ---- validation phase ----
    model.eval()
    val_loss = 0.0
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    acc = accuracy_score(true_labels, preds)
    # zero_division=0 keeps the same numeric result as the default but silences
    # the warning raised when a class is never predicted (common early on).
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, preds, average='weighted', zero_division=0
    )
    print(f"Epoch {epoch+1} - Validação Loss: {avg_val_loss:.4f} | Acurácia: {acc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}\n")

    # Early stopping: check whether the validation loss improved
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        model.save_pretrained(checkpoint_dir)
        checkpoint_saved = True
        print(f"Melhora na loss: {best_val_loss}")
    else:
        patience_counter += 1
        print(f"Sem melhora na loss de validação por {patience_counter} epoch(s).")

    if patience_counter >= patience:
        print(f"Early stopping: Não houve melhora na loss de validação por {patience} epochs consecutivas.")
        break

# When the last epoch was not the best one, the in-memory model holds weights
# that are worse than the saved checkpoint. Copy the best weights back so that
# later evaluation uses the best model. load_state_dict keeps the same `model`
# object, so the device placement is preserved.
if checkpoint_saved and patience_counter > 0:
    best_state = AutoModelForSequenceClassification.from_pretrained(checkpoint_dir).state_dict()
    model.load_state_dict(best_state)

print(datetime.now())

2025-03-08 01:04:59.785957


Epoch 1/50:   7%|████                                                    | 11/150 [00:57<13:05,  5.65s/it, loss=1.6677]