In [1]:
import random
from datetime import datetime
from pathlib import Path

import numpy as np
import pandas as pd
import torch
import torch.optim as optim
from sklearn.metrics import accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from torch.utils.data import Dataset, DataLoader, SubsetRandomSampler
from tqdm import tqdm
from transformers import AutoTokenizer, AutoModelForSequenceClassification, get_linear_schedule_with_warmup #hugging face

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# 1. Custom Dataset used for training/validation
class TextDataset(Dataset):
    """Tokenizes the ``ds_bem_candidato`` column of a DataFrame one row at a time.

    Every item is a dict holding 1-D ``input_ids`` and ``attention_mask``
    tensors; when ``is_train`` is true it also holds ``labels``, read from the
    ``y`` column as a ``torch.long`` scalar tensor.

    Args:
        dataframe: Frame with a ``ds_bem_candidato`` column (and a ``y`` column
            when ``is_train`` is true). Its index is reset so rows can be
            looked up by position.
        tokenizer: Callable Hugging Face-style tokenizer.
        max_len: Length passed to the tokenizer as ``max_length``; sequences
            are truncated and padded to it.
        is_train: Whether each item should carry a ``labels`` entry.
    """

    def __init__(self, dataframe, tokenizer, max_len=128, is_train=True):
        # A 0..n-1 index lets __getitem__ use the integer supplied by the sampler.
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len
        self.is_train = is_train

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its tensors (plus label if training)."""
        text = self.dataframe.loc[index, 'ds_bem_candidato']
        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        # squeeze(0) drops only the leading batch axis added by return_tensors='pt';
        # a bare squeeze() would also collapse the sequence axis when max_len == 1.
        item = {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }
        if self.is_train:
            label = self.dataframe.loc[index, 'y']
            item['labels'] = torch.tensor(label, dtype=torch.long)
        return item

# Dataset used when applying the model (no labels)
class AppDataset(Dataset):
    """Tokenizes the ``ds_bem_candidato`` column of an unlabeled DataFrame.

    Every item is a dict holding 1-D ``input_ids`` and ``attention_mask``
    tensors; no ``labels`` entry is produced.

    Args:
        dataframe: Frame with a ``ds_bem_candidato`` column. Its index is reset
            so rows can be looked up by position.
        tokenizer: Callable Hugging Face-style tokenizer.
        max_len: Length passed to the tokenizer as ``max_length``; sequences
            are truncated and padded to it.
    """

    def __init__(self, dataframe, tokenizer, max_len=128):
        # A 0..n-1 index lets __getitem__ use the integer supplied by the sampler.
        self.dataframe = dataframe.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.dataframe)

    def __getitem__(self, index):
        """Tokenize row ``index`` and return its input tensors."""
        text = self.dataframe.loc[index, 'ds_bem_candidato']
        # Calling the tokenizer directly replaces the deprecated encode_plus().
        encoding = self.tokenizer(
            text,
            add_special_tokens=True,
            max_length=self.max_len,
            truncation=True,
            padding='max_length',
            return_attention_mask=True,
            return_tensors='pt'
        )
        # squeeze(0) drops only the leading batch axis added by return_tensors='pt';
        # a bare squeeze() would also collapse the sequence axis when max_len == 1.
        return {
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0)
        }

In [3]:
# 2. Load the data and split it into training and validation sets

# Seed every random number generator the notebook relies on, so that a
# Restart & Run All reproduces the same split, subset sampling and model init.
SEED = 42
random.seed(SEED)        # drives the random.sample() subset drawn for each epoch
np.random.seed(SEED)
torch.manual_seed(SEED)  # drives torch-side randomness (weight init, sampler shuffling)

file_parquet = 'bases/bd01_benscand_treino.parquet'
df = pd.read_parquet(file_parquet)
#df = df.sample(n=1000, random_state=42)
df['y'] = df['y'].astype(int)  # labels are later converted to torch.long tensors

# Hold out 20% of the rows for validation.
train_df, val_df = train_test_split(df, test_size=0.2, random_state=SEED)

In [4]:
# 3. Load the tokenizer and the Portuguese BERT model
model_name = "neuralmind/bert-large-portuguese-cased"
num_labels = 5  # number of output classes of the classification head

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=num_labels,
)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-large-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [5]:
# 4. Select the compute device (CUDA GPU when available, otherwise CPU)
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
model.to(device)
device  # last expression of the cell: echoes the selected device

device(type='cuda')

In [10]:
# 5. Build the datasets and dataloaders
max_len = 128  # number of tokens each text is padded/truncated to
batch_size = 16
sampler_size = 16000  # number of training examples drawn per epoch

train_dataset = TextDataset(train_df, tokenizer, max_len=max_len, is_train=True)
val_dataset = TextDataset(val_df, tokenizer, max_len=max_len, is_train=True)

# random.sample() raises ValueError when asked for more items than the
# population holds (e.g. when the frame was down-sampled for a quick run),
# so never request more examples than the training set contains.
sampler_size = min(sampler_size, len(train_dataset))

indices = random.sample(range(len(train_dataset)), sampler_size)
sampler = SubsetRandomSampler(indices)

# Initial loader over the sampled subset; its length is what the scheduler
# setup uses, and the training loop builds a fresh one for every epoch.
train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)

#train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size)

In [11]:
# 6. Configure the optimizer and the learning-rate scheduler
epochs = 100
optimizer = optim.AdamW(params=model.parameters(), lr=2e-5, weight_decay=0.01)

# The schedule is sized for every batch of every epoch.
total_steps = len(train_loader) * epochs
warmup_steps = int(0.1 * total_steps)  # first 10% of the steps are warmup
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=warmup_steps,
    num_training_steps=total_steps,
)

In [12]:
print(datetime.now())

patience = 3                   # epochs without validation improvement tolerated
best_val_loss = float('inf')
patience_counter = 0
# CPU copy of the weights that produced the lowest validation loss so far.
# It stays available as a global even if the cell is interrupted, so it can be
# restored manually with model.load_state_dict(best_state).
best_state = None

# 7. Training loop
for epoch in range(epochs):

    # Re-draw a random subset of the full training set for every epoch. The
    # size is capped because random.sample() raises ValueError when asked for
    # more items than the dataset holds.
    epoch_size = min(sampler_size, len(train_dataset))
    indices = random.sample(range(len(train_dataset)), epoch_size)
    sampler = SubsetRandomSampler(indices)

    # Build a DataLoader over this epoch's subset
    train_loader = DataLoader(train_dataset, batch_size=batch_size, sampler=sampler)

    model.train()
    train_loss = 0.0
    progress_bar = tqdm(train_loader, desc=f"Epoch {epoch+1}/{epochs}")

    for batch in progress_bar:
        optimizer.zero_grad()
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the loss along with the logits.
        outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()
        scheduler.step()  # the learning-rate schedule advances once per batch

        train_loss += loss.item()
        progress_bar.set_postfix(loss=f"{loss.item():.4f}")

    avg_train_loss = train_loss / len(train_loader)
    print(f"\nEpoch {epoch+1} - Treino Loss: {avg_train_loss:.4f}")

    # Evaluation on the validation set
    model.eval()
    val_loss = 0.0
    preds = []
    true_labels = []
    with torch.no_grad():
        for batch in val_loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)
            loss = outputs.loss
            val_loss += loss.item()
            logits = outputs.logits
            preds.extend(torch.argmax(logits, dim=1).cpu().numpy())
            true_labels.extend(labels.cpu().numpy())

    avg_val_loss = val_loss / len(val_loader)
    acc = accuracy_score(true_labels, preds)
    # zero_division=0 keeps the score at 0 for a class that is never predicted
    # without emitting an UndefinedMetricWarning on every epoch.
    precision, recall, f1, _ = precision_recall_fscore_support(
        true_labels, preds, average='weighted', zero_division=0
    )
    print(f"Epoch {epoch+1} - Validação Loss: {avg_val_loss:.4f} | Acurácia: {acc:.4f} | Precision: {precision:.4f} | Recall: {recall:.4f} | F1: {f1:.4f}\n")

    # Early stopping: check whether the validation loss improved
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        patience_counter = 0
        # Snapshot the best weights on the CPU; copy=True forces a real copy
        # even when the model itself already lives on the CPU.
        best_state = {
            name: tensor.detach().to("cpu", copy=True)
            for name, tensor in model.state_dict().items()
        }
    else:
        patience_counter += 1
        print(f"Sem melhora na loss de validação por {patience_counter} epoch(s).")

    if patience_counter >= patience:
        print(f"Early stopping: Não houve melhora na loss de validação por {patience} epochs consecutivas.")
        break

# Without this, the model would be left with the weights of the last epoch,
# which after early stopping are `patience` epochs past the best one.
if best_state is not None:
    model.load_state_dict(best_state)
    print(f"Pesos do modelo restaurados para a melhor época (loss de validação: {best_val_loss:.4f}).")

print(datetime.now())

2025-03-06 04:59:00.752730


Epoch 1/100: 100%|████████████████████████████████████████████████████| 1000/1000 [07:52<00:00,  2.12it/s, loss=0.0893]



Epoch 1 - Treino Loss: 0.8664
Epoch 1 - Validação Loss: 0.3031 | Acurácia: 0.9131 | Precision: 0.9021 | Recall: 0.9131 | F1: 0.8956



Epoch 2/100: 100%|████████████████████████████████████████████████████| 1000/1000 [07:52<00:00,  2.12it/s, loss=0.0630]



Epoch 2 - Treino Loss: 0.2684
Epoch 2 - Validação Loss: 0.2294 | Acurácia: 0.9351 | Precision: 0.9309 | Recall: 0.9351 | F1: 0.9320



Epoch 3/100: 100%|████████████████████████████████████████████████████| 1000/1000 [07:53<00:00,  2.11it/s, loss=0.0361]



Epoch 3 - Treino Loss: 0.2362
Epoch 3 - Validação Loss: 0.2193 | Acurácia: 0.9372 | Precision: 0.9349 | Recall: 0.9372 | F1: 0.9354



Epoch 4/100: 100%|████████████████████████████████████████████████████| 1000/1000 [07:53<00:00,  2.11it/s, loss=0.0386]



Epoch 4 - Treino Loss: 0.2180
Epoch 4 - Validação Loss: 0.2105 | Acurácia: 0.9413 | Precision: 0.9382 | Recall: 0.9413 | F1: 0.9379



Epoch 5/100: 100%|████████████████████████████████████████████████████| 1000/1000 [07:52<00:00,  2.11it/s, loss=0.1891]



Epoch 5 - Treino Loss: 0.2279


KeyboardInterrupt: 

In [88]:

# Save the trained model and the tokenizer into the same directory
save_dir = "modelo/md02_bert_final"
model.save_pretrained(save_dir)
tokenizer.save_pretrained(save_dir)


('modelo/md02_bert_final\\tokenizer_config.json',
 'modelo/md02_bert_final\\special_tokens_map.json',
 'modelo/md02_bert_final\\vocab.txt',
 'modelo/md02_bert_final\\added_tokens.json',
 'modelo/md02_bert_final\\tokenizer.json')

In [66]:
# 8. Apply the model to the application (unlabeled) dataset
# The frame must provide the 'ds_bem_candidato' text column that AppDataset reads.
app_parquet = "bases/bd01_benscand_apply.parquet"
app_df = pd.read_parquet(app_parquet)
#app_df = app_df.sample(n=5000, random_state=42)


In [67]:
# Wrap the application frame in the label-free dataset. No shuffling is
# requested, which the later positional assignment of predictions relies on.
app_dataset = AppDataset(dataframe=app_df, tokenizer=tokenizer, max_len=max_len)
app_loader = DataLoader(dataset=app_dataset, batch_size=batch_size)

In [68]:
# Predict a class for every row of the application set and save the result
model.eval()
all_preds = []
with torch.no_grad():
    for batch in tqdm(app_loader, desc="Predizendo na base de aplicação"):
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = model(input_ids=input_ids, attention_mask=attention_mask)
        # The predicted class is the index of the largest logit of each row.
        batch_preds = torch.argmax(outputs.logits, dim=1)
        all_preds.extend(batch_preds.cpu().numpy())

# Predictions are assigned positionally, in the order the loader produced them.
app_df['y'] = all_preds

# Create the output directory first so the write cannot fail on a missing
# folder after the whole inference pass has already been paid for.
output_path = Path("output/base_final.parquet")
output_path.parent.mkdir(parents=True, exist_ok=True)
app_df.to_parquet(output_path, index=False)
print("Predições salvas")


Predizendo na base de aplicação: 100%|███████████████████████████████████████████████| 313/313 [00:13<00:00, 23.58it/s]


Predições salvas'
