# Imports

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification
from torch.optim import AdamW
from tqdm import tqdm
import time

# DataFrame Load

Carregando Xlsx

In [None]:
df = pd.read_excel('check_laudo.xlsx')

Separando texto e label para treino

In [None]:
texts = df['laudo'].tolist()
labels = df['laudo alterado'].tolist()

# Tokenização


Chamando o tokenizer

In [None]:
tokenizer = BertTokenizer.from_pretrained('neuralmind/bert-base-portuguese-cased')

# Configurando o Dataset

In [13]:
class LaudoDataset(Dataset):
    def __init__(self, texts, labels, tokenizer, max_len=512):
        self.texts = texts
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        encoding = self.tokenizer(
            self.texts[idx],
            max_length=self.max_len,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )
        item = {key: val.squeeze(0) for key, val in encoding.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item


# Split dos dados


In [None]:
X_train, X_test, y_train, y_test = train_test_split(
    texts, labels, test_size=0.2, random_state=42, stratify=labels
)

train_dataset = LaudoDataset(X_train, y_train, tokenizer)
test_dataset = LaudoDataset(X_test, y_test, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=8)

# Modelo


In [None]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = BertForSequenceClassification.from_pretrained('neuralmind/bert-base-portuguese-cased', num_labels=2)
model.to(device)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at neuralmind/bert-base-portuguese-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(29794, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e

# Otimizador


In [None]:
optimizer = AdamW(model.parameters(), lr=2e-5)

# Treino

In [17]:
epochs = 5
model.train()

train_losses = []

for epoch in range(epochs):
    print(f'\nEpoch {epoch+1}/{epochs}')
    total_loss = 0

    # Barra de progresso
    progress_bar = tqdm(train_loader, desc='Training', leave=True)

    start_time = time.time()

    for batch in progress_bar:
        optimizer.zero_grad()

        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        total_loss += loss.item()

        loss.backward()
        optimizer.step()

        # Atualizar barra com o loss atual
        progress_bar.set_postfix({'Batch Loss': f'{loss.item():.4f}'})

    avg_loss = total_loss / len(train_loader)
    train_losses.append(avg_loss)

    epoch_time = time.time() - start_time
    print(f"Epoch {epoch+1} completed - Loss: {avg_loss:.4f} - Time: {epoch_time:.2f} sec")



Epoch 1/5


Training: 100%|██████████| 83/83 [1:31:25<00:00, 66.09s/it, Batch Loss=0.4780]   


Epoch 1 completed - Loss: 0.5610 - Time: 5485.75 sec

Epoch 2/5


Training: 100%|██████████| 83/83 [49:34<00:00, 35.84s/it, Batch Loss=0.8633]   


Epoch 2 completed - Loss: 0.3723 - Time: 2974.91 sec

Epoch 3/5


Training: 100%|██████████| 83/83 [43:47<00:00, 31.66s/it, Batch Loss=0.0115]  


Epoch 3 completed - Loss: 0.1768 - Time: 2627.81 sec

Epoch 4/5


Training: 100%|██████████| 83/83 [33:35<00:00, 24.28s/it, Batch Loss=1.9669]


Epoch 4 completed - Loss: 0.1224 - Time: 2015.51 sec

Epoch 5/5


Training: 100%|██████████| 83/83 [32:28<00:00, 23.47s/it, Batch Loss=0.0038]

Epoch 5 completed - Loss: 0.0582 - Time: 1948.07 sec





# Avaliação


In [None]:
model.eval()
y_true = []
y_pred = []

with torch.no_grad():
    for batch in test_loader:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        outputs = model(input_ids, attention_mask=attention_mask)
        logits = outputs.logits
        preds = torch.argmax(logits, dim=1)

        y_true.extend(labels.cpu().numpy())
        y_pred.extend(preds.cpu().numpy())

print("Acurácia:", accuracy_score(y_true, y_pred))
print(confusion_matrix(y_true, y_pred))
print(classification_report(y_true, y_pred))

Acurácia: 0.8787878787878788
[[ 38  17]
 [  3 107]]
              precision    recall  f1-score   support

           0       0.93      0.69      0.79        55
           1       0.86      0.97      0.91       110

    accuracy                           0.88       165
   macro avg       0.89      0.83      0.85       165
weighted avg       0.88      0.88      0.87       165



# Salvar modelo e tokenizer


In [None]:
model.save_pretrained('modelo_laudos')
tokenizer.save_pretrained('modelo_laudos')


('modelo_laudos\\tokenizer_config.json',
 'modelo_laudos\\special_tokens_map.json',
 'modelo_laudos\\vocab.txt',
 'modelo_laudos\\added_tokens.json')