In [38]:
import torch
import pandas as pd
import torch.nn as nn
from trainer import Trainer
from tokenizer import MostFrequentWordsTokenizer
from torch.utils.data import Dataset, DataLoader
from model import MultiHeadAttention, FeedForward
from sklearn.model_selection import train_test_split
from utils import get_class_weights
from pprint import pprint


## 0. Configuração e hiperparâmetros

In [None]:
VOCAB_SIZE = 2048
D_MODEL = 384
N_HEADS = 6
N_LAYERS = 2
DROPOUT = 0.1
MAX_EPOCHS = 10
BATCH_SIZE = 32
BLOCK_SIZE = 512
LEARNING_RATE = 1e-4
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

## 1 - Carregando o dataset

In [40]:
df = pd.read_csv('data/anamnese.csv')
df.dropna(subset=['transcription', 'medical_specialty'], inplace=True)
df = df[df['medical_specialty'].isin(
  [' Cardiovascular / Pulmonary',
   ' Orthopedic', ' Radiology', ' General Medicine', ' Gastroenterology', 'Neurology',
  ])].reset_index(drop=True)
df['medical_specialty'].value_counts()


medical_specialty
 Cardiovascular / Pulmonary    371
 Orthopedic                    355
 Radiology                     273
 General Medicine              259
 Gastroenterology              224
Name: count, dtype: int64

In [41]:
pprint(df['transcription'][50][:475])
print(f'Especialidade: {df["medical_specialty"][50]}')

('CC: ,Headache.,HX:, This 51 y/o RHM was moving furniture several days prior '
 'to presentation when he struck his head (vertex) against a door panel. He '
 'then stepped back and struck his back on a trailer hitch. There was no '
 'associated LOC but he felt "dazed." He complained a HA since the accident. '
 'The following day he began experiencing episodic vertigo lasting several '
 'minutes with associated nausea and vomiting. He has been lying in bed most '
 'of the time since the accident.')
Especialidade:  Radiology


## 2. Pré-processamento

### 2.1 Transformando diagnóstico em números

In [42]:
labels = sorted(df['medical_specialty'].unique())
label_to_int = {label: i for i, label in enumerate(labels)}
int_to_label = {i: label for label, i in label_to_int.items()}
df['label'] = df['medical_specialty'].map(label_to_int)

### 2.2 Divindo conjunto de treino e validação

In [43]:
NUM_CLASSES = len(labels)

X_train, X_val, y_train, y_val = train_test_split(
    df['transcription'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Number of classes: {NUM_CLASSES}")

Training set size: 1185
Validation set size: 297
Number of classes: 5


### 2.3 Tokenizando o texto

In [44]:
tokenizer = MostFrequentWordsTokenizer(vocab_size=VOCAB_SIZE)
tokenizer.build_vocab(X_train)
print(f"\nTamanho do vocabulário: {tokenizer.get_vocab_size()}")

tokenizer.tokenize("Joe broke his leg playing soccer.")


Tamanho do vocabulário: 2048


['joe', 'broke', 'his', 'leg', 'playing', 'soccer']

### 2.4 Criando um dataset para carregar o texto

In [45]:
class SkinLesionDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = str(self.sentences.iloc[idx])
        label = self.labels.iloc[idx]

        # Tokeniza a frase
        token_ids = self.tokenizer.encode(sentence)

        # Trunca frases maiores que max_len
        token_ids = token_ids[:self.max_len] 

        # Completa frases menores que max_len com [PAD]
        id_pad_token = self.tokenizer.convert_tokens_to_ids(['[PAD]'])
        padding_len = self.max_len - len(token_ids) 
        token_ids = token_ids + id_pad_token * padding_len

        # Cria máscara de atenção para ignorar tokens de padding
        attention_mask = [1 if id != id_pad_token else 0 for id in token_ids]
        attention_mask = torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0)

        return {
            'text': torch.tensor(token_ids, dtype=torch.long),
            'mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

train_dataset = SkinLesionDataset(X_train, y_train, tokenizer, BLOCK_SIZE)
val_dataset = SkinLesionDataset(X_val, y_val, tokenizer, BLOCK_SIZE)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


## 3. Bidirectional Encoder Representations from Transformers (BERT)

### 3.1 Implementando Encoder do Transformer

In [46]:
class TransformerEncoderBlock(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, ff_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.attention(x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        return self.norm2(x + self.dropout(ff_output))

### 3.2 Implementando BERT

In [47]:
class BERT(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, num_classes, max_len, dropout):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_len, d_model)
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(d_model, n_heads, d_model * 4, dropout)
            for _ in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(d_model, num_classes)
        self.scale = torch.sqrt(torch.FloatTensor([d_model])).to(DEVICE)

    def forward(self, src, src_mask):
        batch_size, seq_len = src.shape
        pos = torch.arange(0, seq_len).unsqueeze(0).repeat(batch_size, 1).to(DEVICE)
        tok_emb = self.token_embedding(src) * self.scale
        pos_emb = self.position_embedding(pos)
        x = self.dropout(tok_emb + pos_emb)
        for layer in self.layers:
            x = layer(x, src_mask)
        cls_output = x[:, 0, :]
        return self.fc_out(cls_output)

## 4. Treinamento

### 4.1 Defindo pesos para cada especialidade

In [48]:
weights = get_class_weights(NUM_CLASSES, df, y_train.values, int_to_label, DEVICE)

Peso da classe:
' Cardiovascular / Pulmonary': 3.99
' Gastroenterology': 6.62
' General Medicine': 5.72
' Orthopedic': 4.17
' Radiology': 5.44


### 4.2 Otimizando o modelo

In [49]:
model = BERT(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    num_classes=NUM_CLASSES,
    max_len=BLOCK_SIZE,
    dropout=DROPOUT
).to(DEVICE)

trainer = Trainer(device=DEVICE, save_name='mini_bert_anamnese.pth', num_classes=NUM_CLASSES, weights=weights)
trainer.fit(model,
            LEARNING_RATE,
            MAX_EPOCHS,
            weights,
            train_loader,
            val_loader,
            NUM_CLASSES,
            int_to_label,
        )


Iniciando Treinamento...


Epoch: 01 | Train Loss: 1.550 | Val. Loss: 1.386 | Val. Recall: 36.43% | Val. FPR: 15.45%
Epoch: 02 | Train Loss: 1.299 | Val. Loss: 1.209 | Val. Recall: 54.32% | Val. FPR: 11.65%
Epoch: 03 | Train Loss: 1.185 | Val. Loss: 1.220 | Val. Recall: 52.49% | Val. FPR: 11.53%
Epoch: 04 | Train Loss: 1.150 | Val. Loss: 1.051 | Val. Recall: 56.76% | Val. FPR: 10.78%
Epoch: 05 | Train Loss: 1.053 | Val. Loss: 0.987 | Val. Recall: 56.49% | Val. FPR: 10.57%
Epoch: 06 | Train Loss: 1.025 | Val. Loss: 1.026 | Val. Recall: 53.96% | Val. FPR: 11.22%
Epoch: 07 | Train Loss: 0.948 | Val. Loss: 0.912 | Val. Recall: 63.73% | Val. FPR: 8.95%
Epoch: 08 | Train Loss: 0.791 | Val. Loss: 0.857 | Val. Recall: 59.96% | Val. FPR: 9.75%
Epoch: 09 | Train Loss: 0.749 | Val. Loss: 0.961 | Val. Recall: 59.88% | Val. FPR: 10.06%
Epoch: 10 | Train Loss: 0.720 | Val. Loss: 0.970 | Val. Recall: 64.10% | Val. FPR: 8.95%
Epoch: 11 | Train Loss: 0.685 | Val. Loss: 0.976 | Val. Recall: 65.55% | Val. FPR: 8.63%
Epoch: 12 | Tr

In [96]:
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True)
out = next(iter(val_loader))
text, mask, label = out['text'].to(DEVICE), out['mask'].to(DEVICE), out['label'].to(DEVICE)

model.eval()
with torch.no_grad():
    pprint(tokenizer.decode(text[0, :].cpu().numpy().tolist()))
    outputs = model(text, mask)
    predictions = torch.argmax(outputs, dim=1)
    print(f'Predição: {[int_to_label[p.item()] for p in predictions]}')
    print(f'Verdadeiro: {[int_to_label[l.item()] for l in label]}')

('[CLS] cc found [UNK] [UNK] [UNK] went to bed at 10 [UNK] at her [UNK] home '
 'on [UNK] she was found [UNK] by her [UNK] the next morning three other [UNK] '
 'in the [UNK] were [UNK] and complained of [UNK] that same morning her last '
 '[UNK] was [UNK] to her [UNK] at [UNK] on [UNK] she next [UNK] [UNK] in the '
 '[UNK] from a hospital initial [UNK] level was 24 normal 15 and [UNK] [UNK] '
 'with [UNK] 75 on [UNK] unknown [UNK] [UNK] [UNK] stroke and [UNK] difficulty '
 'in the past 12 [UNK] [UNK] [UNK] unknown history of [UNK] drug [UNK] [UNK] '
 '[UNK] [UNK] 30 [UNK] oriented to [UNK] only speech without [UNK] 23 [UNK] at '
 '[UNK] [UNK] full strength throughout with normal muscle tone and [UNK] [UNK] '
 '[UNK] not [UNK] on [UNK] exam [UNK] for erythema of the face and [UNK] she '
 'underwent a total of four [UNK] under [UNK] oxygen 2 [UNK] on [UNK] and 2 '
 '[UNK] on [UNK] [UNK] assessment on [UNK] revealed marked [UNK] [UNK] with '
 '[UNK] in [UNK] [UNK] [UNK] [UNK] [UNK] atten