In [12]:
import torch
import pandas as pd
import torch.nn as nn
from trainer import Trainer
from tokenizer import MostFrequentWordsTokenizer
from torch.utils.data import Dataset, DataLoader
from model import MultiHeadAttention, FeedForward
from sklearn.model_selection import train_test_split
from utils import get_class_weights
from pprint import pprint

## 0. Configuração e hiperparâmetros

In [13]:
VOCAB_SIZE = 2048
D_MODEL = 384
N_HEADS = 6
N_LAYERS = 2
DROPOUT = 0.1
MAX_EPOCHS = 10
BATCH_SIZE = 32
BLOCK_SIZE = 512
LEARNING_RATE = 1e-4
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

## 1 - Carregando o dataset

In [14]:
df = pd.read_csv('data/transcription.csv')
df.dropna(subset=['transcription', 'medical_specialty'], inplace=True)
df = df[df['medical_specialty'].isin(
  [' Cardiovascular / Pulmonary',
   ' Orthopedic', ' Radiology', ' General Medicine', ' Gastroenterology', 'Neurology',
  ])].reset_index(drop=True)
df['medical_specialty'].value_counts()


medical_specialty
 Cardiovascular / Pulmonary    371
 Orthopedic                    355
 Radiology                     273
 General Medicine              259
 Gastroenterology              224
Name: count, dtype: int64

In [15]:
pprint(df['transcription'][50][:475])
print(f'Especialidade: {df["medical_specialty"][50]}')

('CC: ,Headache.,HX:, This 51 y/o RHM was moving furniture several days prior '
 'to presentation when he struck his head (vertex) against a door panel. He '
 'then stepped back and struck his back on a trailer hitch. There was no '
 'associated LOC but he felt "dazed." He complained a HA since the accident. '
 'The following day he began experiencing episodic vertigo lasting several '
 'minutes with associated nausea and vomiting. He has been lying in bed most '
 'of the time since the accident.')
Especialidade:  Radiology


## 2. Pré-processamento

### 2.1 Transformando especialidade em números

In [16]:
labels = sorted(df['medical_specialty'].unique())

label_to_int = {label: i for i, label in enumerate(labels)}
int_to_label = {i: label for label, i in label_to_int.items()}

df['label'] = df['medical_specialty'].map(label_to_int)

In [17]:
NUM_CLASSES = len(labels)
print(f"Número de classes: {NUM_CLASSES}")

Número de classes: 5


### 2.2 Divindo conjunto de treino e validação

In [18]:
X_train, X_val, y_train, y_val = train_test_split(
    df['transcription'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)
print(f"Treino: {len(X_train)}")
print(f"Validação: {len(X_val)}")

Treino: 1185
Validação: 297


### 2.3 Tokenizando o texto

In [19]:
tokenizer = MostFrequentWordsTokenizer(vocab_size=VOCAB_SIZE)
tokenizer.build_vocab(X_train)
print(f"\nTamanho do vocabulário: {tokenizer.get_vocab_size()}")

tokenizer.tokenize("Joe broke his leg playing soccer.")


Tamanho do vocabulário: 2048


['joe', 'broke', 'his', 'leg', 'playing', 'soccer']

### 2.4 Criando um dataset para carregar o texto

In [20]:
class SkinLesionDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, block_size):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.block_size = block_size

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = str(self.sentences.iloc[idx])
        label = self.labels.iloc[idx]

        # Tokeniza a frase
        token_ids = self.tokenizer.encode(sentence)

        # Trunca frases maiores que block_size
        token_ids = token_ids[:self.block_size] 

        # Completa frases menores que max_len com [PAD]
        id_pad_token = self.tokenizer.convert_tokens_to_ids(['[PAD]'])
        padding_len = self.block_size - len(token_ids) 
        token_ids = token_ids + id_pad_token * padding_len

        # Cria máscara de atenção para ignorar tokens de padding
        attention_mask = [1 if id != id_pad_token else 0 for id in token_ids]
        attention_mask = torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0)

        return {
            'text': torch.tensor(token_ids, dtype=torch.long),
            'mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

train_dataset = SkinLesionDataset(X_train, y_train, tokenizer, BLOCK_SIZE)
val_dataset = SkinLesionDataset(X_val, y_val, tokenizer, BLOCK_SIZE)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


## 3. Bidirectional Encoder Representations from Transformers (BERT)

### 3.1 Implementando Encoder do Transformer

In [21]:
class TransformerEncoderBlock(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, ff_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.attention(x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        return self.norm2(x + self.dropout(ff_output))

### 3.2 Implementando BERT

In [None]:
class BERT(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, num_classes, block_size, dropout):
        super().__init__()
        self.input_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(block_size, d_model)
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(d_model, n_heads, d_model * 4, dropout)
            for _ in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        self.embed_layer_norm = nn.LayerNorm(d_model)
        self.linear = nn.Linear(d_model, num_classes)
        self.register_buffer('scale', torch.sqrt(torch.FloatTensor([d_model])))
        self.register_buffer('pos', torch.arange(block_size).unsqueeze(0))

    def forward(self, src, src_mask):
        input_emb = self.input_embedding(src) * self.scale
        positional_emb = self.position_embedding(self.pos)
        x = self.dropout(self.embed_layer_norm(input_emb + positional_emb))
        for layer in self.layers:
            x = layer(x, src_mask)
        cls_output = x[:, 0, :]
        return self.linear(cls_output)

## 4. Treinamento

### 4.1 Defindo pesos para cada especialidade

In [23]:
weights = get_class_weights(NUM_CLASSES, df, y_train.values, int_to_label, DEVICE)

Peso da classe:
' Cardiovascular / Pulmonary': 3.99
' Gastroenterology': 6.62
' General Medicine': 5.72
' Orthopedic': 4.17
' Radiology': 5.44


### 4.2 Otimizando o modelo

In [24]:
model = BERT(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    num_classes=NUM_CLASSES,
    block_size=BLOCK_SIZE,
    dropout=DROPOUT
).to(DEVICE)

trainer = Trainer(device=DEVICE, save_name='mini_bert_anamnese.pth', num_classes=NUM_CLASSES, weights=weights)
trainer.fit(model,
            LEARNING_RATE,
            MAX_EPOCHS,
            weights,
            train_loader,
            val_loader,
            NUM_CLASSES,
            int_to_label,
        )


Iniciando Treinamento...
Epoch: 01 | Train Loss: 1.610 | Val. Loss: 1.589 | Val. Recall: 19.82% | Val. FPR: 20.06%
Epoch: 02 | Train Loss: 1.483 | Val. Loss: 1.472 | Val. Recall: 36.99% | Val. FPR: 15.70%
Epoch: 03 | Train Loss: 1.422 | Val. Loss: 1.311 | Val. Recall: 49.10% | Val. FPR: 12.75%
Epoch: 04 | Train Loss: 1.282 | Val. Loss: 1.213 | Val. Recall: 52.14% | Val. FPR: 12.04%
Epoch: 05 | Train Loss: 1.286 | Val. Loss: 1.161 | Val. Recall: 57.02% | Val. FPR: 10.99%
Epoch: 06 | Train Loss: 1.092 | Val. Loss: 1.106 | Val. Recall: 56.47% | Val. FPR: 10.96%
Epoch: 07 | Train Loss: 0.978 | Val. Loss: 1.043 | Val. Recall: 60.80% | Val. FPR: 9.91%
Epoch: 08 | Train Loss: 0.963 | Val. Loss: 1.044 | Val. Recall: 58.94% | Val. FPR: 9.95%
Epoch: 09 | Train Loss: 0.886 | Val. Loss: 0.946 | Val. Recall: 63.44% | Val. FPR: 9.03%
Epoch: 10 | Train Loss: 0.816 | Val. Loss: 0.973 | Val. Recall: 63.12% | Val. FPR: 8.86%


In [67]:
val_loader = DataLoader(val_dataset, batch_size=1, shuffle=True)
out = next(iter(val_loader))
text, mask, label = out['text'].to(DEVICE), out['mask'].to(DEVICE), out['label'].to(DEVICE)

model.eval()
with torch.no_grad():
    pprint(tokenizer.decode(text[0, :].cpu().numpy().tolist()))
    outputs = model(text, mask)
    predictions = torch.argmax(outputs, dim=1)
    print(f'Predição: {[int_to_label[p.item()] for p in predictions]}')
    print(f'Verdadeiro: {[int_to_label[l.item()] for l in label]}')

('[CLS] subjective review of the medical [UNK] shows that the patient is a '
 '[UNK] female patient who has been admitted and has been treated for [UNK] '
 '[UNK] pneumonia along with copd [UNK] the patient does have a [UNK] history '
 'of copd however she does not use oxygen at her [UNK] [UNK] living home '
 'yesterday she had made improvement since being here at the hospital she '
 'needed oxygen she was [UNK] for home o2 and [UNK] for it yesterday also her '
 'lungs were very [UNK] she did have [UNK] bilaterally and [UNK] on the right '
 'side [UNK] she appeared to be a bit [UNK] and although she was [UNK] to be '
 'discharged home she did not appear to be fit for [UNK] the patient needed to '
 'use the rest room she stated that she needed to [UNK] she [UNK] decided not '
 'to call for [UNK] she stated that she did have her [UNK] call light [UNK] '
 'next to her and she was unable to [UNK] access to her [UNK] she attempted to '
 'walk to the rest room on her [UNK] she sustained a fa