In [14]:
import torch
import pandas as pd
import torch.nn as nn
from trainer import Trainer
from tokenizer import MostFrequentWordsTokenizer
from torch.utils.data import Dataset, DataLoader
from model import MultiHeadAttention, FeedForward
from sklearn.model_selection import train_test_split
from utils import plot_interactive_roc_curve, get_class_weights


### Este notebook apresenta a classificação de câncer de pele apenas com os dados clínicos do paciente (em formato de texto). 

### O intuito é ajudar no desafio multimodal, onde imagem e texto devem ser integrados.

### Veja como o dataloader é implementado

### Configuration & Hyperparameters

In [15]:
VOCAB_SIZE = 512
D_MODEL = 384
N_HEADS = 6
N_LAYERS = 2
DROPOUT = 0.1
MAX_EPOCHS = 25
BATCH_SIZE = 32
BLOCK_SIZE = 256
LEARNING_RATE = 5e-5
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

### Data Loading and Preprocessing

In [16]:
df = pd.read_csv('data/skincancer.csv')
df['sentence'][0]

'Patient History: Age: 8, Lesion region: arm, Lesion grew: false, Lesion itch: false, Lesion bled: false, Lesion hurt: false, Lesion changed: false, Lesion elevation: false.'

In [17]:
df['diagnostic'].value_counts()

diagnostic
BCC    845
ACK    730
NEV    244
SEK    235
SCC    192
MEL     52
Name: count, dtype: int64

In [18]:
df.loc[:, 'diagnostic'] = \
  df['diagnostic'].map({
    'BCC': 'malignant',
    'SCC': 'malignant',
    'ACK': 'benign',
    'NEV': 'benign',
    'SEK': 'benign',
    'MEL': 'malignant'
  })

### Create a mapping from diagnostic strings to integers

In [19]:
labels = sorted(df['diagnostic'].unique())
label_to_int = {label: i for i, label in enumerate(labels)}
int_to_label = {i: label for label, i in label_to_int.items()}
df['label'] = df['diagnostic'].map(label_to_int)

### Split data

In [20]:
NUM_CLASSES = len(labels)

X_train, X_val, y_train, y_val = train_test_split(
    df['sentence'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Number of classes: {NUM_CLASSES}")

Training set size: 1838
Validation set size: 460
Number of classes: 2


### Simple Word-Level Tokenizer

In [21]:
tokenizer = MostFrequentWordsTokenizer(vocab_size=VOCAB_SIZE)
tokenizer.build_vocab(X_train)
print(f"\nVocabulary size: {tokenizer.get_vocab_size()}")


Vocabulary size: 190


### PyTorch Dataset

In [22]:
class SkinLesionDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, block_size):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.block_size = block_size

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = str(self.sentences.iloc[idx])
        label = self.labels.iloc[idx]

        # Tokeniza a frase
        token_ids = self.tokenizer.encode(sentence)

        # Trunca frases maiores que block_size
        token_ids = token_ids[:self.block_size] 

        # Completa frases menores que max_len com [PAD]
        id_pad_token = self.tokenizer.convert_tokens_to_ids(['[PAD]'])
        padding_len = self.block_size - len(token_ids) 
        token_ids = token_ids + id_pad_token * padding_len

        # Cria máscara de atenção para ignorar tokens de padding
        attention_mask = [1 if id != id_pad_token else 0 for id in token_ids]
        attention_mask = torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0)

        return {
            'text': torch.tensor(token_ids, dtype=torch.long),
            'mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }

train_dataset = SkinLesionDataset(X_train, y_train, tokenizer, BLOCK_SIZE)
val_dataset = SkinLesionDataset(X_val, y_val, tokenizer, BLOCK_SIZE)

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [23]:
class TransformerEncoderBlock(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, ff_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.attention(x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        return self.norm2(x + self.dropout(ff_output))

In [24]:
class MiniBERT(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, num_classes, max_len, dropout):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_len, d_model)
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(d_model, n_heads, d_model * 4, dropout)
            for _ in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(d_model, num_classes)
        self.scale = torch.sqrt(torch.FloatTensor([d_model])).to(DEVICE)

    def forward(self, src, src_mask):
        batch_size, seq_len = src.shape
        pos = torch.arange(0, seq_len).unsqueeze(0).repeat(batch_size, 1).to(DEVICE)
        tok_emb = self.token_embedding(src) * self.scale
        pos_emb = self.position_embedding(pos)
        x = self.dropout(tok_emb + pos_emb)
        for layer in self.layers:
            x = layer(x, src_mask)
        cls_output = x[:, 0, :]
        return self.fc_out(cls_output)

In [25]:
weights = get_class_weights(NUM_CLASSES, df, y_train.values, int_to_label, DEVICE)

tensor([1.1102], device='cuda:0', dtype=torch.float64)


In [26]:
model = MiniBERT(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    num_classes=NUM_CLASSES if NUM_CLASSES > 2 else 1,
    max_len=BLOCK_SIZE,
    dropout=DROPOUT
).to(DEVICE)

trainer = Trainer(device=DEVICE, save_name='mini_bert.pth', num_classes=NUM_CLASSES, weights=weights)
trainer.fit(model,
            LEARNING_RATE,
            MAX_EPOCHS,
            weights,
            train_loader,
            val_loader,
            NUM_CLASSES,
            int_to_label,
        )


Iniciando Treinamento...


Epoch: 01 | Train Loss: 0.489 | Val. Loss: 0.469 | Val. Recall: 100.00% | Val. FPR: 40.50%
Epoch: 02 | Train Loss: 0.368 | Val. Loss: 0.494 | Val. Recall: 100.00% | Val. FPR: 40.50%
Epoch: 03 | Train Loss: 0.337 | Val. Loss: 0.379 | Val. Recall: 97.25% | Val. FPR: 37.19%
Epoch: 04 | Train Loss: 0.341 | Val. Loss: 0.373 | Val. Recall: 97.71% | Val. FPR: 38.02%
Epoch: 05 | Train Loss: 0.325 | Val. Loss: 0.414 | Val. Recall: 98.17% | Val. FPR: 38.84%
Epoch: 06 | Train Loss: 0.319 | Val. Loss: 0.351 | Val. Recall: 94.04% | Val. FPR: 29.34%
Epoch: 07 | Train Loss: 0.318 | Val. Loss: 0.397 | Val. Recall: 94.95% | Val. FPR: 31.82%
Epoch: 08 | Train Loss: 0.306 | Val. Loss: 0.354 | Val. Recall: 88.53% | Val. FPR: 23.14%
Epoch: 09 | Train Loss: 0.294 | Val. Loss: 0.367 | Val. Recall: 90.37% | Val. FPR: 26.45%
Epoch: 10 | Train Loss: 0.301 | Val. Loss: 0.358 | Val. Recall: 79.82% | Val. FPR: 18.18%
Epoch: 11 | Train Loss: 0.306 | Val. Loss: 0.351 | Val. Recall: 85.78% | Val. FPR: 22.31%
Epoch: 1