In [120]:
import torch
import pandas as pd
import torch.nn as nn
from trainer import Trainer
from tokenizer import MostFrequentWordsTokenizer
from torch.utils.data import Dataset, DataLoader
from model import MultiHeadAttention, FeedForward
from sklearn.model_selection import train_test_split
from utils import plot_interactive_roc_curve, get_class_weights


### Configuration & Hyperparameters

In [121]:
VOCAB_SIZE = 512
D_MODEL = 384
N_HEADS = 6
N_LAYERS = 2
DROPOUT = 0.1
MAX_EPOCHS = 25
BATCH_SIZE = 32
BLOCK_SIZE = 256
LEARNING_RATE = 5e-5
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

### Data Loading and Preprocessing

In [133]:
df = pd.read_csv('data/skincancer.csv')
df['sentence'][0]

medical_specialty
 Surgery                          1103
 Consult - History and Phy.        516
 Cardiovascular / Pulmonary        372
 Orthopedic                        355
 Radiology                         273
 General Medicine                  259
 Gastroenterology                  230
 Neurology                         223
 SOAP / Chart / Progress Notes     166
 Obstetrics / Gynecology           160
 Urology                           158
 Discharge Summary                 108
 ENT - Otolaryngology               98
 Neurosurgery                       94
 Hematology - Oncology              90
 Ophthalmology                      83
 Nephrology                         81
 Emergency Room Reports             75
 Pediatrics - Neonatal              70
 Pain Management                    62
 Psychiatry / Psychology            53
 Office Notes                       51
 Podiatry                           47
 Dermatology                        29
 Cosmetic / Plastic Surgery         27
 Dentis

In [123]:
df['diagnostic'].value_counts()

diagnostic
BCC    845
ACK    730
NEV    244
SEK    235
SCC    192
MEL     52
Name: count, dtype: int64

In [124]:
df.loc[:, 'diagnostic'] = \
  df['diagnostic'].map({
    'BCC': 'malignant',
    'SCC': 'malignant',
    'ACK': 'benign',
    'NEV': 'benign',
    'SEK': 'benign',
    'MEL': 'malignant'
  })

" df.loc[:, 'diagnostic'] =   df['diagnostic'].map({\n    'BCC': 'malignant',\n    'SCC': 'malignant',\n    'ACK': 'benign',\n    'NEV': 'benign',\n    'SEK': 'benign',\n    'MEL': 'malignant'\n  }) "

### Create a mapping from diagnostic strings to integers

In [125]:
labels = sorted(df['diagnostic'].unique())
label_to_int = {label: i for i, label in enumerate(labels)}
int_to_label = {i: label for label, i in label_to_int.items()}
df['label'] = df['diagnostic'].map(label_to_int)

### Split data

In [126]:
NUM_CLASSES = len(labels)

X_train, X_val, y_train, y_val = train_test_split(
    df['sentence'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Number of classes: {NUM_CLASSES}")

Training set size: 1838
Validation set size: 460
Number of classes: 6


### Simple Word-Level Tokenizer

In [127]:
tokenizer = MostFrequentWordsTokenizer(vocab_size=VOCAB_SIZE)
tokenizer.build_vocab(X_train)
print(f"\nVocabulary size: {tokenizer.get_vocab_size()}")


Vocabulary size: 189


### PyTorch Dataset

In [None]:
class SkinLesionDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = str(self.sentences.iloc[idx])
        label = self.labels.iloc[idx]

        tokens = self.tokenizer.tokenize(sentence)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        token_ids = self.tokenizer.encode(['[CLS]']) + token_ids
        token_ids = token_ids[:self.max_len]

        id_pad_token = self.tokenizer.encode(['[PAD]'])
        padding_len = self.max_len - len(token_ids)
        token_ids = token_ids + id_pad_token * padding_len
        attention_mask = [1 if id != id_pad_token else 0 for id in token_ids]
        attention_mask = torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0)

        return {
            'text': torch.tensor(token_ids, dtype=torch.long),
            'mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }
    
train_dataset = SkinLesionDataset(X_train, y_train, tokenizer, BLOCK_SIZE)
val_dataset = SkinLesionDataset(X_val, y_val, tokenizer, BLOCK_SIZE)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [129]:
class TransformerEncoderBlock(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, ff_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.attention(x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        return self.norm2(x + self.dropout(ff_output))

In [130]:
class MiniBERT(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, num_classes, max_len, dropout):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_len, d_model)
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(d_model, n_heads, d_model * 4, dropout)
            for _ in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(d_model, num_classes)
        self.scale = torch.sqrt(torch.FloatTensor([d_model])).to(DEVICE)

    def forward(self, src, src_mask):
        batch_size, seq_len = src.shape
        pos = torch.arange(0, seq_len).unsqueeze(0).repeat(batch_size, 1).to(DEVICE)
        tok_emb = self.token_embedding(src) * self.scale
        pos_emb = self.position_embedding(pos)
        x = self.dropout(tok_emb + pos_emb)
        for layer in self.layers:
            x = layer(x, src_mask)
        cls_output = x[:, 0, :]
        return self.fc_out(cls_output)

In [131]:
weights = get_class_weights(NUM_CLASSES, df, y_train.values, int_to_label, DEVICE)

Class Weights:
  - Class 'ACK': 3.15
  - Class 'BCC': 2.72
  - Class 'MEL': 43.76
  - Class 'NEV': 9.43
  - Class 'SCC': 12.01
  - Class 'SEK': 9.78


In [132]:
model = MiniBERT(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    num_classes=NUM_CLASSES if NUM_CLASSES > 2 else 1,
    max_len=BLOCK_SIZE,
    dropout=DROPOUT
).to(DEVICE)

trainer = Trainer(device=DEVICE, save_name='mini_bert.pth', num_classes=NUM_CLASSES, weights=weights)
trainer.fit(model,
            LEARNING_RATE,
            MAX_EPOCHS,
            weights,
            train_loader,
            val_loader,
            NUM_CLASSES,
            int_to_label,
        )


Starting training...


Epoch: 01 | Train Loss: 1.324 | Val. Loss: 1.207 | Val. Recall: 26.14% | Val. Precision: 18.16% | Val. FPR: 11.61%
Epoch: 02 | Train Loss: 1.218 | Val. Loss: 1.176 | Val. Recall: 25.96% | Val. Precision: 18.00% | Val. FPR: 11.71%
Epoch: 03 | Train Loss: 1.190 | Val. Loss: 1.180 | Val. Recall: 26.07% | Val. Precision: 18.08% | Val. FPR: 11.66%
Epoch: 04 | Train Loss: 1.164 | Val. Loss: 1.112 | Val. Recall: 26.25% | Val. Precision: 18.03% | Val. FPR: 11.55%
Epoch: 05 | Train Loss: 1.149 | Val. Loss: 1.139 | Val. Recall: 26.79% | Val. Precision: 34.95% | Val. FPR: 11.41%
Epoch: 06 | Train Loss: 1.137 | Val. Loss: 1.108 | Val. Recall: 26.44% | Val. Precision: 18.14% | Val. FPR: 11.44%
Epoch: 07 | Train Loss: 1.122 | Val. Loss: 1.128 | Val. Recall: 27.02% | Val. Precision: 29.57% | Val. FPR: 11.39%
Epoch: 08 | Train Loss: 1.115 | Val. Loss: 1.060 | Val. Recall: 30.95% | Val. Precision: 30.00% | Val. FPR: 10.63%
Epoch: 09 | Train Loss: 1.090 | Val. Loss: 1.079 | Val. Recall: 30.20% | Val. Pr