In [119]:
import re
import torch
import pandas as pd
import torch.nn as nn
from trainer import Trainer
from tokenizer import MostFrequentWordsTokenizer
from collections import Counter
from torch.utils.data import Dataset, DataLoader
from model import MultiHeadAttention, FeedForward
from sklearn.model_selection import train_test_split

### Configuration & Hyperparameters

In [120]:
VOCAB_SIZE = 512
D_MODEL = 128
N_HEADS = 4
N_LAYERS = 2
DROPOUT = 0.2
MAX_EPOCHS = 25
EVAL_INTERVAL = 100
BATCH_SIZE = 64
BLOCK_SIZE = 256
LEARNING_RATE = 1e-5
DEVICE = 'cuda' if torch.cuda.is_available() else 'cpu'

### Data Loading and Preprocessing

In [121]:
df = pd.read_csv('data/skincancer.csv')

In [122]:
df['sentence'][0]

'Patient History: Age: 8, Lesion region: arm, Lesion grew: false, Lesion itch: false, Lesion bled: false, Lesion hurt: false, Lesion changed: false, Lesion elevation: false.'

In [123]:
df['diagnostic'].value_counts()

diagnostic
BCC    845
ACK    730
NEV    244
SEK    235
SCC    192
MEL     52
Name: count, dtype: int64

In [124]:
df.loc[:, 'diagnostic'] = \
df['diagnostic'].map({
  'BCC': 'malignant',
  'SCC': 'malignant',
  'ACK': 'benign',
  'NEV': 'benign',
  'SEK': 'benign',
  'MEL': 'malignant'
})

### Create a mapping from diagnostic strings to integers

In [125]:
labels = sorted(df['diagnostic'].unique())
label_to_int = {label: i for i, label in enumerate(labels)}
int_to_label = {i: label for label, i in label_to_int.items()}
df['label'] = df['diagnostic'].map(label_to_int)

### Split data

In [126]:
NUM_CLASSES = len(labels)

X_train, X_val, y_train, y_val = train_test_split(
    df['sentence'], df['label'], test_size=0.2, random_state=42, stratify=df['label']
)
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Number of classes: {NUM_CLASSES}")

Training set size: 1838
Validation set size: 460
Number of classes: 2


### Simple Word-Level Tokenizer

In [127]:
tokenizer = MostFrequentWordsTokenizer(vocab_size=VOCAB_SIZE)
tokenizer.build_vocab(X_train)
print(f"\nVocabulary size: {tokenizer.get_vocab_size()}")


Vocabulary size: 190


### PyTorch Dataset

In [128]:
class SkinLesionDataset(Dataset):
    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        sentence = str(self.sentences.iloc[idx])
        label = self.labels.iloc[idx]

        tokens = self.tokenizer.tokenize(sentence)
        token_ids = self.tokenizer.convert_tokens_to_ids(tokens)
        token_ids = self.tokenizer.encode(['[CLS]']) + token_ids
        token_ids = token_ids[:self.max_len]

        id_pad_token = self.tokenizer.encode(['[PAD]'])
        padding_len = self.max_len - len(token_ids)
        token_ids = token_ids + id_pad_token * padding_len
        attention_mask = [1 if id != id_pad_token else 0 for id in token_ids]
        attention_mask = torch.tensor(attention_mask, dtype=torch.long).unsqueeze(0)

        return {
            'ids': torch.tensor(token_ids, dtype=torch.long),
            'mask': attention_mask,
            'label': torch.tensor(label, dtype=torch.long)
        }
    
train_dataset = SkinLesionDataset(X_train, y_train, tokenizer, BLOCK_SIZE)
val_dataset = SkinLesionDataset(X_val, y_val, tokenizer, BLOCK_SIZE)
train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False)


In [129]:
class TransformerEncoderBlock(nn.Module):
    def __init__(self, d_model, n_heads, ff_dim, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, ff_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        attn_output = self.attention(x, mask)
        x = self.norm1(x + self.dropout(attn_output))
        ff_output = self.ff(x)
        return self.norm2(x + self.dropout(ff_output))

In [130]:
class MiniBERT(nn.Module):
    def __init__(self, vocab_size, d_model, n_layers, n_heads, num_classes, max_len, dropout):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_len, d_model)
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(d_model, n_heads, d_model * 4, dropout)
            for _ in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(d_model, num_classes)
        self.scale = torch.sqrt(torch.FloatTensor([d_model])).to(DEVICE)

    def forward(self, src, src_mask):
        batch_size, seq_len = src.shape
        pos = torch.arange(0, seq_len).unsqueeze(0).repeat(batch_size, 1).to(DEVICE)
        tok_emb = self.token_embedding(src) * self.scale
        pos_emb = self.position_embedding(pos)
        x = self.dropout(tok_emb + pos_emb)
        for layer in self.layers:
            x = layer(x, src_mask)
        cls_output = x[:, 0, :]
        return self.fc_out(cls_output)

In [131]:
model = MiniBERT(
    vocab_size=tokenizer.get_vocab_size(),
    d_model=D_MODEL,
    n_layers=N_LAYERS,
    n_heads=N_HEADS,
    num_classes=NUM_CLASSES,
    max_len=BLOCK_SIZE,
    dropout=DROPOUT
).to(DEVICE)

trainer = Trainer(device=DEVICE)
trainer.fit(model,
            LEARNING_RATE,
            MAX_EPOCHS,
            None,
            train_loader,
            val_loader,
            NUM_CLASSES,
            int_to_label
        )


Starting training...
Epoch: 01 | Train Loss: 0.720 | Val. Loss: 0.700 | Val. Recall: 48.97%
Epoch: 02 | Train Loss: 0.689 | Val. Loss: 0.666 | Val. Recall: 58.60%
Epoch: 03 | Train Loss: 0.675 | Val. Loss: 0.629 | Val. Recall: 71.32%
Epoch: 04 | Train Loss: 0.655 | Val. Loss: 0.586 | Val. Recall: 79.75%
Epoch: 05 | Train Loss: 0.626 | Val. Loss: 0.561 | Val. Recall: 79.75%
Epoch: 06 | Train Loss: 0.604 | Val. Loss: 0.547 | Val. Recall: 79.75%
Epoch: 07 | Train Loss: 0.571 | Val. Loss: 0.539 | Val. Recall: 79.75%
Epoch: 08 | Train Loss: 0.538 | Val. Loss: 0.529 | Val. Recall: 79.75%
Epoch: 09 | Train Loss: 0.502 | Val. Loss: 0.523 | Val. Recall: 79.75%
Epoch: 10 | Train Loss: 0.473 | Val. Loss: 0.521 | Val. Recall: 79.75%
Epoch: 11 | Train Loss: 0.455 | Val. Loss: 0.520 | Val. Recall: 79.75%
Epoch: 12 | Train Loss: 0.446 | Val. Loss: 0.514 | Val. Recall: 79.75%
Epoch: 13 | Train Loss: 0.421 | Val. Loss: 0.516 | Val. Recall: 79.75%
Epoch: 14 | Train Loss: 0.418 | Val. Loss: 0.524 | Val.