In [129]:
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, recall_score
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
import re

### Configuration & Hyperparameters

In [None]:
# Single source of truth for every tunable value used by the cells below.
CONFIG = {
    "seed": 42,           # Seed for torch's global RNG (weight init, dropout, DataLoader shuffling)
    "max_len": 128,       # Fixed length every token-id sequence is truncated/padded to
    "batch_size": 32,
    "epochs": 25,
    "lr": 1e-3,
    "vocab_size": 5000,   # Upper bound on vocabulary size, special tokens included
    "d_model": 128,       # Embedding dimension
    "n_heads": 4,         # Number of attention heads
    "n_layers": 2,        # Number of Transformer blocks
    "dropout": 0.1,
    "device": "cuda" if torch.cuda.is_available() else "cpu"
}

# Seed torch once, up front, so that Restart Kernel -> Run All reproduces the same
# initial weights, dropout masks and batch order. The trailing ';' keeps the
# returned Generator object out of the cell output.
torch.manual_seed(CONFIG["seed"]);

### Data Loading and Preprocessing

In [131]:
# Read the dataset from 'data.csv' (path is relative to the working directory).
df = pd.read_csv('data.csv')
# Show the first rows as the cell output to eyeball the columns.
df.head()

Unnamed: 0,patient_id,diagnostic,diagnostic_number,img_id,folder,sentence
0,PAT_1516,NEV,3,PAT_1516_1765_530.png,1,"Patient History: Age: 8, Lesion region: arm, L..."
1,PAT_46,BCC,1,PAT_46_881_939.png,5,"Patient History: Age: 55, Gender: female, Mate..."
2,PAT_1545,ACK,0,PAT_1545_1867_547.png,1,"Patient History: Age: 77, Lesion region: face,..."
3,PAT_1989,ACK,0,PAT_1989_4061_934.png,1,"Patient History: Age: 75, Lesion region: hand,..."
4,PAT_684,BCC,1,PAT_684_1302_588.png,1,"Patient History: Age: 79, Gender: male, Matern..."


In [132]:
# Peek at the raw text of the row labelled 0 (single .loc lookup instead of chained indexing).
df.loc[0, 'sentence']

'Patient History: Age: 8, Lesion region: arm, Lesion grew: false, Lesion itch: false, Lesion bled: false, Lesion hurt: false, Lesion changed: false, Lesion elevation: false.'

### Create a mapping from diagnostic strings to integers

In [134]:
# Alphabetically sorted diagnostic names; a name's position is its integer class id.
labels = sorted(df['diagnostic'].unique())
# Build id -> name first, then invert it to get name -> id.
int_to_label = dict(enumerate(labels))
label_to_int = {name: class_id for class_id, name in int_to_label.items()}
# Integer target column used by the split and the loss below.
df['label'] = df['diagnostic'].map(label_to_int)

### Split data

In [135]:
# Number of distinct diagnostics; sizes the classifier head and the report.
NUM_CLASSES = len(labels)

# 80/20 split, stratified on the label so both parts keep the class proportions.
# random_state is fixed so the split is the same on every run.
X_train, X_val, y_train, y_val = train_test_split(
    df['sentence'],
    df['label'],
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)
print(f"Training set size: {len(X_train)}")
print(f"Validation set size: {len(X_val)}")
print(f"Number of classes: {NUM_CLASSES}")

Training set size: 1838
Validation set size: 460
Number of classes: 6


### Calculate Class Weights for Imbalanced Data

In [136]:
# Per-class sample counts on the training split. minlength pins the vector to
# NUM_CLASSES entries even when the highest class id has no training sample,
# so its length always matches the number of logits the loss will see.
class_counts = torch.bincount(torch.tensor(y_train.values), minlength=NUM_CLASSES)

# Inverse relative frequency: the rarer a class, the larger its loss weight.
weights = (1 / (class_counts / len(y_train.values))).to(CONFIG['device'])

print("Class Weights:")
for i, weight in enumerate(weights):
    print(f"  - Class '{int_to_label[i]}': {weight:.2f}")

Class Weights:
  - Class 'ACK': 3.15
  - Class 'BCC': 2.72
  - Class 'MEL': 43.76
  - Class 'NEV': 9.43
  - Class 'SCC': 12.01
  - Class 'SEK': 9.78


### Simple Word-Level Tokenizer

In [137]:
class SimpleTokenizer:
    """Word-level tokenizer with a frequency-capped vocabulary.

    Text is lower-cased, every character other than a-z, 0-9 and whitespace is
    removed, and the result is split on whitespace. Ids 0-3 are reserved for
    the special tokens [PAD], [UNK], [CLS] and [SEP]; the most frequent words
    seen by ``build_vocab`` receive ids from 4 upwards.
    """

    # Special tokens in id order (their position is their id).
    SPECIAL_TOKENS = ('[PAD]', '[UNK]', '[CLS]', '[SEP]')
    # Compiled once; shared by vocabulary building and tokenization so the two
    # can never normalise text differently.
    _DISALLOWED_CHARS = re.compile(r'[^a-z0-9\s]')

    def __init__(self, vocab_size):
        """
        Args:
            vocab_size: maximum vocabulary size, special tokens included.
        """
        self.vocab_size = vocab_size
        self.word_to_idx = {}
        self.idx_to_word = {}

    def build_vocab(self, sentences):
        """Populate ``word_to_idx`` / ``idx_to_word`` from an iterable of sentences.

        Words are counted over all sentences and the most common
        ``vocab_size - 4`` of them are kept, in descending frequency order.
        """
        word_counts = Counter()
        for sentence in sentences:
            # Same normalisation as tokenize(), so every vocabulary entry is
            # reachable at lookup time.
            word_counts.update(self.tokenize(sentence))

        n_special = len(self.SPECIAL_TOKENS)
        most_common_words = word_counts.most_common(self.vocab_size - n_special)

        self.word_to_idx = {token: i for i, token in enumerate(self.SPECIAL_TOKENS)}
        for i, (word, _) in enumerate(most_common_words, n_special):
            self.word_to_idx[word] = i

        self.idx_to_word = {i: w for w, i in self.word_to_idx.items()}

    def tokenize(self, sentence):
        """Return the list of cleaned, lower-cased words in ``sentence``.

        ``sentence`` is passed through ``str()`` first, so non-string values
        are accepted.
        """
        cleaned_sentence = self._DISALLOWED_CHARS.sub('', str(sentence).lower())
        return cleaned_sentence.split()

    def convert_tokens_to_ids(self, tokens):
        """Map tokens to vocabulary ids; unknown tokens map to the [UNK] id."""
        unk_id = self.word_to_idx['[UNK]']
        return [self.word_to_idx.get(token, unk_id) for token in tokens]
    
# The vocabulary is fitted on the training sentences only, so validation text
# does not influence which words get an id.
tokenizer = SimpleTokenizer(CONFIG['vocab_size'])
tokenizer.build_vocab(sentences=X_train)
print(f"\nVocabulary size: {len(tokenizer.word_to_idx)}")


Vocabulary size: 189


### PyTorch Dataset

In [138]:
class SkinLesionDataset(Dataset):
    """Torch dataset that turns (sentence, label) pairs into fixed-length id tensors.

    ``sentences`` and ``labels`` are read positionally through ``.iloc``.
    ``tokenizer`` must provide ``tokenize``, ``convert_tokens_to_ids`` and a
    ``word_to_idx`` mapping containing '[CLS]' and '[PAD]'.
    """

    def __init__(self, sentences, labels, tokenizer, max_len):
        self.sentences = sentences
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        """Number of samples."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return a dict with 'ids', 'mask' and 'label' long tensors for sample ``idx``.

        'ids' is [CLS] followed by the sentence's token ids, cut to ``max_len``
        and right-padded with [PAD]; 'mask' is 1 wherever 'ids' is not [PAD].
        """
        vocab = self.tokenizer.word_to_idx
        text = str(self.sentences.iloc[idx])
        target = self.labels.iloc[idx]

        # [CLS] goes first; its final hidden state is what the classifier reads.
        sequence = [vocab['[CLS]']]
        sequence.extend(
            self.tokenizer.convert_tokens_to_ids(self.tokenizer.tokenize(text))
        )
        sequence = sequence[:self.max_len]

        # Right-pad up to max_len.
        pad_id = vocab['[PAD]']
        sequence.extend([pad_id] * (self.max_len - len(sequence)))
        keep_flags = [int(token_id != pad_id) for token_id in sequence]

        return {
            'ids': torch.tensor(sequence, dtype=torch.long),
            'mask': torch.tensor(keep_flags, dtype=torch.long),
            'label': torch.tensor(target, dtype=torch.long)
        }
    
# Wrap both splits in datasets that share the tokenizer and sequence length.
train_dataset = SkinLesionDataset(
    sentences=X_train, labels=y_train, tokenizer=tokenizer, max_len=CONFIG['max_len']
)
val_dataset = SkinLesionDataset(
    sentences=X_val, labels=y_val, tokenizer=tokenizer, max_len=CONFIG['max_len']
)
# Only the training loader shuffles; validation keeps a fixed order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=CONFIG['batch_size'])
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=CONFIG['batch_size'])


In [None]:
class SingleHeadAttention(nn.Module):
    """One scaled dot-product self-attention head.

    Projects the input to queries, keys and values of width ``head_dim`` and
    returns the attention-weighted sum of the values.

    Args:
        d_model: size of the last dimension of the input.
        head_dim: size of the last dimension of the output.
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, head_dim, dropout):
        super().__init__()
        self.head_dim = head_dim
        self.fc_q = nn.Linear(d_model, head_dim)
        self.fc_k = nn.Linear(d_model, head_dim)
        self.fc_v = nn.Linear(d_model, head_dim)
        self.dropout = nn.Dropout(dropout)
        # sqrt(head_dim) as a buffer: it moves with the module on .to()/.cuda()
        # instead of being pinned to a device read from a notebook global.
        # persistent=False keeps it out of state_dict, so checkpoints are unchanged.
        self.register_buffer(
            "scale", torch.sqrt(torch.tensor([float(head_dim)])), persistent=False
        )

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: input whose last two dimensions are (seq_len, d_model).
            mask: optional tensor; after ``unsqueeze(1)`` it is broadcast
                against the (..., seq_len, seq_len) score matrix, and key
                positions where it equals 0 are excluded from attention.
                Presumably (batch, seq_len) with 0 at padding - confirm
                against the caller.

        Returns:
            Tensor shaped like ``x`` with the last dimension equal to ``head_dim``.
        """
        Q, K, V = self.fc_q(x), self.fc_k(x), self.fc_v(x)
        energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        if mask is not None:
            # A very negative score becomes ~0 weight after the softmax.
            energy = energy.masked_fill(mask.unsqueeze(1) == 0, -1e10)
        attention = torch.softmax(energy, dim=-1)
        return torch.matmul(self.dropout(attention), V)

In [140]:
class MultiHeadAttention(nn.Module):
    """Multi-head self-attention built from independent single-head modules.

    Each of the ``n_heads`` heads produces ``d_model // n_heads`` features; the
    head outputs are concatenated on the last dimension and passed through a
    final linear layer of size ``d_model``.
    """

    def __init__(self, d_model, n_heads, dropout):
        super().__init__()
        # The concatenated heads must add up to exactly d_model features.
        assert d_model % n_heads == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = d_model // n_heads

        head_modules = []
        for _ in range(n_heads):
            head_modules.append(SingleHeadAttention(d_model, self.head_dim, dropout))
        self.heads = nn.ModuleList(head_modules)

        self.fc_out = nn.Linear(d_model, d_model)
        # Defined but not applied in forward().
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run every head on ``x`` (with the same ``mask``) and mix their outputs."""
        per_head = tuple(attn_head(x, mask) for attn_head in self.heads)
        merged = torch.cat(per_head, dim=-1)
        return self.fc_out(merged)

In [141]:
class FeedForward(nn.Module):
    """Two-layer feed-forward network: Linear -> ReLU -> Dropout -> Linear.

    Expands the last dimension from ``d_model`` to ``ff_dim`` and projects it
    back to ``d_model``.
    """

    def __init__(self, d_model, ff_dim, dropout):
        super().__init__()
        self.linear_1 = nn.Linear(d_model, ff_dim)
        self.linear_2 = nn.Linear(ff_dim, d_model)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Return the transformed tensor; its shape equals that of ``x``."""
        hidden = self.linear_1(x)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

In [142]:
class TransformerEncoderBlock(nn.Module):
    """Encoder block: self-attention then feed-forward, each followed by
    dropout, a residual connection and layer normalisation (post-norm)."""

    def __init__(self, d_model, n_heads, ff_dim, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, ff_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Return the block output for ``x``; ``mask`` is forwarded to the attention."""
        # Sub-layer 1: attention + residual + norm.
        attended = self.dropout(self.attention(x, mask))
        hidden = self.norm1(x + attended)

        # Sub-layer 2: feed-forward + residual + norm.
        transformed = self.dropout(self.ff(hidden))
        return self.norm2(hidden + transformed)

In [143]:
class MiniBERTClassifier(nn.Module):
    """Small Transformer encoder with a linear classification head.

    Token ids are embedded, scaled by sqrt(d_model), summed with learned
    position embeddings, passed through ``n_layers`` encoder blocks, and the
    hidden state at position 0 is projected to ``num_classes`` logits.

    Args:
        vocab_size: number of rows in the token embedding table.
        d_model: embedding / hidden size.
        n_layers: number of encoder blocks.
        n_heads: attention heads per block.
        num_classes: number of output logits.
        max_len: number of positions in the position embedding table.
        dropout: dropout probability for the embeddings and the blocks.
    """

    def __init__(self, vocab_size, d_model, n_layers, n_heads, num_classes, max_len, dropout):
        super().__init__()
        self.token_embedding = nn.Embedding(vocab_size, d_model)
        self.position_embedding = nn.Embedding(max_len, d_model)
        self.layers = nn.ModuleList([
            TransformerEncoderBlock(d_model, n_heads, d_model * 4, dropout)
            for _ in range(n_layers)
        ])
        self.dropout = nn.Dropout(dropout)
        self.fc_out = nn.Linear(d_model, num_classes)
        # sqrt(d_model) as a buffer so it follows the module across devices
        # rather than depending on a notebook-level global. persistent=False
        # keeps state_dict keys unchanged.
        self.register_buffer(
            "scale", torch.sqrt(torch.tensor([float(d_model)])), persistent=False
        )

    def forward(self, src, src_mask):
        """Compute class logits.

        Args:
            src: 2-D tensor of token ids, (batch_size, seq_len).
            src_mask: mask handed unchanged to every encoder block.

        Returns:
            Tensor of shape (batch_size, num_classes).

        Raises:
            ValueError: if seq_len is larger than the position table.
        """
        batch_size, seq_len = src.shape
        max_positions = self.position_embedding.num_embeddings
        if seq_len > max_positions:
            raise ValueError(
                f"Sequence length {seq_len} exceeds max_len {max_positions}"
            )

        # Position ids are created on the input's device; the resulting
        # (seq_len, d_model) embeddings broadcast over the batch dimension.
        pos = torch.arange(seq_len, device=src.device)
        tok_emb = self.token_embedding(src) * self.scale
        pos_emb = self.position_embedding(pos).unsqueeze(0)
        x = self.dropout(tok_emb + pos_emb)
        for layer in self.layers:
            x = layer(x, src_mask)
        # The first position (where the dataset places [CLS]) summarises the sequence.
        cls_output = x[:, 0, :]
        return self.fc_out(cls_output)

In [144]:
# Build the classifier from CONFIG; the embedding table is sized to the
# vocabulary that was actually built, not to CONFIG['vocab_size'].
model = MiniBERTClassifier(
    num_classes=NUM_CLASSES,
    vocab_size=len(tokenizer.word_to_idx),
    max_len=CONFIG['max_len'],
    d_model=CONFIG['d_model'],
    n_heads=CONFIG['n_heads'],
    n_layers=CONFIG['n_layers'],
    dropout=CONFIG['dropout'],
)
# Module.to() moves the parameters in place.
model.to(CONFIG['device'])

# Class-weighted cross-entropy to counter the label imbalance.
criterion = nn.CrossEntropyLoss(weight=weights)
optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['lr'])

def train_epoch(model, iterator, optimizer, criterion):
    """Train ``model`` for one pass over ``iterator``.

    Each batch is a dict with 'ids', 'mask' and 'label' tensors, which are
    moved to CONFIG['device'] before the forward pass.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    running_loss = 0
    for batch in iterator:
        device = CONFIG['device']
        ids = batch['ids'].to(device)
        mask = batch['mask'].to(device)
        label = batch['label'].to(device)

        optimizer.zero_grad()
        batch_loss = criterion(model(ids, mask), label)
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Evaluate ``model`` on ``iterator`` without tracking gradients.

    Returns:
        A 4-tuple ``(mean per-batch loss, macro-averaged recall,
        predicted class ids, true class ids)``. Despite the historical
        "accuracy" naming at call sites, the second value is macro recall.
    """
    model.eval()
    total_loss = 0
    predictions, targets = [], []
    with torch.no_grad():
        for batch in iterator:
            device = CONFIG['device']
            ids = batch['ids'].to(device)
            mask = batch['mask'].to(device)
            label = batch['label'].to(device)

            logits = model(ids, mask)
            total_loss += criterion(logits, label).item()

            # Predicted class = index of the largest logit.
            batch_preds = torch.argmax(logits, dim=1)
            predictions.extend(batch_preds.cpu().numpy())
            targets.extend(label.cpu().numpy())
    macro_recall = recall_score(targets, predictions, average='macro')
    return total_loss / len(iterator), macro_recall, predictions, targets

print("\nStarting training...")
for epoch in range(CONFIG['epochs']):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    # evaluate() returns (loss, macro recall, preds, labels); only the first two are logged.
    val_loss, val_accuracy = evaluate(model, val_loader, criterion)[:2]
    print(f"Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Val. Loss: {val_loss:.3f} | Val. Recall: {val_accuracy*100:.2f}%")

# Final evaluation: per-class precision / recall / F1 on the validation set.
print("\nFinal Evaluation...")
val_preds, val_labels = evaluate(model, val_loader, criterion)[2:]
report = classification_report(
    val_labels,
    val_preds,
    target_names=[int_to_label[class_id] for class_id in range(NUM_CLASSES)],
    zero_division=0,
)
print(report)


Starting training...
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96]

torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 96]) torch.Size([32, 96]) torch.Size([32, 1, 96])
torch.Size([32, 96, 