In [73]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, recall_score
from sklearn.utils.class_weight import compute_class_weight
from collections import Counter
import re
from PIL import Image
import os

In [74]:
# --- 1. Configuration for Multimodal Model ---
# Central hyper-parameter table read by the dataset, model and training cells.
CONFIG = dict(
    max_text_len=128,   # Max length for text tokens
    image_size=224,
    patch_size=16,
    d_model=128,        # Must be consistent for both modalities
    n_heads=4,
    n_layers=2,
    batch_size=16,      # Smaller batch size for larger model
    epochs=25,
    lr=1e-4,
    dropout=0.1,
    vocab_size=5000,
    # Use the GPU when one is visible to torch, otherwise fall back to CPU.
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [75]:
# --- IMPORTANT: point this at the folder containing the images ---
# Set the PAD_UFES_IMAGE_DIR environment variable to override the location
# without editing the notebook; when it is unset the original path is used.
IMAGE_DIR = os.environ.get(
    'PAD_UFES_IMAGE_DIR',
    '/home/pedrobouzon/life/datasets/pad-ufes-20/images/',
)

In [76]:
# --- 2. Data Loading and Preprocessing ---
# Read the dataset table from 'data.csv' (resolved relative to the working directory).
# Later cells read the 'diagnostic', 'sentence' and 'img_id' columns from this frame.
print("Loading CSV data...")
df = pd.read_csv('data.csv')

Loading CSV data...


In [77]:
# Preview of a binary (malignant vs. benign) relabelling of the diagnostic codes.
# The mapped Series is only displayed as the cell output: it is NOT assigned
# back to `df`, so the following cells keep working with the original codes.
# To apply the relabelling, assign the result: df.loc[:, 'diagnostic'] = <this expression>
df['diagnostic'].map({
    **dict.fromkeys(('BCC', 'SCC', 'MEL'), 'malignant'),
    **dict.fromkeys(('ACK', 'NEV', 'SEK'), 'benign'),
})

0          benign
1       malignant
2          benign
3          benign
4       malignant
          ...    
2293       benign
2294    malignant
2295       benign
2296    malignant
2297       benign
Name: diagnostic, Length: 2298, dtype: object

In [78]:
# Encode each distinct diagnostic value as an integer class id
# (ids follow the sorted order of the values).
labels = sorted(df['diagnostic'].unique())
label_to_int = dict(zip(labels, range(len(labels))))
int_to_label = dict(enumerate(labels))  # inverse lookup: class id -> diagnostic value
df['label'] = df['diagnostic'].map(label_to_int)
NUM_CLASSES = len(label_to_int)

In [79]:
# Hold out 20% of the rows for validation; stratifying on the label keeps the
# class proportions similar in both splits, and the fixed seed makes the split repeatable.
train_df, val_df = train_test_split(
    df,
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)

In [80]:
# --- 3. Tokenizer and Dataset ---
# (Using the same SimpleTokenizer from the BERT example)
class SimpleTokenizer:
    """Lower-casing whitespace tokenizer with a frequency-capped vocabulary.

    Ids 0-3 are reserved for '[PAD]', '[UNK]', '[CLS]' and '[SEP]'; the most
    frequent words seen by ``build_vocab`` receive the ids from 4 upwards.
    """

    def __init__(self, vocab_size):
        """Store the vocabulary size cap; the vocabulary stays empty until ``build_vocab``."""
        self.vocab_size = vocab_size
        self.word_to_idx = {}

    def build_vocab(self, sentences):
        """Rebuild ``word_to_idx`` from an iterable of sentences.

        At most ``vocab_size - 4`` words are kept (most frequent first), after
        the four reserved special tokens.
        """
        word_counts = Counter()
        for sentence in sentences:
            word_counts.update(str(sentence).lower().split())

        vocab = {'[PAD]': 0, '[UNK]': 1, '[CLS]': 2, '[SEP]': 3}
        kept_words = word_counts.most_common(self.vocab_size - 4)
        for index, (word, _) in enumerate(kept_words, start=4):
            vocab[word] = index
        self.word_to_idx = vocab

    def tokenize(self, sentence):
        """Return the lower-cased, whitespace-separated tokens of ``sentence``."""
        lowered = str(sentence).lower()
        return lowered.split()

    def convert_tokens_to_ids(self, tokens):
        """Map tokens to vocabulary ids; unknown tokens map to 1 (the '[UNK]' id)."""
        lookup = self.word_to_idx.get
        return [lookup(token, 1) for token in tokens]

# Build the vocabulary from the training split only (val_df is not passed to build_vocab),
# so validation words that never occur in training map to the '[UNK]' id.
tokenizer = SimpleTokenizer(vocab_size=CONFIG['vocab_size'])
tokenizer.build_vocab(train_df['sentence'])

In [81]:
# ✨ New MultiModal Dataset ✨
class MultiModalDataset(Dataset):
    """Dataset that pairs each row's text with its image and integer label.

    Args:
        dataframe: table with the columns 'sentence', 'img_id' and 'label'.
        image_dir: directory that the 'img_id' file names are joined onto.
        tokenizer: object providing ``tokenize``, ``convert_tokens_to_ids`` and
            a ``word_to_idx`` mapping that contains '[PAD]'.
        max_text_len: fixed length every text id sequence is truncated/padded to.
        transform: optional callable applied to the loaded RGB PIL image.

    Each item is a dict with the keys 'text_ids' (long tensor of length
    ``max_text_len``), 'image' (the transformed image) and 'label' (long tensor).
    """

    def __init__(self, dataframe, image_dir, tokenizer, max_text_len, transform=None):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.tokenizer = tokenizer
        self.transform = transform
        self.max_text_len = max_text_len

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        row = self.dataframe.iloc[idx]

        # Text: tokenize, truncate to max_text_len, then right-pad with the [PAD] id.
        text_tokens = self.tokenizer.tokenize(row['sentence'])
        text_ids = self.tokenizer.convert_tokens_to_ids(text_tokens)[:self.max_text_len]
        padding_len = self.max_text_len - len(text_ids)
        text_ids = text_ids + [self.tokenizer.word_to_idx['[PAD]']] * padding_len

        # Image: open inside a context manager so the file handle is closed
        # deterministically; convert() returns a fully loaded image that stays
        # valid after the file has been closed.
        img_path = os.path.join(self.image_dir, row['img_id'])
        with Image.open(img_path) as img_file:
            image = img_file.convert("RGB")
        if self.transform:
            image = self.transform(image)

        return {
            'text_ids': torch.tensor(text_ids, dtype=torch.long),
            'image': image,
            'label': torch.tensor(row['label'], dtype=torch.long)
        }

# Image preprocessing: resize to a square of CONFIG['image_size'], convert to a
# tensor, then normalise every channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize((CONFIG['image_size'], CONFIG['image_size'])),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5)),
])

# Both splits share the tokenizer, text length and image transform.
train_dataset = MultiModalDataset(
    dataframe=train_df,
    image_dir=IMAGE_DIR,
    tokenizer=tokenizer,
    max_text_len=CONFIG['max_text_len'],
    transform=transform,
)
val_dataset = MultiModalDataset(
    dataframe=val_df,
    image_dir=IMAGE_DIR,
    tokenizer=tokenizer,
    max_text_len=CONFIG['max_text_len'],
    transform=transform,
)

# Only the training loader shuffles; validation keeps the frame order.
train_loader = DataLoader(train_dataset, batch_size=CONFIG['batch_size'], shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=CONFIG['batch_size'], shuffle=False)

In [82]:
# --- 4. Reusing and Building the Model ---
# Reusing previously defined modules: SingleHeadAttention, MultiHeadAttention, FeedForward, TransformerEncoderBlock, PatchEmbedding
class SingleHeadAttention(nn.Module):
    """One scaled dot-product self-attention head.

    Args:
        d_model: size of the last dimension of the input.
        head_dim: size of the query/key/value projections (and of the output).
        dropout: dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, head_dim, dropout):
        super().__init__()
        self.head_dim = head_dim
        self.fc_q = nn.Linear(d_model, head_dim)
        self.fc_k = nn.Linear(d_model, head_dim)
        self.fc_v = nn.Linear(d_model, head_dim)
        self.dropout = nn.Dropout(dropout)
        # Plain Python float rather than a tensor pinned to a global device:
        # the head then works on whatever device its inputs live on and keeps
        # working after model.to(...). It was never part of the state_dict.
        self.scale = float(head_dim) ** 0.5

    def forward(self, x, mask=None):
        """Attend over the second-to-last dimension of ``x``.

        Args:
            x: input whose last dimension is ``d_model``.
            mask: optional tensor; it is unsqueezed at dim 1 and positions where
                it equals 0 are excluded from the softmax
                (presumably shaped [batch, seq_len] -- confirm against callers).

        Returns:
            Tensor shaped like ``x`` but with last dimension ``head_dim``.
        """
        Q, K, V = self.fc_q(x), self.fc_k(x), self.fc_v(x)
        energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        if mask is not None:
            # A large negative score makes the softmax weight of masked keys vanish.
            energy = energy.masked_fill(mask.unsqueeze(1) == 0, -1e10)
        attention = torch.softmax(energy, dim=-1)
        return torch.matmul(self.dropout(attention), V)

class MultiHeadAttention(nn.Module):
    """Multi-head self-attention built from independent ``SingleHeadAttention`` heads.

    Args:
        d_model: model width; must be divisible by ``n_heads``.
        n_heads: number of attention heads (each of width ``d_model // n_heads``).
        dropout: dropout probability forwarded to every head.

    Raises:
        ValueError: if ``n_heads`` is not positive or does not divide ``d_model``.
    """

    def __init__(self, d_model, n_heads, dropout):
        super().__init__()
        # Fail early: otherwise the concatenated heads have width
        # n_heads * (d_model // n_heads) != d_model and fc_out only fails at
        # forward time with an opaque shape error.
        if n_heads <= 0 or d_model % n_heads != 0:
            raise ValueError(
                f"d_model ({d_model}) must be divisible by a positive n_heads ({n_heads})"
            )
        self.d_model, self.n_heads = d_model, n_heads
        self.head_dim = d_model // n_heads
        self.heads = nn.ModuleList(
            [SingleHeadAttention(d_model, self.head_dim, dropout) for _ in range(n_heads)]
        )
        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Run every head on ``x``, concatenate on the last dim and project back to ``d_model``."""
        head_outputs = [head(x, mask) for head in self.heads]
        concatenated = torch.cat(head_outputs, dim=-1)
        return self.fc_out(concatenated)

class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, d_model, ff_dim, dropout):
        super().__init__()
        # Expand to ff_dim, then project back to d_model.
        self.linear_1 = nn.Linear(d_model, ff_dim)
        self.linear_2 = nn.Linear(ff_dim, d_model)
        self.dropout = nn.Dropout(dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply the MLP to the last dimension of ``x``; the output width is ``d_model``."""
        hidden = self.linear_1(x)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

class TransformerEncoderBlock(nn.Module):
    """Encoder block: self-attention and feed-forward sub-layers.

    Each sub-layer output goes through dropout, is added to its input
    (residual connection) and is then layer-normalised.
    """

    def __init__(self, d_model, n_heads, ff_dim, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, ff_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Return the block output for ``x``; ``mask`` is forwarded to the attention layer."""
        # Sub-layer 1: attention + residual + norm.
        attended = self.norm1(x + self.dropout(self.attention(x, mask)))
        # Sub-layer 2: feed-forward + residual + norm.
        return self.norm2(attended + self.dropout(self.ff(attended)))

class PatchEmbedding(nn.Module):
    """Turn an image into a sequence of patch embeddings.

    A convolution whose kernel size and stride both equal ``patch_size`` maps
    every non-overlapping patch to one ``d_model``-dimensional vector.
    """

    def __init__(self, image_size, patch_size, in_channels, d_model):
        super().__init__()
        patches_per_side = image_size // patch_size
        self.num_patches = patches_per_side ** 2
        self.projection = nn.Conv2d(
            in_channels,
            d_model,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Map an image batch to a [batch, num_patches, d_model] tensor."""
        feature_map = self.projection(x)
        # Flatten the spatial grid into one axis, then put that axis before the channels.
        patch_sequence = feature_map.flatten(2)
        return patch_sequence.transpose(1, 2)

# ✨ New MultiModal Transformer ✨
class MultiModalTransformer(nn.Module):
    """Single-stream transformer over a [CLS] token, text tokens and image patches.

    Text ids and image patches are embedded to ``d_model``, tagged with a
    modality embedding (0 = text, 1 = image), concatenated behind a learned
    [CLS] token, given learned positional embeddings and passed through
    ``n_layers`` encoder blocks. The [CLS] output feeds the classification head.

    Args:
        num_classes: number of output logits.
        vocab_size: number of rows of the text embedding table.
        max_text_len: length of the text id sequences; the positional embedding
            is sized for exactly this many text tokens.
        image_size: side length of the (square) input images.
        patch_size: side length of each image patch.
        d_model: shared embedding width for both modalities.
        n_layers: number of encoder blocks.
        n_heads: attention heads per block.
        dropout: dropout probability used throughout.
    """

    def __init__(self, num_classes, vocab_size, max_text_len, image_size, patch_size,
                 d_model, n_layers, n_heads, dropout):
        super().__init__()
        # Text components
        self.text_embedding = nn.Embedding(vocab_size, d_model)
        # Image components
        self.patch_embedding = PatchEmbedding(image_size, patch_size, 3, d_model)
        # Shared components
        num_patches = self.patch_embedding.num_patches
        total_seq_len = max_text_len + num_patches + 1  # +1 for [CLS] token
        self.position_embedding = nn.Parameter(torch.randn(1, total_seq_len, d_model))
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))
        # Modality embedding to distinguish text vs. image
        self.modality_embedding = nn.Embedding(2, d_model)  # 0 for text, 1 for image

        self.transformer_encoder = nn.ModuleList([
            TransformerEncoderBlock(d_model, n_heads, d_model * 4, dropout)
            for _ in range(n_layers)
        ])
        self.mlp_head = nn.Sequential(nn.LayerNorm(d_model), nn.Linear(d_model, num_classes))
        self.dropout = nn.Dropout(dropout)

    def forward(self, text_ids, image):
        """Classify a batch of (text, image) pairs.

        Args:
            text_ids: long tensor [batch_size, max_text_len] of token ids.
            image: image batch accepted by the patch embedding.

        Returns:
            Logits of shape [batch_size, num_classes].
        """
        batch_size = image.shape[0]

        # 1. Process Text
        text_embeds = self.text_embedding(text_ids)  # [batch_size, text_len, d_model]
        # zeros_like already lives on text_ids' device; no global device lookup needed.
        text_modality = torch.zeros_like(text_ids, dtype=torch.long)
        text_embeds = text_embeds + self.modality_embedding(text_modality)

        # 2. Process Image
        patch_embeds = self.patch_embedding(image)  # [batch_size, num_patches, d_model]
        # Create the index tensor on the same device as the activations so the
        # model runs wherever it (and its inputs) have been moved.
        image_modality = torch.ones(
            patch_embeds.shape[:2], dtype=torch.long, device=patch_embeds.device
        )
        patch_embeds = patch_embeds + self.modality_embedding(image_modality)

        # 3. Combine sequences
        cls_tokens = self.cls_token.expand(batch_size, -1, -1)
        x = torch.cat((cls_tokens, text_embeds, patch_embeds), dim=1)
        # x shape: [batch_size, 1 + text_len + num_patches, d_model]

        # 4. Add positional embedding (out-of-place, to avoid mutating activations)
        x = x + self.position_embedding
        x = self.dropout(x)

        # 5. Pass through Transformer Encoder
        for layer in self.transformer_encoder:
            x = layer(x)

        # 6. Get [CLS] token output and classify
        cls_output = x[:, 0]
        return self.mlp_head(cls_output)

In [83]:
# --- 5. Training and Evaluation ---
# Instantiate the model; the embedding table is sized to the vocabulary that
# was actually built, which can be smaller than CONFIG['vocab_size'].
model = MultiModalTransformer(
    num_classes=NUM_CLASSES,
    vocab_size=len(tokenizer.word_to_idx),
    max_text_len=CONFIG['max_text_len'],
    image_size=CONFIG['image_size'],
    patch_size=CONFIG['patch_size'],
    d_model=CONFIG['d_model'],
    n_layers=CONFIG['n_layers'],
    n_heads=CONFIG['n_heads'],
    dropout=CONFIG['dropout'],
)
model = model.to(CONFIG['device'])

optimizer = torch.optim.AdamW(model.parameters(), lr=CONFIG['lr'])

# Class weights = inverse of each class's relative frequency in the training split.
class_counts = torch.bincount(torch.tensor(train_df['label'].values))
class_freqs = (class_counts / len(train_df)).to(CONFIG['device'])
weights = 1 / class_freqs
criterion = nn.CrossEntropyLoss(weight=weights)

def train_epoch(model, iterator, optimizer, criterion):
    """Run one optimisation pass over ``iterator``.

    Every batch is a dict with the keys 'text_ids', 'image' and 'label'; the
    tensors are moved to CONFIG['device'] before the forward pass.

    Returns:
        The mean of the per-batch losses.
    """
    device = CONFIG['device']
    model.train()
    running_loss = 0
    for batch in iterator:
        text_ids, image, label = (
            batch[key].to(device) for key in ('text_ids', 'image', 'label')
        )

        optimizer.zero_grad()
        loss = criterion(model(text_ids, image), label)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
    return running_loss / len(iterator)

# ✨ New Evaluate Function ✨
def evaluate(model, iterator, criterion):
    """Score ``model`` on ``iterator`` without computing gradients.

    Returns:
        A tuple ``(mean_batch_loss, macro_recall, all_preds, all_labels)``.
        Note that the second element is the macro-averaged recall over classes,
        not the plain accuracy; the two prediction/label lists are flat and
        follow the iteration order.
    """
    device = CONFIG['device']
    model.eval()
    running_loss = 0
    all_preds, all_labels = [], []
    with torch.no_grad():
        for batch in iterator:
            text_ids = batch['text_ids'].to(device)
            image = batch['image'].to(device)
            label = batch['label'].to(device)

            logits = model(text_ids, image)
            running_loss += criterion(logits, label).item()

            # Predicted class = index of the largest logit.
            all_preds.extend(logits.argmax(dim=1).cpu().numpy())
            all_labels.extend(label.cpu().numpy())
    macro_recall = recall_score(all_labels, all_preds, average='macro', zero_division=0)
    return running_loss / len(iterator), macro_recall, all_preds, all_labels

# ✨ Full Training and Evaluation Loop ✨
print("\nStarting MultiModal Transformer training...")
# NOTE: this loop reads the image files, so IMAGE_DIR must point at the dataset images.
# The weights with the lowest validation loss are checkpointed and restored afterwards.
BEST_MODEL_PATH = 'best_multimodal_model.pth'
min_loss = float('inf')
for epoch in range(CONFIG['epochs']):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    val_loss, val_accuracy, _, _ = evaluate(model, val_loader, criterion)
    if val_loss < min_loss:
        min_loss = val_loss
        torch.save(model.state_dict(), BEST_MODEL_PATH)
    # The value printed as "Val. Acc" is the second value returned by evaluate(),
    # which computes macro-averaged recall.
    print(f"Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Val. Loss: {val_loss:.3f} | Val. Acc: {val_accuracy*100:.2f}%")

# map_location keeps the reload working even when the checkpoint was written
# on a different device than the one currently selected in CONFIG.
state_dict = torch.load(BEST_MODEL_PATH, map_location=CONFIG['device'])
model.load_state_dict(state_dict, strict=True)
# ✨ Final Evaluation with Classification Report ✨
print("\nFinal MultiModal Evaluation...")
# Per-class precision/recall/F1 of the restored best checkpoint on the validation split.
_, _, val_preds, val_labels = evaluate(model, val_loader, criterion)
report = classification_report(
    val_labels,
    val_preds,
    target_names=[int_to_label[i] for i in range(NUM_CLASSES)],
    zero_division=0
)
print(report)


Starting MultiModal Transformer training...


Epoch: 01 | Train Loss: 1.800 | Val. Loss: 1.688 | Val. Acc: 22.44%
Epoch: 02 | Train Loss: 1.546 | Val. Loss: 1.408 | Val. Acc: 30.96%
Epoch: 03 | Train Loss: 1.338 | Val. Loss: 1.230 | Val. Acc: 50.05%
Epoch: 04 | Train Loss: 1.257 | Val. Loss: 1.107 | Val. Acc: 54.02%
Epoch: 05 | Train Loss: 1.159 | Val. Loss: 1.041 | Val. Acc: 54.15%
Epoch: 06 | Train Loss: 1.135 | Val. Loss: 1.063 | Val. Acc: 51.85%
Epoch: 07 | Train Loss: 1.095 | Val. Loss: 1.046 | Val. Acc: 58.73%
Epoch: 08 | Train Loss: 1.065 | Val. Loss: 0.973 | Val. Acc: 56.28%
Epoch: 09 | Train Loss: 1.068 | Val. Loss: 0.987 | Val. Acc: 61.79%
Epoch: 10 | Train Loss: 1.046 | Val. Loss: 0.978 | Val. Acc: 57.77%
Epoch: 11 | Train Loss: 1.015 | Val. Loss: 0.986 | Val. Acc: 57.65%
Epoch: 12 | Train Loss: 0.990 | Val. Loss: 0.945 | Val. Acc: 59.43%
Epoch: 13 | Train Loss: 0.999 | Val. Loss: 0.962 | Val. Acc: 59.74%
Epoch: 14 | Train Loss: 0.962 | Val. Loss: 0.960 | Val. Acc: 62.53%
Epoch: 15 | Train Loss: 0.954 | Val. Loss: 0.935