In [197]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
from torchvision.transforms import v2
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import recall_score, classification_report
from PIL import Image
import os
import numpy as np
from einops import rearrange

In [198]:

# --- 1. Configuration & Hyperparameters for ViT ---
# Single place for every tunable value used by the cells below.
CONFIG = dict(
    image_size=224,      # Standard ViT input size
    patch_size=16,       # Standard ViT patch size
    in_channels=3,       # RGB images
    d_model=128,         # Embedding dimension
    n_heads=4,           # Number of attention heads
    n_layers=2,          # Number of Transformer blocks
    batch_size=256,
    epochs=25,
    lr=1e-3,
    dropout=0.1,
    # Use the GPU when PyTorch can see one, otherwise run on the CPU.
    device="cuda" if torch.cuda.is_available() else "cpu",
)

In [199]:

# ✨ ==================================================================== ✨
# ✨ IMPORTANT: point this at your folder containing the images.         ✨
# Set the PAD_UFES_IMAGE_DIR environment variable to override the location
# without editing the notebook; the original path remains the default.
IMAGE_DIR = os.environ.get(
    'PAD_UFES_IMAGE_DIR',
    '/home/pedrobouzon/life/datasets/pad-ufes-20/images/',
)
# ✨ ==================================================================== ✨

In [200]:
# --- 2. Data Loading and Preprocessing ---
# Read the metadata table (one row per image) from the working directory.
print("Loading CSV data...")
df = pd.read_csv(filepath_or_buffer='data.csv')
# Bare expression so the notebook renders the frame as the cell output.
df

Loading CSV data...


Unnamed: 0,patient_id,diagnostic,diagnostic_number,img_id,folder,sentence
0,PAT_1516,NEV,3,PAT_1516_1765_530.png,1,"Patient History: Age: 8, Lesion region: arm, L..."
1,PAT_46,BCC,1,PAT_46_881_939.png,5,"Patient History: Age: 55, Gender: female, Mate..."
2,PAT_1545,ACK,0,PAT_1545_1867_547.png,1,"Patient History: Age: 77, Lesion region: face,..."
3,PAT_1989,ACK,0,PAT_1989_4061_934.png,1,"Patient History: Age: 75, Lesion region: hand,..."
4,PAT_684,BCC,1,PAT_684_1302_588.png,1,"Patient History: Age: 79, Gender: male, Matern..."
...,...,...,...,...,...,...
2293,PAT_1708,ACK,0,PAT_1708_3156_175.png,1,"Patient History: Age: 73, Lesion region: hand,..."
2294,PAT_46,BCC,1,PAT_46_880_140.png,5,"Patient History: Age: 55, Gender: female, Mate..."
2295,PAT_1343,SEK,5,PAT_1343_1217_404.png,3,"Patient History: Age: 74, Lesion region: forea..."
2296,PAT_326,BCC,1,PAT_326_690_823.png,5,"Patient History: Age: 58, Gender: female, Mate..."


In [201]:
# Distinct diagnostic codes present in the table, in order of first appearance.
df.loc[:, 'diagnostic'].unique()

array(['NEV', 'BCC', 'ACK', 'SEK', 'SCC', 'MEL'], dtype=object)

In [202]:
# Build the class vocabulary: the distinct diagnostic codes in sorted order,
# each paired with its position in that order (ACK -> 0, BCC -> 1, ...).
labels = sorted(df['diagnostic'].unique())
label_to_int = dict(zip(labels, range(len(labels))))
# Reverse lookup (0 -> ACK, 1 -> BCC, ...), used when reporting results.
int_to_label = dict(enumerate(labels))
# Encode every row's diagnostic string as its integer class id.
df['label'] = df['diagnostic'].map(label_to_int)

In [203]:
# The classifier head needs one output per distinct diagnostic.
CONFIG.update(num_classes=len(labels))

In [204]:
# Hold out 20% of the rows for validation, stratified on the integer label so
# both splits keep the same class proportions; the seed makes it repeatable.
# NOTE(review): the split is made per image row, but the CSV contains several
# rows for the same patient_id (e.g. PAT_46), so images of one patient can end
# up in both splits. Consider a group-aware split if that leakage matters.
train_df, val_df = train_test_split(
    df,
    stratify=df['label'],
    test_size=0.2,
    random_state=42,
)

In [205]:
# Fraction of the training rows that belongs to each class id.
train_df['label'].value_counts().div(len(train_df))

label
1    0.367791
0    0.317737
3    0.106094
5    0.102285
4    0.083243
2    0.022851
Name: count, dtype: float64

In [206]:
# Per-class share of each split, indexed by class id.
class_percentage_train = train_df['label'].value_counts() / len(train_df)
class_percentage_val = val_df['label'].value_counts() / len(val_df)

# value_counts() orders its rows by frequency, and np.allclose compares the
# values position by position. Align both Series on the class id first so each
# class is compared with itself even if the frequency ranking differs between
# the splits; a class missing from one split counts as a share of 0.
class_percentage_train, class_percentage_val = class_percentage_train.align(
    class_percentage_val, fill_value=0.0
)

# True when every class share matches within the atol of 0.01 (on top of
# np.allclose's default relative tolerance).
np.allclose(class_percentage_train, class_percentage_val, atol=0.01)

True

In [207]:
# --- 3. Custom PyTorch Dataset for CSV and Images ---
class SkinLesionImageDataset(Dataset):
    """Dataset pairing each dataframe row with the image file it names.

    Args:
        dataframe: Table with an ``img_id`` column (image file name) and a
            ``label`` column (integer class id).
        image_dir: Directory the ``img_id`` file names are joined onto.
        transform: Optional callable applied to the loaded RGB image.
    """

    def __init__(self, dataframe, image_dir, transform=None):
        self.dataframe = dataframe
        self.image_dir = image_dir
        self.transform = transform

    def __len__(self):
        """Number of samples, i.e. number of rows in the dataframe."""
        return len(self.dataframe)

    def __getitem__(self, idx):
        """Return ``(image, label)`` for the row at position ``idx``.

        The image is opened with Pillow, converted to RGB and passed through
        ``transform`` when one was given; the label is a ``torch.long`` scalar
        tensor.
        """
        # Positional lookup: idx counts rows, it is not the dataframe index.
        row = self.dataframe.iloc[idx]
        file_name = row['img_id']
        target = row['label']

        full_path = os.path.join(self.image_dir, file_name)
        image = Image.open(full_path).convert("RGB")

        # Same truthiness test as before: a missing transform leaves the PIL image as is.
        image = self.transform(image) if self.transform else image

        return image, torch.tensor(target, dtype=torch.long)

# Preprocessing shared by both splits: resize to the square input size the
# ViT is configured for, convert to a tensor, then shift/scale every channel
# with mean 0.5 and std 0.5 (-> [-1, 1]).
transform = transforms.Compose([
    transforms.Resize(size=(CONFIG['image_size'],) * 2),
    transforms.ToTensor(),
    transforms.Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),
])

# One dataset per split, both reading images from the same directory.
train_dataset = SkinLesionImageDataset(dataframe=train_df, image_dir=IMAGE_DIR, transform=transform)
val_dataset = SkinLesionImageDataset(dataframe=val_df, image_dir=IMAGE_DIR, transform=transform)

# Only the training loader shuffles; validation keeps the dataframe order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=CONFIG['batch_size'],
    shuffle=True,
    num_workers=8,
)
val_loader = DataLoader(
    dataset=val_dataset,
    batch_size=CONFIG['batch_size'],
    shuffle=False,
    num_workers=4,
)

In [208]:
class SingleHeadAttention(nn.Module):
    """One scaled dot-product self-attention head.

    The input is projected to queries, keys and values of width ``head_dim``;
    the query/key dot products are divided by ``sqrt(head_dim)``, soft-maxed
    over the key axis, passed through dropout and used to mix the values.

    Args:
        d_model: Size of the last dimension of the input.
        head_dim: Output width of the query, key and value projections.
        dropout: Dropout probability applied to the attention weights.
    """

    def __init__(self, d_model, head_dim, dropout):
        super().__init__()
        self.head_dim = head_dim
        self.fc_q = nn.Linear(d_model, head_dim)
        self.fc_k = nn.Linear(d_model, head_dim)
        self.fc_v = nn.Linear(d_model, head_dim)
        self.dropout = nn.Dropout(dropout)
        # Plain Python float instead of a tensor pinned to CONFIG['device']:
        # the module no longer depends on a notebook global, and the divisor
        # works on whatever device/dtype the activations live on, including
        # after model.to(...).
        self.scale = head_dim ** 0.5

    def forward(self, x, mask=None):
        """Apply self-attention to ``x``.

        Args:
            x: Tensor whose last dimension is ``d_model`` and whose
                second-to-last dimension indexes the sequence positions.
            mask: Optional tensor; wherever ``mask.unsqueeze(1) == 0`` the
                attention score is replaced by -1e10 before the softmax.
                Presumably shaped (batch, seq) so it masks key positions;
                no caller in this notebook passes one, so confirm before use.

        Returns:
            Tensor shaped like ``x`` except that the last dimension is
            ``head_dim``.
        """
        Q, K, V = self.fc_q(x), self.fc_k(x), self.fc_v(x)
        energy = torch.matmul(Q, K.transpose(-2, -1)) / self.scale
        if mask is not None:
            energy = energy.masked_fill(mask.unsqueeze(1) == 0, -1e10)
        attention = torch.softmax(energy, dim=-1)
        return torch.matmul(self.dropout(attention), V)

class MultiHeadAttention(nn.Module):
    """Runs ``n_heads`` independent attention heads and merges their outputs.

    Every head is a ``SingleHeadAttention`` of width ``d_model // n_heads``;
    the head outputs are concatenated on the last axis and passed through a
    final ``d_model -> d_model`` linear layer.

    Args:
        d_model: Input/output width; must be divisible by ``n_heads``.
        n_heads: Number of attention heads.
        dropout: Dropout probability handed to every head.
    """

    def __init__(self, d_model, n_heads, dropout):
        super().__init__()
        per_head_width, leftover = divmod(d_model, n_heads)
        assert leftover == 0
        self.d_model = d_model
        self.n_heads = n_heads
        self.head_dim = per_head_width
        self.heads = nn.ModuleList(
            SingleHeadAttention(d_model, self.head_dim, dropout)
            for _ in range(n_heads)
        )
        self.fc_out = nn.Linear(d_model, d_model)

    def forward(self, x, mask=None):
        """Attend with every head, concatenate on the last axis, then project."""
        per_head = tuple(head(x, mask) for head in self.heads)
        merged = torch.cat(per_head, dim=-1)
        return self.fc_out(merged)

class FeedForward(nn.Module):
    """Position-wise two-layer MLP: Linear -> ReLU -> Dropout -> Linear.

    Args:
        d_model: Width of the input and of the output.
        ff_dim: Width of the hidden layer.
        dropout: Dropout probability applied to the hidden activations.
    """

    def __init__(self, d_model, ff_dim, dropout):
        super().__init__()
        self.linear_1 = nn.Linear(in_features=d_model, out_features=ff_dim)
        self.linear_2 = nn.Linear(in_features=ff_dim, out_features=d_model)
        self.dropout = nn.Dropout(p=dropout)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Expand to ``ff_dim``, apply ReLU and dropout, project back to ``d_model``."""
        hidden = self.linear_1(x)
        hidden = self.relu(hidden)
        hidden = self.dropout(hidden)
        return self.linear_2(hidden)

class TransformerEncoderBlock(nn.Module):
    """Encoder block: self-attention and feed-forward, each with a residual
    connection followed by LayerNorm (normalisation is applied after the add).

    Args:
        d_model: Token embedding width.
        n_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward sub-layer.
        dropout: Dropout probability used inside both sub-layers and on their
            outputs before the residual add.
    """

    def __init__(self, d_model, n_heads, ff_dim, dropout):
        super().__init__()
        self.attention = MultiHeadAttention(d_model, n_heads, dropout)
        self.norm1 = nn.LayerNorm(d_model)
        self.norm2 = nn.LayerNorm(d_model)
        self.ff = FeedForward(d_model, ff_dim, dropout)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask=None):
        """Run the attention sub-layer, then the feed-forward sub-layer."""
        # Sub-layer 1: attention -> dropout -> residual add -> LayerNorm.
        attended = self.dropout(self.attention(x, mask))
        x = self.norm1(x + attended)

        # Sub-layer 2: feed-forward -> dropout -> residual add -> LayerNorm.
        transformed = self.dropout(self.ff(x))
        return self.norm2(x + transformed)
        
class PatchEmbeddingConv(nn.Module):
    """Patch embedding implemented as a strided convolution.

    A ``Conv2d`` whose kernel size and stride both equal ``patch_size`` maps
    every non-overlapping patch to one ``d_model``-wide vector.

    Args:
        image_size: Side length of the (square) input image.
        patch_size: Side length of each square patch.
        in_channels: Number of input channels.
        d_model: Width of every patch embedding.
    """

    def __init__(self, image_size, patch_size, in_channels, d_model):
        super().__init__()
        patches_per_side = image_size // patch_size
        self.num_patches = patches_per_side * patches_per_side
        self.projection = nn.Conv2d(
            in_channels,
            d_model,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Map a batch of images to a sequence of patch embeddings.

        The convolution output has its spatial axes flattened into one axis,
        which is then swapped with the channel axis so the patch index comes
        before the embedding dimension.
        """
        feature_map = self.projection(x)
        tokens_last = torch.flatten(feature_map, start_dim=2)
        return tokens_last.permute(0, 2, 1)

class PatchEmbedding(nn.Module):
    """
    Splits an image into non-overlapping square patches with einops, flattens
    every patch into one vector and projects it linearly to ``d_model``.

    Args:
        image_size: Side length of the (square) input image.
        patch_size: Side length of each square patch.
        in_channels: Number of input channels.
        d_model: Width of every patch embedding.
    """
    def __init__(self, image_size, patch_size, in_channels, d_model):
        super().__init__()
        self.patch_size = patch_size
        self.num_patches = (image_size // patch_size) ** 2

        # Number of values in one flattened patch: all pixels times all channels.
        flat_patch_len = in_channels * patch_size * patch_size

        # Linear projection from a flattened patch to the embedding width.
        self.projection = nn.Linear(flat_patch_len, d_model)

    def forward(self, x):
        """Turn ``[batch, channels, height, width]`` images into
        ``[batch, num_patches, d_model]`` embeddings."""
        side = self.patch_size
        # Cut height and width into blocks of `side` pixels; each block becomes
        # one row of length p1 * p2 * c.
        patches = rearrange(
            x,
            'b c (h p1) (w p2) -> b (h w) (p1 p2 c)',
            p1=side,
            p2=side,
        )
        return self.projection(patches)
    
class VisionTransformer(nn.Module):
    """Vision Transformer classifier.

    Images are turned into patch embeddings, a learnable class token is
    prepended, learned position embeddings are added, the sequence passes
    through ``n_layers`` encoder blocks, and the class token's final state is
    fed to a LayerNorm + Linear head that outputs ``num_classes`` logits.

    Args:
        image_size: Side length of the (square) input image.
        patch_size: Side length of each square patch.
        in_channels: Number of input channels.
        num_classes: Number of output logits.
        d_model: Token embedding width.
        n_layers: Number of encoder blocks.
        n_heads: Attention heads per encoder block.
        dropout: Dropout probability used throughout the model.
    """

    def __init__(self, image_size, patch_size, in_channels, num_classes,
                 d_model, n_layers, n_heads, dropout):
        super().__init__()
        self.patch_embedding = PatchEmbedding(image_size, patch_size, in_channels, d_model)
        self.cls_token = nn.Parameter(torch.randn(1, 1, d_model))
        # One position vector per patch plus one for the class token.
        sequence_length = self.patch_embedding.num_patches + 1
        self.position_embedding = nn.Parameter(torch.randn(1, sequence_length, d_model))
        # The feed-forward hidden width is four times the embedding width.
        ff_dim = d_model * 4
        self.transformer_encoder = nn.ModuleList([
            TransformerEncoderBlock(d_model, n_heads, ff_dim, dropout)
            for _ in range(n_layers)
        ])
        self.mlp_head = nn.Sequential(
            nn.LayerNorm(d_model),
            nn.Linear(d_model, num_classes),
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x):
        """Return one row of ``num_classes`` logits per image in ``x``."""
        tokens = self.patch_embedding(x)

        # Replicate the single class token across the batch and put it first.
        cls_tokens = self.cls_token.expand(x.shape[0], -1, -1)
        tokens = torch.cat((cls_tokens, tokens), dim=1)

        tokens = self.dropout(tokens + self.position_embedding)

        for block in self.transformer_encoder:
            tokens = block(tokens)

        # Classify from the class token (sequence position 0) only.
        return self.mlp_head(tokens[:, 0])

In [209]:
# Inverse class frequency: count the training rows per class id, turn the
# counts into fractions of the training set and take the reciprocal, so the
# rarest class receives the largest weight.
train_label_counts = torch.bincount(torch.tensor(train_df['label'].values))
weights = 1 / (train_label_counts / len(train_df))

print('Class weights ')
for i, label in int_to_label.items():
    print(f'{label}: {weights[i]:.2f}')

Class weights 
ACK: 3.15
BCC: 2.72
MEL: 43.76
NEV: 9.43
SCC: 12.01
SEK: 9.78


In [210]:
# --- 5. Training and Evaluation Loop ---
# Build the model from the shared CONFIG and move it to the chosen device.
model = VisionTransformer(
    **{
        key: CONFIG[key]
        for key in (
            'image_size', 'patch_size', 'in_channels', 'num_classes',
            'd_model', 'n_layers', 'n_heads', 'dropout',
        )
    }
).to(CONFIG['device'])

optimizer = torch.optim.AdamW(params=model.parameters(), lr=CONFIG['lr'])

# Inverse-frequency class weights (reciprocal of each class's share of the
# training rows), placed on the same device as the model for the loss.
class_frequencies = torch.bincount(torch.tensor(train_df['label'].values)) / len(train_df)
weights = 1 / class_frequencies.to(CONFIG['device'])
criterion = nn.CrossEntropyLoss(weight=weights)

def train_epoch(model, iterator, optimizer, criterion):
    """Train ``model`` for one pass over ``iterator``.

    Args:
        model: Network to optimise; switched to training mode.
        iterator: Iterable of ``(images, labels)`` batches that supports ``len()``.
        optimizer: Optimizer updating the model's parameters.
        criterion: Loss called as ``criterion(output, labels)``.

    Returns:
        The mean of the per-batch loss values (sum divided by the number of
        batches).
    """
    model.train()
    device = CONFIG['device']
    running_loss = 0
    for batch_images, batch_labels in iterator:
        batch_images = batch_images.to(device)
        batch_labels = batch_labels.to(device)

        logits = model(batch_images)
        batch_loss = criterion(logits, batch_labels)

        # Clear gradients left from the previous step before back-propagating.
        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

        running_loss += batch_loss.item()
    return running_loss / len(iterator)

def evaluate(model, iterator, criterion):
    """Score ``model`` on every batch of ``iterator`` without computing gradients.

    Args:
        model: Network to evaluate; switched to eval mode.
        iterator: Iterable of ``(images, labels)`` batches that supports ``len()``.
        criterion: Loss called as ``criterion(output, labels)``.

    Returns:
        A 4-tuple ``(mean_batch_loss, macro_recall, predictions, targets)``.
        The second element is the macro-averaged recall from scikit-learn;
        the last two are flat lists of predicted and true class ids.
    """
    model.eval()
    device = CONFIG['device']
    running_loss = 0
    all_preds = []
    all_labels = []
    with torch.no_grad():
        for batch_images, batch_labels in iterator:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_images)
            running_loss += criterion(logits, batch_labels).item()

            # The predicted class is the index of the largest logit.
            batch_preds = torch.argmax(logits, dim=1)
            all_preds.extend(batch_preds.cpu().numpy())
            all_labels.extend(batch_labels.cpu().numpy())

    macro_recall = recall_score(all_labels, all_preds, average='macro')
    mean_loss = running_loss / len(iterator)
    return mean_loss, macro_recall, all_preds, all_labels

print("\nStarting ViT training...")
# The loop needs the image files under IMAGE_DIR to be readable by the data
# loaders. After every epoch the model is scored on the validation loader and
# its weights are checkpointed whenever the validation loss hits a new minimum.
CHECKPOINT_PATH = 'best_vit_model.pth'  # single source for save and reload
best_epoch = None  # stays None if no epoch ever improves on the initial inf
min_loss = float('inf')
for epoch in range(CONFIG['epochs']):
    train_loss = train_epoch(model, train_loader, optimizer, criterion)
    # evaluate() returns macro-averaged recall as its second value, which is
    # what the "Val. Recall" column below reports.
    val_loss, val_accuracy, _, _ = evaluate(model, val_loader, criterion)
    if val_loss < min_loss:
        min_loss = val_loss
        best_epoch = epoch + 1
        torch.save(model.state_dict(), CHECKPOINT_PATH)
    print(f"Epoch: {epoch+1:02} | Train Loss: {train_loss:.3f} | Val. Loss: {val_loss:.3f} | Val. Recall: {val_accuracy*100:.2f}%")

if best_epoch is not None:
    # Reload the best weights; map_location keeps the load working even when
    # the checkpoint tensors were saved from a different device.
    state_dict = torch.load(CHECKPOINT_PATH, map_location=CONFIG['device'])
    model.load_state_dict(state_dict, strict=True)
else:
    # Nothing was saved in this run (zero epochs, or the loss never compared
    # below inf, e.g. NaN): do not silently load a stale file from an earlier run.
    print("No checkpoint was written during this run; evaluating the current weights.")

print("\nFinal ViT Evaluation...")
# NOTE(review): this report uses the same validation split that selected the
# checkpoint, so the numbers are likely optimistic for unseen data.
_, _, val_preds, val_labels = evaluate(model, val_loader, criterion)
report = classification_report(
    val_labels,
    val_preds,
    target_names=[int_to_label[i] for i in range(CONFIG['num_classes'])],
    zero_division=0
)

print(report)


Starting ViT training...


Epoch: 01 | Train Loss: 1.907 | Val. Loss: 1.854 | Val. Recall: 16.67%
Epoch: 02 | Train Loss: 1.829 | Val. Loss: 1.786 | Val. Recall: 20.77%
Epoch: 03 | Train Loss: 1.781 | Val. Loss: 1.718 | Val. Recall: 24.78%
Epoch: 04 | Train Loss: 1.683 | Val. Loss: 1.615 | Val. Recall: 30.40%
Epoch: 05 | Train Loss: 1.617 | Val. Loss: 1.469 | Val. Recall: 35.03%
Epoch: 06 | Train Loss: 1.512 | Val. Loss: 1.381 | Val. Recall: 39.83%
Epoch: 07 | Train Loss: 1.513 | Val. Loss: 1.362 | Val. Recall: 41.52%
Epoch: 08 | Train Loss: 1.365 | Val. Loss: 1.315 | Val. Recall: 51.72%
Epoch: 09 | Train Loss: 1.360 | Val. Loss: 1.302 | Val. Recall: 45.61%
Epoch: 10 | Train Loss: 1.313 | Val. Loss: 1.279 | Val. Recall: 51.44%
Epoch: 11 | Train Loss: 1.268 | Val. Loss: 1.237 | Val. Recall: 51.27%
Epoch: 12 | Train Loss: 1.272 | Val. Loss: 1.286 | Val. Recall: 44.95%
Epoch: 13 | Train Loss: 1.266 | Val. Loss: 1.305 | Val. Recall: 47.01%
Epoch: 14 | Train Loss: 1.271 | Val. Loss: 1.371 | Val. Recall: 46.42%
Epoch: