In [6]:
!pip install tqdm

Collecting tqdm
  Downloading tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Downloading tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.67.1
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [11]:
!pip install kagglehub

Collecting kagglehub
  Downloading kagglehub-0.3.12-py3-none-any.whl.metadata (38 kB)
Downloading kagglehub-0.3.12-py3-none-any.whl (67 kB)
Installing collected packages: kagglehub
Successfully installed kagglehub-0.3.12
[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.0.1[0m[39;49m -> [0m[32;49m25.1.1[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m


In [36]:
# 0. Setup, download, and data loading via KaggleHub
import os
import random
import numpy as np
import pandas as pd
from PIL import Image
from torch.utils.data import Dataset, DataLoader, random_split, Subset
from torchvision import transforms, models
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from tqdm import tqdm
import kagglehub  # ensure kagglehub is installed and configured

# 0.1 Download the HAM10000 dataset via KaggleHub
path = kagglehub.dataset_download("kmader/skin-cancer-mnist-ham10000")
print("Dataset directory:", path)
print("Contents:", os.listdir(path))

# 0.2 Set seeds for reproducibility
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Using device:", device)

# 0.3 Identify image subdirectories
dirs = [
    "HAM10000_images_part_1","HAM10000_images_part_2",
    "ham10000_images_part_1","ham10000_images_part_2"
]
image_dirs = [os.path.join(path, d) for d in dirs if os.path.isdir(os.path.join(path, d))]
print("Found image directories:", image_dirs)

# 0.4 Path to metadata CSV
metadata_csv = os.path.join(path, "HAM10000_metadata.csv")
print("Metadata CSV:", metadata_csv)

# 0.5 Dataset class (no transform) for contrastive pairs
def make_raw_dataset(transform=None):
    class HAM10000Raw(Dataset):
        def __init__(self, metadata_csv, image_dirs):
            self.df = pd.read_csv(metadata_csv)
            self.image_dirs = image_dirs
            self.transform = transform
            self.classes = sorted(self.df['dx'].unique())
            self.class_to_idx = {c: i for i, c in enumerate(self.classes)}
        def __len__(self): return len(self.df)
        def __getitem__(self, idx):
            row = self.df.iloc[idx]
            img_id = row['image_id']
            img_path = next((os.path.join(d, f"{img_id}.jpg") for d in self.image_dirs
                            if os.path.isfile(os.path.join(d, f"{img_id}.jpg"))), None)
            if img_path is None:
                raise FileNotFoundError(f"Image for ID {img_id} not found")
            img = Image.open(img_path).convert('RGB')
            return img, self.class_to_idx[row['dx']]
    return HAM10000Raw(metadata_csv, image_dirs)

# 0.6 Raw dataset (PIL images) and Eval dataset (tensor transforms)
raw_dataset = make_raw_dataset()
basic_transform = transforms.Compose([
    transforms.Resize((224,224)),
    transforms.ToTensor(),
])

# We'll apply basic_transform later via a separate Dataset subclass

# 1. Balance top-2 classes on raw_dataset.df
dx_counts = raw_dataset.df['dx'].value_counts()
top2 = dx_counts.index[:2].tolist()
print('Selected classes:', top2)
min_ct = dx_counts[top2].min()
balanced_df = pd.concat([
    raw_dataset.df[raw_dataset.df['dx']==c].sample(min_ct, random_state=seed)
    for c in top2
]).sample(frac=1, random_state=seed).reset_index(drop=True)
# Assign balanced df to raw_dataset
raw_dataset.df = balanced_df
raw_dataset.classes = top2
raw_dataset.class_to_idx = {c:i for i,c in enumerate(top2)}

# 2. Contrastive dataset with two augmentations
aug_transform = transforms.Compose([
    transforms.RandomResizedCrop(224),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor()
])
class ContrastiveDataset(Dataset):
    def __init__(self, dataset, transform):
        self.ds = dataset
        self.transform = transform
    def __len__(self): return len(self.ds)
    def __getitem__(self, idx):
        img, label = self.ds[idx]
        xi = self.transform(img)
        xj = self.transform(img)
        return xi, xj, label

contrast_ds = ContrastiveDataset(raw_dataset, aug_transform)
n = len(contrast_ds)
train_n = int(0.8 * n)
test_n  = n - train_n
train_ds, test_ds = random_split(contrast_ds, [train_n, test_n],
                                 generator=torch.Generator().manual_seed(seed))
train_loader = DataLoader(train_ds, batch_size=32, shuffle=True, num_workers=4)
test_loader  = DataLoader(test_ds,  batch_size=32, shuffle=False, num_workers=4)

# 3. Define SimCLR components
class ProjectionHead(nn.Module):
    def __init__(self, in_dim, hidden_dim, out_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(in_dim, hidden_dim),
            nn.ReLU(inplace=True),
            nn.Linear(hidden_dim, out_dim)
        )
    def forward(self, x): return self.net(x)

class SimCLR(nn.Module):
    def __init__(self, base_encoder, projection_dim=128, hidden_dim=512):
        super().__init__()
        self.encoder = base_encoder
        feat_dim = self.encoder.fc.in_features
        self.encoder.fc = nn.Identity()
        self.proj_head = ProjectionHead(feat_dim, hidden_dim, projection_dim)
    def forward(self, x):
        h = self.encoder(x)
        z = self.proj_head(h)
        return F.normalize(h, dim=1), F.normalize(z, dim=1)

# NT-Xent loss
def nt_xent_loss(z_i, z_j, temperature=0.5):
    N = z_i.size(0)
    # concatenate embeddings
    z = torch.cat([z_i, z_j], dim=0)  # 2N x D
    # similarity matrix
    sim_mat = F.cosine_similarity(z.unsqueeze(1), z.unsqueeze(0), dim=2) / temperature
    # mask self-similarities
    mask = torch.ones((2*N, 2*N), device=device) - torch.eye(2*N, device=device)
    # exponentiate similarities
    exp_sim = torch.exp(sim_mat) * mask
    # denominator: sum over rows
    denom = exp_sim.sum(dim=1)
    # positive pairs: i->j and j->i
    sim_ij = torch.exp(F.cosine_similarity(z_i, z_j) / temperature)
    sim_ji = torch.exp(F.cosine_similarity(z_j, z_i) / temperature)
    # compute loss
    loss = -torch.log(sim_ij / denom[:N]) - torch.log(sim_ji / denom[N:])
    return loss.mean()

# 4. Train SimCLR encoder
projection_dim = 4
encoder = models.resnet18(weights=None).to(device)
model   = SimCLR(encoder, projection_dim=projection_dim, hidden_dim=512).to(device)
opt     = optim.Adam(model.parameters(), lr=1e-3)
epochs  = 50
for epoch in range(epochs):
    model.train()
    tot_loss = 0
    for xi, xj, _ in tqdm(train_loader, desc=f"SimCLR Epoch {epoch+1}/{epochs}"):
        xi, xj = xi.to(device), xj.to(device)
        _, zi = model(xi)
        _, zj = model(xj)
        loss = nt_xent_loss(zi, zj)
        opt.zero_grad(); loss.backward(); opt.step()
        tot_loss += loss.item()
    print(f"Epoch {epoch+1}, Loss: {tot_loss/len(train_loader):.4f}")

# 5. Prepare evaluation dataset using basic_transform
class EvalDataset(Dataset):
    def __init__(self, metadata, image_dirs, transform, target_df):
        self.df = target_df
        self.image_dirs = image_dirs
        self.transform = transform
        self.class_to_idx = raw_dataset.class_to_idx
    def __len__(self): return len(self.df)
    def __getitem__(self, idx):
        row = self.df.iloc[idx]
        img_id = row['image_id']
        img_path = next((os.path.join(d, f"{img_id}.jpg") for d in image_dirs
                         if os.path.isfile(os.path.join(d, f"{img_id}.jpg"))), None)
        img = Image.open(img_path).convert('RGB')
        img = self.transform(img)
        lbl = self.class_to_idx[row['dx']]
        return img, lbl

# Build eval datasets matching train/test splits
eval_dataset = EvalDataset(metadata_csv, image_dirs, basic_transform, balanced_df)
train_idx = train_ds.indices
test_idx  = test_ds.indices
eval_train = Subset(eval_dataset, train_idx)
eval_test  = Subset(eval_dataset, test_idx)
eval_train_loader = DataLoader(eval_train, batch_size=32, shuffle=False, num_workers=4)
eval_test_loader  = DataLoader(eval_test,  batch_size=32, shuffle=False, num_workers=4)

# 6. Extract embeddings for eval sets
def extract_embeddings(loader):
    model.eval()
    embs, lbls = [], []
    with torch.no_grad():
        for imgs, labels in tqdm(loader, desc="Extract Embs"):
            imgs = imgs.to(device)
            _, z = model(imgs)
            embs.append(z.cpu()); lbls.append(labels)
    return torch.cat(embs), torch.cat(lbls)

train_embs, train_lbls = extract_embeddings(eval_train_loader)
test_embs,  test_lbls  = extract_embeddings(eval_test_loader)

# 7. Define and train linear classifier
class LinearClassifier(nn.Module):
    def __init__(self, in_dim, num_classes): super().__init__(); self.fc = nn.Linear(in_dim, num_classes)
    def forward(self, x): return self.fc(x)

clf   = LinearClassifier(projection_dim, 2).to(device)
opt_c = optim.Adam(clf.parameters(), lr=1e-3)
crit  = nn.CrossEntropyLoss()
c_epochs = 5
for e in range(c_epochs:=5):
    clf.train(); tot=0; perm=torch.randperm(train_embs.size(0))
    for i in range(0, len(perm), 32):
        idx=perm[i:i+32]
        out=clf(train_embs[idx].to(device))
        loss=crit(out,train_lbls[idx].to(device))
        opt_c.zero_grad(); loss.backward(); opt_c.step()
        tot+=loss.item()
    print(f"Classifier Epoch {e+1}, Loss: {tot/(len(perm)/32):.4f}")

# 8. Evaluate classifier accuracy
def eval_acc(loader):
    clf.eval(); correct=total=0
    with torch.no_grad():
        for emb,label in loader:
            preds=clf(emb.to(device)).argmax(dim=1).cpu()
            correct+=(preds==label).sum().item()
            total+=len(label)
    return correct/total

train_ld = DataLoader(torch.utils.data.TensorDataset(train_embs,train_lbls), batch_size=32)
test_ld  = DataLoader(torch.utils.data.TensorDataset(test_embs, test_lbls),  batch_size=32)
print("Train Acc:", eval_acc(train_ld))
print("Test  Acc:", eval_acc(test_ld))

# Class-wise accuracies
def eval_per_class(loader, num_classes):
    clf.eval()
    correct = [0]*num_classes
    total   = [0]*num_classes
    with torch.no_grad():
        for emb, label in loader:
            preds = clf(emb.to(device)).argmax(dim=1).cpu()
            for p, l in zip(preds, label):
                total[l] += 1
                if p == l:
                    correct[l] += 1
    return [correct[i]/total[i] if total[i]>0 else 0.0 for i in range(num_classes)]

# Assuming two classes
num_classes = 2
train_cw = eval_per_class(train_ld, num_classes)
test_cw  = eval_per_class(test_ld,  num_classes)

# Map indices back to class names
idx_to_class = {v:k for k,v in raw_dataset.class_to_idx.items()}
train_cw_named = {idx_to_class[i]: train_cw[i] for i in range(num_classes)}
test_cw_named  = {idx_to_class[i]: test_cw[i]  for i in range(num_classes)}

print("Train class-wise accuracy:", train_cw_named)
print("Test  class-wise accuracy:", test_cw_named)

# 9. Save all embeddings and labels in a single NPZ file
# Concatenate train and test embeddings and labels
every_emb = torch.cat([train_embs, test_embs], dim=0)
every_lbl = torch.cat([train_lbls, test_lbls], dim=0)
# Convert to numpy arrays
emb_np = every_emb.cpu().numpy()
lbl_np = every_lbl.cpu().numpy()
# Save to compressed NPZ
output_file = 'ham10000_embeddings.npz'
np.savez_compressed(output_file, embeddings=emb_np, labels=lbl_np)
print(f"Saved all embeddings and labels to {output_file} (compressed NPZ)")

Dataset directory: /root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2
Contents: ['HAM10000_images_part_1', 'HAM10000_images_part_2', 'HAM10000_metadata.csv', 'ham10000_images_part_1', 'ham10000_images_part_2', 'hmnist_28_28_L.csv', 'hmnist_28_28_RGB.csv', 'hmnist_8_8_L.csv', 'hmnist_8_8_RGB.csv']
Using device: cuda
Found image directories: ['/root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2/HAM10000_images_part_1', '/root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2/HAM10000_images_part_2', '/root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2/ham10000_images_part_1', '/root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2/ham10000_images_part_2']
Metadata CSV: /root/.cache/kagglehub/datasets/kmader/skin-cancer-mnist-ham10000/versions/2/HAM10000_metadata.csv
Selected classes: ['nv', 'mel']


SimCLR Epoch 1/50: 100%|██████████| 56/56 [00:06<00:00,  9.09it/s]


Epoch 1, Loss: 6.1963


SimCLR Epoch 2/50: 100%|██████████| 56/56 [00:05<00:00,  9.98it/s]


Epoch 2, Loss: 5.6508


SimCLR Epoch 3/50: 100%|██████████| 56/56 [00:05<00:00,  9.98it/s]


Epoch 3, Loss: 5.5951


SimCLR Epoch 4/50: 100%|██████████| 56/56 [00:05<00:00,  9.48it/s]


Epoch 4, Loss: 5.5161


SimCLR Epoch 5/50: 100%|██████████| 56/56 [00:05<00:00, 10.02it/s]


Epoch 5, Loss: 5.5622


SimCLR Epoch 6/50: 100%|██████████| 56/56 [00:05<00:00, 10.00it/s]


Epoch 6, Loss: 5.4811


SimCLR Epoch 7/50: 100%|██████████| 56/56 [00:05<00:00,  9.71it/s]


Epoch 7, Loss: 5.4625


SimCLR Epoch 8/50: 100%|██████████| 56/56 [00:05<00:00,  9.92it/s]


Epoch 8, Loss: 5.4544


SimCLR Epoch 9/50: 100%|██████████| 56/56 [00:05<00:00,  9.36it/s]


Epoch 9, Loss: 5.4203


SimCLR Epoch 10/50: 100%|██████████| 56/56 [00:06<00:00,  8.99it/s]


Epoch 10, Loss: 5.4047


SimCLR Epoch 11/50: 100%|██████████| 56/56 [00:05<00:00,  9.76it/s]


Epoch 11, Loss: 5.4091


SimCLR Epoch 12/50: 100%|██████████| 56/56 [00:06<00:00,  8.03it/s]


Epoch 12, Loss: 5.3975


SimCLR Epoch 13/50: 100%|██████████| 56/56 [00:05<00:00,  9.62it/s]


Epoch 13, Loss: 5.3803


SimCLR Epoch 14/50: 100%|██████████| 56/56 [00:05<00:00,  9.97it/s]


Epoch 14, Loss: 5.3899


SimCLR Epoch 15/50: 100%|██████████| 56/56 [00:05<00:00,  9.92it/s]


Epoch 15, Loss: 5.3985


SimCLR Epoch 16/50: 100%|██████████| 56/56 [00:05<00:00,  9.98it/s]


Epoch 16, Loss: 5.3792


SimCLR Epoch 17/50: 100%|██████████| 56/56 [00:06<00:00,  8.81it/s]


Epoch 17, Loss: 5.3902


SimCLR Epoch 18/50: 100%|██████████| 56/56 [00:05<00:00,  9.56it/s]


Epoch 18, Loss: 5.3705


SimCLR Epoch 19/50: 100%|██████████| 56/56 [00:05<00:00,  9.98it/s]


Epoch 19, Loss: 5.3615


SimCLR Epoch 20/50: 100%|██████████| 56/56 [00:05<00:00,  9.92it/s]


Epoch 20, Loss: 5.3584


SimCLR Epoch 21/50: 100%|██████████| 56/56 [00:06<00:00,  8.49it/s]


Epoch 21, Loss: 5.3744


SimCLR Epoch 22/50: 100%|██████████| 56/56 [00:06<00:00,  8.36it/s]


Epoch 22, Loss: 5.3594


SimCLR Epoch 23/50: 100%|██████████| 56/56 [00:05<00:00,  9.60it/s]


Epoch 23, Loss: 5.3901


SimCLR Epoch 24/50: 100%|██████████| 56/56 [00:05<00:00,  9.55it/s]


Epoch 24, Loss: 5.3552


SimCLR Epoch 25/50: 100%|██████████| 56/56 [00:05<00:00,  9.82it/s]


Epoch 25, Loss: 5.3724


SimCLR Epoch 26/50: 100%|██████████| 56/56 [00:06<00:00,  8.61it/s]


Epoch 26, Loss: 5.3566


SimCLR Epoch 27/50: 100%|██████████| 56/56 [00:05<00:00,  9.89it/s]


Epoch 27, Loss: 5.3573


SimCLR Epoch 28/50: 100%|██████████| 56/56 [00:05<00:00,  9.91it/s]


Epoch 28, Loss: 5.3545


SimCLR Epoch 29/50: 100%|██████████| 56/56 [00:05<00:00,  9.52it/s]


Epoch 29, Loss: 5.3609


SimCLR Epoch 30/50: 100%|██████████| 56/56 [00:05<00:00,  9.70it/s]


Epoch 30, Loss: 5.3582


SimCLR Epoch 31/50: 100%|██████████| 56/56 [00:06<00:00,  8.77it/s]


Epoch 31, Loss: 5.3461


SimCLR Epoch 32/50: 100%|██████████| 56/56 [00:05<00:00,  9.52it/s]


Epoch 32, Loss: 5.3494


SimCLR Epoch 33/50: 100%|██████████| 56/56 [00:05<00:00,  9.73it/s]


Epoch 33, Loss: 5.3525


SimCLR Epoch 34/50: 100%|██████████| 56/56 [00:06<00:00,  8.46it/s]


Epoch 34, Loss: 5.3614


SimCLR Epoch 35/50: 100%|██████████| 56/56 [00:06<00:00,  8.50it/s]


Epoch 35, Loss: 5.3627


SimCLR Epoch 36/50: 100%|██████████| 56/56 [00:06<00:00,  8.96it/s]


Epoch 36, Loss: 5.3495


SimCLR Epoch 37/50: 100%|██████████| 56/56 [00:05<00:00,  9.89it/s]


Epoch 37, Loss: 5.3566


SimCLR Epoch 38/50: 100%|██████████| 56/56 [00:06<00:00,  8.19it/s]


Epoch 38, Loss: 5.3378


SimCLR Epoch 39/50: 100%|██████████| 56/56 [00:05<00:00,  9.89it/s]


Epoch 39, Loss: 5.3559


SimCLR Epoch 40/50: 100%|██████████| 56/56 [00:06<00:00,  9.28it/s]


Epoch 40, Loss: 5.3427


SimCLR Epoch 41/50: 100%|██████████| 56/56 [00:06<00:00,  9.02it/s]


Epoch 41, Loss: 5.3464


SimCLR Epoch 42/50: 100%|██████████| 56/56 [00:06<00:00,  8.97it/s]


Epoch 42, Loss: 5.3481


SimCLR Epoch 43/50: 100%|██████████| 56/56 [00:05<00:00,  9.62it/s]


Epoch 43, Loss: 5.3420


SimCLR Epoch 44/50: 100%|██████████| 56/56 [00:06<00:00,  8.42it/s]


Epoch 44, Loss: 5.3421


SimCLR Epoch 45/50: 100%|██████████| 56/56 [00:05<00:00,  9.88it/s]


Epoch 45, Loss: 5.3528


SimCLR Epoch 46/50: 100%|██████████| 56/56 [00:06<00:00,  8.49it/s]


Epoch 46, Loss: 5.3527


SimCLR Epoch 47/50: 100%|██████████| 56/56 [00:06<00:00,  9.32it/s]


Epoch 47, Loss: 5.3451


SimCLR Epoch 48/50: 100%|██████████| 56/56 [00:06<00:00,  8.74it/s]


Epoch 48, Loss: 5.3380


SimCLR Epoch 49/50: 100%|██████████| 56/56 [00:05<00:00,  9.73it/s]


Epoch 49, Loss: 5.3585


SimCLR Epoch 50/50: 100%|██████████| 56/56 [00:05<00:00,  9.41it/s]


Epoch 50, Loss: 5.3505


Extract Embs: 100%|██████████| 56/56 [00:03<00:00, 18.25it/s]
Extract Embs: 100%|██████████| 14/14 [00:01<00:00, 13.40it/s]


Classifier Epoch 1, Loss: 0.7430
Classifier Epoch 2, Loss: 0.7108
Classifier Epoch 3, Loss: 0.6840
Classifier Epoch 4, Loss: 0.6625
Classifier Epoch 5, Loss: 0.6438
Train Acc: 0.699438202247191
Test  Acc: 0.726457399103139
Train class-wise accuracy: {'nv': 0.7222222222222222, 'mel': 0.6770601336302895}
Test  class-wise accuracy: {'nv': 0.7316017316017316, 'mel': 0.7209302325581395}
Saved all embeddings and labels to ham10000_embeddings.npz (compressed NPZ)
