# Hugging Face NPZ Dataset with Train/Val/Test Split

This notebook contains the entire pipeline for training a PyTorch model on image patches stored as `.npz` files in a Hugging Face dataset repository, with labels read from a local `metadata/label.csv`. The indexed patches are then split, stratified by label, into train/validation/test sets.

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import requests
from io import BytesIO
import pandas as pd
import torchvision
import matplotlib.pyplot as plt
from PIL import Image
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
import random
from collections import defaultdict

## Define dataset index creation function and split function

In [None]:
def make_data_index(repo_id, label_csv_path, timeout=30):
    """Build a flat index of every array stored in the labelled ``.npz`` files.

    The CSV at ``label_csv_path`` is read for its ``pub_subspec_id`` and
    ``label`` columns. For each id the matching ``.npz`` file is downloaded
    from the Hugging Face dataset repo ``repo_id`` and one entry is recorded
    per array key found in the archive.

    Args:
        repo_id: Hugging Face dataset repository id (``"owner/name"``).
        label_csv_path: Path to a CSV with ``pub_subspec_id`` and ``label``
            columns.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``(url, key, label)`` tuples. Files that cannot be
        downloaded or opened are reported on stdout and skipped.
    """
    df = pd.read_csv(label_csv_path)
    # dict() keeps the last label when an id appears more than once.
    filename_to_label = dict(zip(df['pub_subspec_id'], df['label']))
    data_index = []

    for fname, label in filename_to_label.items():
        # pandas may parse purely numeric ids as numbers; normalise to str
        # so the suffix check below cannot raise.
        fname = str(fname)
        fname_with_ext = fname if fname.endswith(".npz") else f"{fname}.npz"
        url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{fname_with_ext}"
        try:
            response = requests.get(url, timeout=timeout)
            # Surface HTTP errors (404, 401, ...) directly instead of letting
            # np.load choke on an error page.
            response.raise_for_status()
            with np.load(BytesIO(response.content)) as npz:
                data_index.extend((url, key, label) for key in npz.files)
        except Exception as e:
            # Deliberately best-effort: one bad file must not abort indexing.
            print(f"❌ Failed to load {fname_with_ext}: {e}")
    return data_index

def stratified_split(data_index, train_ratio=0.7, val_ratio=0.15, seed=316):
    """Split an index into train/val/test lists, stratified by label.

    Items are grouped by their third element (the label). Each group is
    shuffled with a private RNG seeded by ``seed`` and then cut into
    train/val/test slices; whatever is left after the train and validation
    slices goes to the test split.

    Args:
        data_index: Iterable of tuples whose element ``[2]`` is the label.
        train_ratio: Fraction of each label group assigned to training.
        val_ratio: Fraction of each label group assigned to validation.
        seed: Seed for the shuffle; the same seed gives the same split.

    Returns:
        A ``(train, val, test)`` tuple of lists.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    label_to_items = defaultdict(list)
    for item in data_index:
        label_to_items[item[2]].append(item)

    # A dedicated generator keeps the split reproducible without reseeding
    # the process-wide `random` module state.
    rng = random.Random(seed)
    train, val, test = [], [], []

    for items in label_to_items.values():
        rng.shuffle(items)
        n_total = len(items)
        # int() truncates, so rounding leftovers land in the test split.
        n_train = int(n_total * train_ratio)
        n_val = int(n_total * val_ratio)
        train.extend(items[:n_train])
        val.extend(items[n_train:n_train + n_val])
        test.extend(items[n_train + n_val:])

    return train, val, test


## Dataset class definition

In [3]:
class HuggingFaceNPZWithLabelDataset(Dataset):
    """Dataset that serves labelled patches from remote ``.npz`` archives.

    Each entry of ``data_index`` is a ``(url, key, label)`` tuple. Fetching
    an item downloads the archive at ``url``, reads the array stored under
    ``key``, converts it to a PIL image and applies ``transform`` if one
    was given.

    Note that the archive is downloaded again on every ``__getitem__`` call.

    Args:
        data_index: Sequence of ``(url, key, label)`` tuples.
        transform: Optional callable applied to the PIL image.
        timeout: Seconds to wait for each HTTP request before giving up.
    """

    def __init__(self, data_index, transform=None, timeout=30):
        self.data_index = data_index
        self.transform = transform
        self.timeout = timeout

    def __len__(self):
        """Return the number of indexed patches."""
        return len(self.data_index)

    def __getitem__(self, idx):
        """Download and return ``(patch, label)`` for the entry at ``idx``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
        """
        url, key, label = self.data_index[idx]
        response = requests.get(url, timeout=self.timeout)
        # Fail with the real HTTP error rather than a confusing np.load one.
        response.raise_for_status()
        # NpzFile loads arrays lazily; read the patch before closing it.
        with np.load(BytesIO(response.content)) as npz:
            patch = npz[key]

        pixels = patch.astype(np.uint8)
        if patch.ndim == 2:
            patch = Image.fromarray(pixels, mode='L')
        elif patch.shape[-1] == 3:
            patch = Image.fromarray(pixels, mode='RGB')
        else:
            # Any other layout: let PIL infer the mode from the array.
            patch = Image.fromarray(pixels)

        if self.transform:
            patch = self.transform(patch)
        return patch, int(label)


## Load and split the dataset

In [None]:
# Where the patches live (Hugging Face dataset repo) and where the labels are.
repo_id = "nayoungku1/npz-histopathology-dataset"
label_csv_path = "./metadata/label.csv"

# Every patch is resized to 256x256 and converted to a tensor.
transform = transforms.Compose([transforms.Resize((256, 256)), transforms.ToTensor()])

# Index all patches, then carve the index into three label-stratified parts.
all_index = make_data_index(repo_id, label_csv_path)
train_index, val_index, test_index = stratified_split(
    all_index,
    train_ratio=0.7,
    val_ratio=0.15,
    seed=314,
)

# One dataset per split, all sharing the same preprocessing.
train_dataset, val_dataset, test_dataset = (
    HuggingFaceNPZWithLabelDataset(split_index, transform=transform)
    for split_index in (train_index, val_index, test_index)
)

# Only the training loader shuffles; evaluation loaders keep a fixed order.
train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, batch_size=8, shuffle=False)
    for eval_dataset in (val_dataset, test_dataset)
)


## Simple CNN model definition and training loop

In [None]:
class SimpleCNN(nn.Module):
    """Small two-stage convolutional classifier for 3-channel images.

    Two ``Conv2d -> ReLU -> MaxPool2d(2)`` stages are followed by an adaptive
    average pool to a fixed 56x56 grid, so the fully connected head always
    receives ``32 * 56 * 56`` features regardless of the input resolution.
    For 224x224 inputs the adaptive pool is a no-op.

    Args:
        num_classes: Number of output logits.
    """

    def __init__(self, num_classes):
        super().__init__()
        self.conv = nn.Sequential(
            nn.Conv2d(3, 16, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            nn.Conv2d(16, 32, 3, padding=1),
            nn.ReLU(),
            nn.MaxPool2d(2),
            # Pin the spatial size expected by the first Linear layer; without
            # this, 256x256 inputs yield 64x64 maps and a shape mismatch.
            # The layer has no parameters, so state-dict keys are unchanged.
            nn.AdaptiveAvgPool2d((56, 56)),
        )
        self.fc = nn.Sequential(
            nn.Flatten(),
            nn.Linear(32 * 56 * 56, 128),
            nn.ReLU(),
            nn.Linear(128, num_classes)
        )

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)``."""
        return self.fc(self.conv(x))

# One output logit per distinct value in the CSV's `label` column.
num_classes = len(pd.read_csv(label_csv_path)['label'].unique())

# Use the GPU when PyTorch reports one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Build the network and move its parameters to the chosen device.
model = SimpleCNN(num_classes)
model = model.to(device)

# Adam on all model parameters, with cross-entropy as the training loss.
optimizer = optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.CrossEntropyLoss()


## Run training loop (train + validation)

In [None]:
epochs = 3

for epoch in range(epochs):
    # ---- Training pass: update weights and track loss / accuracy ----
    model.train()
    running_loss = 0.0  # sum of per-batch losses over the epoch
    correct = 0
    total = 0
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        # Predicted class = index of the largest logit.
        predicted = outputs.argmax(dim=1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
    # An empty loader would make this a ZeroDivisionError; report NaN instead.
    train_acc = 100 * correct / total if total else float("nan")

    # ---- Validation pass: no gradient tracking, no weight updates ----
    model.eval()
    correct = 0
    total = 0
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            predicted = outputs.argmax(dim=1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
    # Small datasets can leave the validation split empty.
    val_acc = 100 * correct / total if total else float("nan")

    print(f"Epoch {epoch+1}, Loss: {running_loss:.4f}, Train Acc: {train_acc:.2f}%, Val Acc: {val_acc:.2f}%")
