# MIL model: each input is one entire WSI (whole-slide image), handled as a single bag

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
import numpy as np
import requests
from io import BytesIO
import pandas as pd
import torchvision
import matplotlib.pyplot as plt
from PIL import Image
from torchvision import transforms
import torch.nn as nn
import torch.optim as optim
import random
from collections import defaultdict, Counter


In [None]:
def make_mil_data_index(repo_id, label_csv_path):
    """Build the list of ``(url, label)`` entries for every row of the label CSV.

    Args:
        repo_id: Hugging Face dataset repository id; it is interpolated into
            the ``https://huggingface.co/datasets/<repo_id>/resolve/main/`` URL.
        label_csv_path: Path of a CSV file that has ``pub_subspec_id`` and
            ``label`` columns.

    Returns:
        A list of ``(url, label)`` tuples. Ids that do not already end in
        ``.npz`` get that extension appended before the URL is built.
    """
    label_table = pd.read_csv(label_csv_path)
    # Going through a dict collapses repeated ids: the entry keeps the position
    # of its first occurrence and the label of its last occurrence.
    label_by_name = dict(zip(label_table['pub_subspec_id'], label_table['label']))
    base_url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/"

    def _as_url(name):
        # Only add the extension when the id does not carry it yet.
        suffix = "" if name.endswith(".npz") else ".npz"
        return f"{base_url}{name}{suffix}"

    return [(_as_url(name), lbl) for name, lbl in label_by_name.items()]


def stratified_split(data_index, train_ratio=0.7, val_ratio=0.15, seed=42):
    """Split a data index into train/val/test lists, stratified by label.

    Items are grouped by label, each group is shuffled, and each group is cut
    into train / val / test slices so that every label is spread over the
    three subsets in (approximately) the requested proportions.

    Args:
        data_index: Iterable of tuples whose LAST element is the class label,
            e.g. ``(url, label)``.
        train_ratio: Fraction of each label group assigned to the train set.
        val_ratio: Fraction of each label group assigned to the validation
            set. Whatever is left over goes to the test set.
        seed: Seed passed to ``random.seed`` before shuffling. Note that this
            reseeds the global ``random`` module state.

    Returns:
        A ``(train, val, test)`` tuple of lists holding the original items.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more
            than 1.
    """
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            "train_ratio and val_ratio must be non-negative and sum to at most 1"
        )

    label_to_items = defaultdict(list)
    for item in data_index:
        # The label is the last field of each entry. Indexing from the end
        # works for (url, label) pairs as well as for longer tuples.
        label_to_items[item[-1]].append(item)

    train, val, test = [], [], []
    random.seed(seed)

    for items in label_to_items.values():
        random.shuffle(items)
        n_total = len(items)
        # int() truncates, so rounding remainders end up in the test slice.
        n_train = int(n_total * train_ratio)
        n_val = int(n_total * val_ratio)
        train.extend(items[:n_train])
        val.extend(items[n_train:n_train + n_val])
        test.extend(items[n_train + n_val:])
    return train, val, test



In [None]:
class MILDataset(Dataset):
    """Dataset that yields one bag (all patches of one ``.npz`` file) per item.

    Each entry of ``data_index`` is a ``(url, label)`` pair. On access the
    ``.npz`` archive is downloaded from ``url``, every array stored in it is
    turned into an image patch, and the patches are stacked into one tensor.

    Args:
        data_index: Sequence of ``(url, label)`` pairs.
        transform: Optional callable applied to each PIL patch; it is expected
            to return a tensor. When omitted, patches are converted with
            ``transforms.ToTensor()`` so they can still be stacked.
        timeout: Timeout in seconds for each download request.
    """

    def __init__(self, data_index, transform=None, timeout=30):
        self.data_index = data_index
        self.transform = transform
        self.timeout = timeout

    def __len__(self):
        """Return the number of bags."""
        return len(self.data_index)

    def _to_patch(self, array):
        """Convert one stored array into a (transformed) patch tensor."""
        pixels = array.astype(np.uint8)
        if pixels.ndim == 2:
            image = Image.fromarray(pixels, mode='L')
        elif pixels.shape[-1] == 3:
            image = Image.fromarray(pixels, mode='RGB')
        else:
            image = Image.fromarray(pixels)
        if self.transform:
            return self.transform(image)
        # torch.stack cannot handle PIL images, so fall back to a plain
        # tensor conversion when no transform was supplied.
        return transforms.ToTensor()(image)

    def __getitem__(self, idx):
        """Download bag ``idx`` and return ``(patch_tensor, label)``.

        Returns:
            ``patch_tensor`` is the stack of all patches in the archive (one
            leading dimension per patch) and ``label`` is an ``int``.

        Raises:
            requests.HTTPError: If the server answers with an error status.
            ValueError: If the archive contains no arrays.
        """
        url, label = self.data_index[idx]
        # A timeout keeps a stalled connection from hanging the data loader.
        response = requests.get(url, timeout=self.timeout)
        # Fail on HTTP errors instead of feeding an error page to np.load.
        response.raise_for_status()
        # The context manager closes the archive once all arrays are read.
        with np.load(BytesIO(response.content)) as npz:
            patches = [self._to_patch(npz[key]) for key in npz.files]
        if not patches:
            raise ValueError(f"No patches found in bag: {url}")
        patch_tensor = torch.stack(patches)  # Shape: (N, C, H, W)
        return patch_tensor, int(label)


In [None]:
# Dataset repository the .npz URLs are built from, and the local CSV that
# holds the labels.
repo_id = "nayoungku1/npz-histopathology-dataset"
label_csv_path = "./metadata/label.csv"

# Every patch is resized to 256x256 and then converted to a tensor.
transform = transforms.Compose(
    [transforms.Resize((256, 256)), transforms.ToTensor()]
)

# Build the full index once, then cut it into the three subsets.
all_index = make_mil_data_index(repo_id, label_csv_path)
train_idx, val_idx, test_idx = stratified_split(all_index)

# One dataset per subset, all sharing the same patch transform.
train_dataset, val_dataset, test_dataset = (
    MILDataset(subset, transform=transform)
    for subset in (train_idx, val_idx, test_idx)
)

# Only the training loader shuffles; validation and test keep a fixed order.
# NOTE(review): with batch_size=8 the default collation presumably needs every
# bag in a batch to contain the same number of patches -- confirm against the
# data.
train_loader, val_loader, test_loader = (
    DataLoader(dataset, batch_size=8, shuffle=do_shuffle)
    for dataset, do_shuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)