In [None]:

import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms
from PIL import Image
import os
import numpy as np
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from collections import defaultdict
from tqdm import tqdm
import requests
from io import BytesIO


In [None]:

# Location of the label CSV and the Hugging Face dataset repo id.
label_csv_path = "./label.csv"  # adjust to where the CSV actually lives
repo_id = "your-hf-repo-id"     # adjust to the actual Hugging Face repo id

df = pd.read_csv(label_csv_path)

# Make sure every file id carries the '.npz' extension (ids that already have it are kept as-is).
df['pub_subspec_id'] = [
    file_id if file_id.endswith('.npz') else f"{file_id}.npz"
    for file_id in df['pub_subspec_id']
]

# Stratified sampling: keep 46 rows in total, preserving the label proportions.
# Only the "test" side of the split is used; the remainder is discarded.
_, selected_df = train_test_split(
    df, test_size=46, stratify=df['label'], random_state=42
)


In [None]:

def make_stratified_data_index(repo_id, filtered_df, timeout=30):
    """Build a flat, patch-level index of the `.npz` files listed in ``filtered_df``.

    Every file is downloaded from the Hugging Face dataset repo and each array
    stored in it is inspected. A 4-D array contributes one entry per element
    along its first axis; any other array contributes a single entry whose
    patch index is ``None``.

    Files that cannot be downloaded or parsed are reported on stdout and
    skipped, so one bad file does not abort the whole indexing pass.

    Args:
        repo_id: Hugging Face dataset repository id used to build the URLs.
        filtered_df: DataFrame with a ``pub_subspec_id`` column (file names)
            and a ``label`` column. If a file name occurs more than once, the
            label of its last occurrence wins.
        timeout: Seconds to wait for the server before giving up on a file.

    Returns:
        A list of ``(url, key, patch_idx, label)`` tuples.
    """
    filename_to_label = dict(zip(filtered_df['pub_subspec_id'], filtered_df['label']))
    data_index = []

    for fname, label in filename_to_label.items():
        url = f"https://huggingface.co/datasets/{repo_id}/resolve/main/{fname}"
        try:
            # Without a timeout a stalled connection would block the notebook forever.
            response = requests.get(url, timeout=timeout)
            # Surface HTTP errors (404, 401, ...) as such instead of handing an
            # error page to np.load and getting a misleading parse error.
            response.raise_for_status()
            # The context manager closes the underlying zip archive when done.
            with np.load(BytesIO(response.content)) as npz:
                for key in npz.files:
                    patch_array = npz[key]
                    if patch_array.ndim == 4:
                        data_index.extend(
                            (url, key, i, label) for i in range(patch_array.shape[0])
                        )
                    else:
                        data_index.append((url, key, None, label))
        except Exception as e:
            # Deliberately best-effort: log the failure and continue with the next file.
            print(f"❌ Failed to load {fname}: {e}")
    return data_index

# Build the patch-level index for the sampled files; this downloads every listed .npz to inspect it.
data_index = make_stratified_data_index(repo_id, selected_df)


In [None]:

def stratified_split(data_index, train_ratio=0.7, val_ratio=0.15, seed=42):
    """Split index entries into train/val/test subsets, label by label.

    Entries are grouped by their label (element 3 of each tuple), each group
    is shuffled, and the first ``train_ratio`` share goes to train, the next
    ``val_ratio`` share to validation, and whatever is left to test. Group
    sizes are truncated with ``int()``, so very small groups may end up
    entirely in the test subset.

    NOTE(review): the split is done per index entry, so entries that come from
    the same file can land in different subsets. Confirm this is acceptable
    for the evaluation being run.

    Args:
        data_index: Iterable of tuples whose element 3 is the label.
        train_ratio: Fraction of each label group assigned to train.
        val_ratio: Fraction of each label group assigned to validation.
        seed: Seed for the shuffle; the same seed gives the same split.

    Returns:
        A ``(train, val, test)`` tuple of lists.

    Raises:
        ValueError: If a ratio is negative or the two ratios sum to more than 1.
    """
    # Small tolerance so float sums such as 0.7 + 0.3 are not rejected by rounding noise.
    if train_ratio < 0 or val_ratio < 0 or train_ratio + val_ratio > 1 + 1e-9:
        raise ValueError(
            f"train_ratio and val_ratio must be non-negative and sum to at most 1, "
            f"got train_ratio={train_ratio}, val_ratio={val_ratio}"
        )

    label_to_items = defaultdict(list)
    for item in data_index:
        label_to_items[item[3]].append(item)

    # A private generator yields the same permutations as seeding the
    # module-level one, but leaves the caller's global random state untouched.
    rng = random.Random(seed)

    train, val, test = [], [], []
    for items in label_to_items.values():
        rng.shuffle(items)
        n_total = len(items)
        n_train = int(n_total * train_ratio)
        n_val = int(n_total * val_ratio)
        train.extend(items[:n_train])
        val.extend(items[n_train:n_train + n_val])
        test.extend(items[n_train + n_val:])
    return train, val, test

# Split the patch index with the function's defaults: 70% train, 15% validation, remainder test, seed 42.
train_index, val_index, test_index = stratified_split(data_index)


In [None]:

class PatchDataset(Dataset):
    """Dataset serving image patches stored in remote `.npz` files.

    Each index entry is a ``(url, key, patch_idx, label)`` tuple: ``url`` points
    at an `.npz` file, ``key`` names an array inside it, and ``patch_idx``
    selects one element along the array's first axis (``None`` means the whole
    array is the patch).

    Downloaded arrays are kept in memory per ``(url, key)`` so a file is not
    fetched again for every patch it contains. The cache lives on the instance,
    so each DataLoader worker process fills its own copy.

    Args:
        data_index: Sequence of ``(url, key, patch_idx, label)`` tuples.
        transform: Optional callable applied to the PIL image.
        cache: Keep downloaded arrays in memory (default). Pass ``False`` to
            trade speed for a smaller memory footprint.
        timeout: Seconds to wait for the server on each download.
    """

    def __init__(self, data_index, transform=None, cache=True, timeout=30):
        self.data_index = data_index
        self.transform = transform
        self.cache = cache
        self.timeout = timeout
        # Maps (url, key) -> decoded numpy array.
        self._array_cache = {}

    def __len__(self):
        """Return the number of index entries."""
        return len(self.data_index)

    def _load_array(self, url, key):
        """Return array ``key`` from the `.npz` at ``url``, downloading it on a cache miss."""
        cache_key = (url, key)
        cached = self._array_cache.get(cache_key)
        if cached is not None:
            return cached

        response = requests.get(url, timeout=self.timeout)
        # Fail with the HTTP error itself rather than a confusing np.load parse error.
        response.raise_for_status()
        with np.load(BytesIO(response.content)) as npz:
            array = npz[key]

        if self.cache:
            self._array_cache[cache_key] = array
        return array

    def __getitem__(self, idx):
        """Return ``(patch, label)`` for entry ``idx``; ``label`` is converted to ``int``."""
        url, key, patch_idx, label = self.data_index[idx]
        patch_array = self._load_array(url, key)

        patch = patch_array[patch_idx] if patch_idx is not None else patch_array
        # NOTE(review): the cast assumes pixel values are already in the 0-255 range — confirm.
        patch = Image.fromarray(patch.astype(np.uint8))

        if self.transform:
            patch = self.transform(patch)

        return patch, int(label)


In [None]:

# Preprocessing shared by all splits: resize each patch to 128x128, then
# convert the PIL image to a tensor.
transform = transforms.Compose(
    [transforms.Resize(size=(128, 128)), transforms.ToTensor()]
)

# One dataset per split, both using the same preprocessing.
train_dataset = PatchDataset(data_index=train_index, transform=transform)
val_dataset = PatchDataset(data_index=val_index, transform=transform)

# Only the training loader shuffles; validation keeps the index order.
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=64, shuffle=False)


In [None]:

class CNNModel(nn.Module):
    """Small two-block CNN that outputs one sigmoid probability per image.

    The network expects 3-channel 128x128 inputs: each of the two 2x2
    max-pools halves the spatial size (128 -> 64 -> 32), which yields the
    64 * 32 * 32 features that ``fc1`` is sized for. Inputs of any other
    spatial size raise a shape error in ``fc1``.

    Because the output already passed through a sigmoid, pair the model with
    ``nn.BCELoss`` (not ``nn.BCEWithLogitsLoss``).
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(2, 2)
        self.conv2 = nn.Conv2d(32, 64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(64 * 32 * 32, 128)
        self.fc2 = nn.Linear(128, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Map a ``(batch, 3, 128, 128)`` tensor to ``(batch, 1)`` probabilities.

        A single unbatched ``(3, 128, 128)`` image is also accepted and is
        treated as a batch of one.
        """
        if x.dim() == 3:
            # Keep supporting a single unbatched image by adding the batch axis.
            x = x.unsqueeze(0)
        x = self.pool(torch.relu(self.conv1(x)))
        x = self.pool(torch.relu(self.conv2(x)))
        # Flatten per sample. A fixed-width view(-1, N) would silently fold
        # several samples into one row whenever the spatial size is wrong;
        # this way the mismatch surfaces as an error in fc1 instead.
        x = torch.flatten(x, 1)
        x = torch.relu(self.fc1(x))
        return self.sigmoid(self.fc2(x))


In [None]:

# Train on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

model = CNNModel().to(device)
# The model already applies a sigmoid, so plain binary cross-entropy is used.
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=1e-3)

epochs = 5
for epoch in range(epochs):
    model.train()
    # Sum of the per-batch losses over this epoch (not an average).
    total_loss = 0

    for images, labels in train_loader:
        images = images.to(device)
        # BCELoss needs float targets with the same (batch, 1) shape as the model output.
        labels = labels.to(device).float().unsqueeze(1)

        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    print(f"Epoch {epoch+1}/{epochs}, Loss: {total_loss:.4f}")
