In [1]:
# Google Colab only: uncomment the two lines below to authenticate this
# session with a Google account before running the rest of the notebook.
# NOTE(review): presumably required for the gs:// reads in the GCS test cell
# further down — confirm.
# from google.colab import auth
# auth.authenticate_user()

In [2]:
# Import modular components
import logging
import pandas as pd
import numpy as np

from config import (GCS_BUCKET_NAME, GCS_METADATA_PATH, GCS_IMAGE_PREFIX, LOCAL_METADATA_PATH, LOCAL_DATA_DIR)
from utils import load_metadata, logger, stratified_split
from utils import get_basic_transform, get_train_transform, get_test_valid_transform
from dataloader import create_dataloaders, ImageDataset

# Set logging level
# INFO lets the shared project logger emit its progress messages (the
# timestamped "utils - INFO" lines visible in the cell outputs).
logger.setLevel(logging.INFO)
print("✓ All modules imported successfully!")

✓ All modules imported successfully!


In [3]:
# Local run: read the metadata file from disk, keeping only labels that have
# at least 10 images.
metadata = load_metadata(LOCAL_METADATA_PATH, min_samples=10)
local_img_path = '../../data/imgs'

# create_dataloaders hands back four objects: the train / val / test loaders
# and an `info` dict holding the summary values reported below.
train_loader, val_loader, test_loader, info = create_dataloaders(
    metadata_df=metadata,
    img_prefix=local_img_path,
    img_col='filename',
    label_col='label',
    test_size=0.2,
    compute_stats=True,
    weighted_sampling=True,
)

# Emit the whole summary banner with a single newline-separated print.
print(
    f"\n{'='*50}",
    "DataLoader Created!",
    f"{'='*50}",
    f"Classes: {info['num_classes']}",
    f"Training samples: {info['train_size']:,}",
    f"Test samples: {info['test_size']:,}",
    f"Train batches: {len(train_loader)}",
    f"Test batches: {len(test_loader)}",
    f"Mean: {[f'{m:.4f}' for m in info['mean']]}",
    f"Std: {[f'{s:.4f}' for s in info['std']]}",
    f"{'='*50}",
    sep="\n",
)

2025-10-14 11:10:26,857 - utils - INFO - Loading metadata from: ../../data/metadata_all_harmonized.csv
2025-10-14 11:10:26,896 - utils - INFO - Total images loaded: 16,387
2025-10-14 11:10:26,897 - utils - INFO - Labels with >= 10 images: 104
2025-10-14 11:10:26,899 - utils - INFO - Images after filtering: 16,295
2025-10-14 11:10:26,900 - utils - INFO - Creating DataLoaders
2025-10-14 11:10:26,911 - utils - INFO - Train samples: 13,036, Test samples: 3,259
2025-10-14 11:10:26,913 - utils - INFO - Total classes: 104
2025-10-14 11:10:26,914 - utils - INFO - Computing dataset statistics from all training data
2025-10-14 11:10:26,914 - utils - INFO - Using local storage: ../../data/imgs
2025-10-14 11:10:26,915 - utils - INFO - Computing dataset statistics...
Computing stats: 100%|██████████| 408/408 [01:06<00:00,  6.10it/s]
2025-10-14 11:11:33,822 - utils - INFO - Mean: [0.6552238464355469, 0.49825090169906616, 0.4248701333999634]
2025-10-14 11:11:33,823 - utils - INFO - Std: [0.1202227100


DataLoader Created!
Classes: 104
Training samples: 13,036
Test samples: 3,259
Train batches: 407
Test batches: 102
Mean: ['0.6552', '0.4983', '0.4249']
Std: ['0.1202', '0.1225', '0.1229']


In [4]:
# Smoke test: draw one batch from the training loader and inspect its
# tensor shapes and the set of class indices it contains.
images, labels = next(iter(train_loader))

print(
    "Batch loaded successfully!",
    f"Images shape: {images.shape}",
    f"Labels shape: {labels.shape}",
    f"Unique classes in batch: {labels.unique().tolist()}",
    sep="\n",
)

Batch loaded successfully!
Images shape: torch.Size([32, 3, 224, 224])
Labels shape: torch.Size([32])
Unique classes in batch: [0, 11, 16, 18, 19, 24, 28, 29, 30, 31, 37, 39, 43, 58, 60, 67, 69, 74, 76, 80, 82, 84, 89, 99]


In [18]:
# === GCS TEST (small stratified fraction) ===
# Repeats the dataloader construction with metadata and images read from
# Google Cloud Storage, using a per-label subsample of the metadata.

# Load metadata from GCS
gcs_path = f"gs://{GCS_BUCKET_NAME}/{GCS_METADATA_PATH}"
metadata_gcs = load_metadata(gcs_path, min_samples=10)

# Draw the same fraction from every label so the class mix is preserved.
# random_state pins the draw so re-running the cell selects the same rows.
sample_frac = 0.25  # 25% of each label's images
metadata_gcs_small = (
    metadata_gcs
    .groupby('label', group_keys=False)
    .sample(frac=sample_frac, random_state=42)
    .reset_index(drop=True)
)

# A small fraction can leave a rare label with zero sampled rows; fail early
# with a clear message rather than silently continuing with fewer classes.
assert metadata_gcs_small['label'].nunique() == metadata_gcs['label'].nunique(), (
    "sample_frac is too small: at least one label lost all of its images"
)

# Create dataloaders using GCS images
train_loader_gcs, val_loader_gcs, test_loader_gcs, info_gcs = create_dataloaders(
    metadata_df=metadata_gcs_small,
    img_prefix=GCS_IMAGE_PREFIX,  # gs:// path
    img_col='filename',
    label_col='label',
    test_size=0.2,
    compute_stats=True,
    weighted_sampling=True
)

# Summary banner for the sampled GCS run, built from the returned info dict.
print(f"\n{'='*50}")
print("GCS DataLoader (sampled) Created!")
print(f"{'='*50}")
print(f"Classes: {info_gcs['num_classes']}")
print(f"Training samples: {info_gcs['train_size']:,}")
print(f"Test samples: {info_gcs['test_size']:,}")
print(f"Train batches: {len(train_loader_gcs)}")
print(f"Test batches: {len(test_loader_gcs)}")
print(f"Mean: {[f'{m:.4f}' for m in info_gcs['mean']]}")
print(f"Std: {[f'{s:.4f}' for s in info_gcs['std']]}")
print(f"{'='*50}")


2025-10-14 11:14:35,302 - utils - INFO - Loading metadata from: gs://derma-datasets-2/final/metadata_all_harmonized.csv
2025-10-14 11:14:35,740 - utils - INFO - Total images loaded: 16,387
2025-10-14 11:14:35,741 - utils - INFO - Labels with >= 10 images: 104
2025-10-14 11:14:35,744 - utils - INFO - Images after filtering: 16,295
2025-10-14 11:14:35,750 - utils - INFO - Creating DataLoaders
2025-10-14 11:14:35,754 - utils - INFO - Train samples: 3,256, Test samples: 815
2025-10-14 11:14:35,756 - utils - INFO - Total classes: 104
2025-10-14 11:14:35,756 - utils - INFO - Computing dataset statistics from all training data
2025-10-14 11:14:35,756 - utils - INFO - Using GCS storage
2025-10-14 11:14:35,757 - utils - INFO - Computing dataset statistics...
Computing stats: 100%|██████████| 102/102 [05:15<00:00,  3.09s/it]
2025-10-14 11:19:50,952 - utils - INFO - Mean: [0.6562517285346985, 0.49849268794059753, 0.42604541778564453]
2025-10-14 11:19:50,953 - utils - INFO - Std: [0.11991573125123


GCS DataLoader (sampled) Created!
Classes: 104
Training samples: 3,256
Test samples: 815
Train batches: 101
Test batches: 26
Mean: ['0.6563', '0.4985', '0.4260']
Std: ['0.1199', '0.1215', '0.1220']
