In [1]:
# Import modular components
import logging
import pandas as pd
import numpy as np

# Make the parent of the current working directory importable so the
# project-local modules below (constants, utils, dataloader) resolve.
import sys
from pathlib import Path

project_root = str(Path.cwd().parent)
# Only insert when it is not already first on sys.path: re-running this cell
# would otherwise push a duplicate entry each time, while the project root
# still keeps priority over any same-named installed modules.
if sys.path[:1] != [project_root]:
    sys.path.insert(0, project_root)

from constants import (GCS_BUCKET_NAME, GCS_METADATA_PATH, GCS_IMAGE_PREFIX, 
                       LOCAL_METADATA_PATH, LOCAL_DATA_DIR)
from utils import load_metadata, logger, stratified_split
from dataloader import (create_dataloaders, ImageDataset, 
                       get_basic_transform, get_train_transform, get_test_valid_transform)

# Set logging level so INFO messages from the shared project logger are shown
logger.setLevel(logging.INFO)
print("✓ All modules imported successfully!")

✓ All modules imported successfully!


In [None]:
# === GCS TEST (small stratified fraction) ===
# Smoke-test the GCS-backed dataloaders on a per-class subsample of the metadata.

# Config dictionaries for the GCS dataloader. They are defined before the
# subsampling step so the subsample can reuse training_config['seed'] instead
# of repeating the seed as a second literal that could drift out of sync.
# NOTE(review): 'img_prefix' is '' here while img_prefix=GCS_IMAGE_PREFIX is
# passed separately to create_dataloaders below — confirm which one is used.
data_config = {'use_local': False, 'img_prefix': ''}
training_config = {'batch_size': 32, 'num_workers': 8, 'seed': 42, 'prefetch_factor': 2}
# val_size is None and only train/test sizes are reported below;
# presumably no validation split is created — TODO confirm val_loader_gcs is None.
splits_config = {'test_size': 0.2, 'val_size': None}
image_config = {'size': [224, 224]}
data_processing_config = {'compute_stats': True, 'weighted_sampling': True, 'skip_errors': True}

augmentation_config = {
    'brightness_jitter': 0.1,
    'contrast_jitter': 0.1,
    'saturation_jitter': 0.1,
    'hue_jitter': 0.05,
    'rotation_degrees': 20,
    'translate': [0.1, 0.1],
    'scale': [0.9, 1.1],
    'grayscale_prob': 0.1,
    'horizontal_flip_prob': 0.5,
    'vertical_flip_prob': 0.5
}

# Load metadata from GCS
gcs_path = str(GCS_METADATA_PATH)
metadata_gcs = load_metadata(gcs_path, min_samples=10)

# Draw the same fraction from every label so class proportions are preserved.
sample_frac = 0.25  # 25% of data
metadata_gcs_small = (metadata_gcs.groupby('label', group_keys=False)
                      .sample(frac=sample_frac, random_state=training_config['seed'])
                      .reset_index(drop=True))

# The per-group sample size is rounded, so a small sample_frac can leave a
# class with zero rows. Fail loudly here rather than silently testing on
# fewer classes than the full metadata contains.
assert metadata_gcs_small['label'].nunique() == metadata_gcs['label'].nunique(), (
    f"sample_frac={sample_frac} dropped entire classes from the subsample; "
    "increase sample_frac or min_samples"
)

# Create dataloaders using GCS images
train_loader_gcs, val_loader_gcs, test_loader_gcs, info_gcs = create_dataloaders(
    metadata_df=metadata_gcs_small,
    img_prefix=GCS_IMAGE_PREFIX,  # gs:// path
    data_config=data_config,
    training_config=training_config,
    splits_config=splits_config,
    image_config=image_config,
    data_processing_config=data_processing_config,
    augmentation_config=augmentation_config
)

# Summary of what was built: class count, split sizes, batch counts and the
# per-channel normalisation statistics returned in info_gcs.
print(f"\n{'='*50}")
print("GCS DataLoader (sampled) Created!")
print(f"{'='*50}")
print(f"Classes: {info_gcs['num_classes']}")
print(f"Training samples: {info_gcs['train_size']:,}")
print(f"Test samples: {info_gcs['test_size']:,}")
print(f"Train batches: {len(train_loader_gcs)}")
print(f"Test batches: {len(test_loader_gcs)}")
print(f"Mean: {[f'{m:.4f}' for m in info_gcs['mean']]}")
print(f"Std: {[f'{s:.4f}' for s in info_gcs['std']]}")
print(f"{'='*50}")

2025-10-14 21:10:01,402 - utils - INFO - Loading metadata from: gs://derma-datasets-2/final/metadata_all_harmonized.csv
2025-10-14 21:10:01,832 - utils - INFO - Total images loaded: 16,387
2025-10-14 21:10:01,833 - utils - INFO - Labels with >= 10 images: 104
2025-10-14 21:10:01,836 - utils - INFO - Images after filtering: 16,295
2025-10-14 21:10:01,841 - utils - INFO - Creating DataLoaders
2025-10-14 21:10:01,845 - utils - INFO - Train samples: 3,256, Test samples: 815
2025-10-14 21:10:01,847 - utils - INFO - Total classes: 104
2025-10-14 21:10:01,847 - utils - INFO - Computing dataset statistics from all training data
2025-10-14 21:10:01,847 - utils - INFO - Using GCS storage
2025-10-14 21:10:01,848 - utils - INFO - Computing dataset statistics...
Computing stats:   0%|          | 0/102 [00:00<?, ?it/s]