<a href="https://colab.research.google.com/github/marcvonrohr/DeepLearning/blob/main/meta_learning_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import os
import time
import json
import random
from google.colab import drive

#################################################################
#  STEP 2.1: PREPARE LOCAL VM
#################################################################

# --- 1. Mount Google Drive ---
# The mount point is named once so the mount call reads on its own;
# progress is announced before and after the call.
DRIVE_MOUNT_POINT = '/content/drive'

print("Connecting Google Drive...")
drive.mount(DRIVE_MOUNT_POINT)
print("...Google Drive connected.")

# --- 2. Define Key Paths ---
# Everything on Google Drive hangs off PROJECT_DIR; everything on the VM's
# local disk hangs off LOCAL_DATA_ROOT.
GDRIVE_ROOT = '/content/drive/MyDrive/'
PROJECT_DIR = os.path.join(GDRIVE_ROOT, 'Deep Learning')
DATASETS_ROOT_DIR = os.path.join(PROJECT_DIR, 'datasets')
INAT_ROOT_DIR = os.path.join(DATASETS_ROOT_DIR, 'inaturalist')

# Source: The COMPRESSED archives
ARCHIVES_DIR_ON_DRIVE = os.path.join(INAT_ROOT_DIR, 'archives')

# Target: The LOCAL VM fast disk
LOCAL_DATA_ROOT = '/content/data'
# This is the final path your PyTorch code will use:
FINAL_DATA_PATH = os.path.join(LOCAL_DATA_ROOT, 'inaturalist_unpacked')

# Define source/destination paths
# Per archive:
#   "src"            - compressed archive on Drive
#   "dest_tar"       - where the archive is copied to on the VM
#   "check_unpacked" - folder whose existence means "already unpacked"
#
# NOTE: the archives do NOT extract into folders named after the archive.
# The directory listing after extraction shows 'train_mini' and 'val', so the
# "check_unpacked" paths must use those names; otherwise the skip check never
# matches and every re-run copies and unpacks the archives again.
TAR_FILES = {
    "2021_train_mini": {
        "src": os.path.join(ARCHIVES_DIR_ON_DRIVE, '2021_train_mini.tar.gz'),
        "dest_tar": os.path.join(LOCAL_DATA_ROOT, '2021_train_mini.tar.gz'),
        "check_unpacked": os.path.join(FINAL_DATA_PATH, 'train_mini')
    },
    "2021_valid": {
        "src": os.path.join(ARCHIVES_DIR_ON_DRIVE, '2021_valid.tar.gz'),
        "dest_tar": os.path.join(LOCAL_DATA_ROOT, '2021_valid.tar.gz'),
        "check_unpacked": os.path.join(FINAL_DATA_PATH, 'val')
    }
}

# --- 3. Create Local Directories on VM ---
# exist_ok=True keeps this step harmless when the cell is run again.
for local_dir in (LOCAL_DATA_ROOT, FINAL_DATA_PATH):
    os.makedirs(local_dir, exist_ok=True)
print(f"Local data directory created at: {FINAL_DATA_PATH}")

# --- 4. Copy, Unpack, and Clean up for each file ---
for name, paths in TAR_FILES.items():
    print(f"\n--- Processing {name} ---")

    if os.path.exists(paths["check_unpacked"]):
        print(f"'{name}' is already unpacked in local VM. Skipping.")
        continue

    # 4a. Copy .tar.gz from Drive to local VM
    print(f"Copying '{name}.tar.gz' from Drive to local VM...")
    start_time = time.time()
    !cp "{paths['src']}" "{paths['dest_tar']}"
    print(f"...Copy complete. Took {time.time() - start_time:.2f} seconds.")

    # 4b. Unpack the file on the local VM
    print(f"Unpacking '{name}.tar.gz' locally...")
    start_time = time.time()
    !tar -xzf "{paths['dest_tar']}" -C "{FINAL_DATA_PATH}"
    print(f"...Unpacking complete. Took {time.time() - start_time:.2f} seconds.")

    # 4c. Delete the local .tar.gz file to save VM space
    print(f"Deleting local tarball '{paths['dest_tar']}'...")
    !rm "{paths['dest_tar']}"
    print("...Local tarball deleted.")

# --- 5. Verify and Set Path for Training ---
print("\n--- Final Data Setup Verification ---")
print(f"Dataset is ready for training at: {FINAL_DATA_PATH}")
!ls -lh "{FINAL_DATA_PATH}"
print("\nLocal VM Disk Space Usage:")
!df -h

Connecting Google Drive...
Mounted at /content/drive
...Google Drive connected.
Local data directory created at: /content/data/inaturalist_unpacked

--- Processing 2021_train_mini ---
Copying '2021_train_mini.tar.gz' from Drive to local VM...
...Copy complete. Took 886.47 seconds.
Unpacking '2021_train_mini.tar.gz' locally...
...Unpacking complete. Took 715.08 seconds.
Deleting local tarball '/content/data/2021_train_mini.tar.gz'...
...Local tarball deleted.

--- Processing 2021_valid ---
Copying '2021_valid.tar.gz' from Drive to local VM...
...Copy complete. Took 158.36 seconds.
Unpacking '2021_valid.tar.gz' locally...
...Unpacking complete. Took 143.66 seconds.
Deleting local tarball '/content/data/2021_valid.tar.gz'...
...Local tarball deleted.

--- Final Data Setup Verification ---
Dataset is ready for training at: /content/data/inaturalist_unpacked
total 2.5M
drwxrwxr-x 10002 1000 1000 1.3M Oct 13  2020 train_mini
drwxrwxr-x 10002 1000 1000 1.3M Oct 13  2020 val

Local VM Disk Spa

In [2]:
#################################################################
#  STEP 2.2: SCIENTIFIC DATA PARTITIONING
#################################################################
print("\n--- STEP 2.2: Loading/Creating Scientific Class Partition ---")

# --- 6. Define Paths for Partition File ---
# Helper files live in a 'project_meta' folder inside the project directory
# on Google Drive; the class split is stored there as JSON.
META_DIR_ON_DRIVE = os.path.join(PROJECT_DIR, 'project_meta')
PARTITION_FILE_PATH = os.path.join(META_DIR_ON_DRIVE, 'inat_class_split.json')

# Create the folder up front so the partition file can be written later.
os.makedirs(META_DIR_ON_DRIVE, exist_ok=True)
print(f"Looking for partition file at: {PARTITION_FILE_PATH}")


--- STEP 2.2: Loading/Creating Scientific Class Partition ---
Looking for partition file at: /content/drive/MyDrive/Deep Learning/project_meta/inat_class_split.json


In [4]:
# --- 7. Logic to Find Classes and Create Partition ---

# 7a. Identify the Dataset Root
# Extraction may have produced a subfolder ('2021_train_mini' or 'train_mini'),
# or placed the content directly in FINAL_DATA_PATH. Candidates are tried in
# this order and the first one that exists and holds at least one
# subdirectory is used.
possible_roots = [
    os.path.join(FINAL_DATA_PATH, sub_name)
    for sub_name in ('2021_train_mini', 'train_mini')
] + [FINAL_DATA_PATH]

# DATASET_ROOT stays None when no candidate qualifies.
DATASET_ROOT = next(
    (
        candidate
        for candidate in possible_roots
        if os.path.exists(candidate)
        and any(
            os.path.isdir(os.path.join(candidate, entry))
            for entry in os.listdir(candidate)
        )
    ),
    None,
)

print(f"Dataset root identified as: {DATASET_ROOT}")

# 7b. Load or Create the Partition
# If a partition file already exists on Drive it is loaded as-is. Otherwise
# the dataset folders are scanned, every folder containing images becomes one
# class, the class IDs are shuffled with a fixed seed and split into
# base / val / novel, and the result is saved to Drive.
# NOTE(review): a loaded partition is not checked against the dataset that is
# currently on disk - confirm the file matches this dataset.
partition_data = {}
RANDOM_SEED = 42

# Split sizes: base (Train/Meta-Train) and val (Hyperparams); every remaining
# class goes to novel (Test).
NUM_BASE_CLASSES = 6000
NUM_VAL_CLASSES = 2000
# Fewer classes than this only triggers a warning about the extraction.
MIN_EXPECTED_CLASSES = 9900
# File extensions (lower-case) that mark a folder as a class folder.
IMAGE_EXTENSIONS = ('.jpg', '.jpeg', '.png')

if os.path.exists(PARTITION_FILE_PATH):
    print("Found existing partition file. Loading...")
    with open(PARTITION_FILE_PATH, 'r') as f:
        partition_data = json.load(f)
else:
    print("No partition file found. Scanning directories to create new partition...")
    print("This ensures independence from missing metadata files.")

    # Without a dataset root there is nothing to scan; os.walk(None) would
    # only fail with an unhelpful TypeError.
    if DATASET_ROOT is None:
        raise FileNotFoundError(
            "DATASET_ROOT is None: no unpacked dataset folder was found, "
            "so class folders cannot be scanned."
        )

    # --- Scan for Class Folders ---
    class_folders_rel = []

    # Walk through the directory tree
    # A "class" is any folder that contains image files (.jpg, .jpeg, .png)
    print("Scanning folders (this may take 1-2 minutes)...")
    for root, dirs, files in os.walk(DATASET_ROOT):
        # Check for images in this specific folder
        if any(f.lower().endswith(IMAGE_EXTENSIONS) for f in files):
            # Get path relative to the dataset root (e.g., "Aves/Turdus_migratorius")
            class_folders_rel.append(os.path.relpath(root, DATASET_ROOT))

    # --- CRITICAL: Sort for Reproducibility ---
    # Sorting ensures that Index 0 is ALWAYS the same class on every machine/run
    class_folders_rel.sort()

    num_classes = len(class_folders_rel)
    print(f"Found {num_classes} classes containing images.")

    if num_classes < MIN_EXPECTED_CLASSES:
        print("WARNING: Found significantly fewer than 10,000 classes. Check extraction.")

    # The slices below use fixed boundaries, so too few classes would silently
    # produce an empty novel (and possibly val) set. Stop before saving that.
    if num_classes <= NUM_BASE_CLASSES + NUM_VAL_CLASSES:
        raise ValueError(
            f"Found only {num_classes} classes; more than "
            f"{NUM_BASE_CLASSES + NUM_VAL_CLASSES} are needed for a non-empty "
            "base/val/novel split."
        )

    # --- Assign IDs and Shuffle ---
    # A class ID is the index of the class folder in the sorted list.
    all_class_ids = list(range(num_classes))

    print(f"Shuffling {num_classes} class IDs with random seed {RANDOM_SEED}...")
    random.seed(RANDOM_SEED)
    random.shuffle(all_class_ids)

    # --- Split into Sets ---
    # 6000 Base (Train/Meta-Train), 2000 Val (Hyperparams), rest Novel (Test)
    val_end = NUM_BASE_CLASSES + NUM_VAL_CLASSES
    c_base_ids = all_class_ids[:NUM_BASE_CLASSES]
    c_val_ids = all_class_ids[NUM_BASE_CLASSES:val_end]
    c_novel_ids = all_class_ids[val_end:]

    # --- Construct Data Structure ---
    # We save both the sets AND the mapping from ID -> Folder Path
    # (IDs are stored as strings in "id_to_path" because JSON keys are strings.)
    partition_data = {
        "sets": {
            'c_base': sorted(c_base_ids),
            'c_val': sorted(c_val_ids),
            'c_novel': sorted(c_novel_ids)
        },
        "id_to_path": {
            str(i): folder_path for i, folder_path in enumerate(class_folders_rel)
        }
    }

    # --- Save to Drive ---
    print(f"Saving new partition and mapping to: {PARTITION_FILE_PATH}")
    with open(PARTITION_FILE_PATH, 'w') as f:
        json.dump(partition_data, f, indent=4)

Dataset root identified as: /content/data/inaturalist_unpacked/train_mini
No partition file found. Scanning directories to create new partition...
This ensures independence from missing metadata files.
Scanning folders (this may take 1-2 minutes)...
Found 10000 classes containing images.
Shuffling 10000 class IDs with random seed 42...
Saving new partition and mapping to: /content/drive/MyDrive/Deep Learning/project_meta/inat_class_split.json


In [5]:
# --- 8. Verification ---
# Report the size of each class set and confirm that no class ID appears in
# more than one set.
print("\n--- Partitioning Complete ---")
sets = partition_data['sets']
for size_label, set_key in (
    ("Total C_base classes:  ", 'c_base'),
    ("Total C_val classes:   ", 'c_val'),
    ("Total C_novel classes: ", 'c_novel'),
):
    print(f"{size_label}{len(sets[set_key])}")

# Check for overlaps (should be 0)
base_set = set(sets['c_base'])
val_set = set(sets['c_val'])
novel_set = set(sets['c_novel'])

overlap_bv = base_set.intersection(val_set)
overlap_bn = base_set.intersection(novel_set)
overlap_vn = val_set.intersection(novel_set)

for overlap_label, shared_ids in (
    ("Overlap (Base-Val):    ", overlap_bv),
    ("Overlap (Base-Novel):  ", overlap_bn),
    ("Overlap (Val-Novel):   ", overlap_vn),
):
    print(f"{overlap_label}{len(shared_ids)}")

# The partition is clean only when all three pairwise intersections are empty.
if not (overlap_bv or overlap_bn or overlap_vn):
    print("\nSUCCESS: Classes are cleanly partitioned.")
else:
    print("\nCRITICAL ERROR: Overlaps detected in class sets!")


--- Partitioning Complete ---
Total C_base classes:  6000
Total C_val classes:   2000
Total C_novel classes: 2000
Overlap (Base-Val):    0
Overlap (Base-Novel):  0
Overlap (Val-Novel):   0

SUCCESS: Classes are cleanly partitioned.
