<a href="https://colab.research.google.com/github/marcvonrohr/DeepLearning/blob/main/meta_learning_dataset_download.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import os
import time
from google.colab import drive

# --- 1. Mount Google Drive ---
# The Drive mount must exist before any of the Drive directories below
# can be created.
print("Connecting Google Drive...")
drive.mount('/content/drive')
print("...Google Drive connected.")

# --- 2. Define All Google Drive Paths ---
# Each level is derived from the one above it, so relocating the project
# only requires changing GDRIVE_ROOT / PROJECT_DIR.
GDRIVE_ROOT = '/content/drive/MyDrive/'
PROJECT_DIR = os.path.join(GDRIVE_ROOT, 'Deep Learning')
DATASETS_ROOT_DIR = os.path.join(PROJECT_DIR, 'datasets')
INAT_ROOT_DIR = os.path.join(DATASETS_ROOT_DIR, 'inaturalist')

ARCHIVES_DIR = os.path.join(INAT_ROOT_DIR, 'archives')             # compressed .tar.gz files
EXTRACTED_DIR_ON_DRIVE = os.path.join(INAT_ROOT_DIR, 'extracted')  # extracted, browseable files

# Make sure both Drive folders exist (no-op when they already do).
for _drive_dir in (ARCHIVES_DIR, EXTRACTED_DIR_ON_DRIVE):
    os.makedirs(_drive_dir, exist_ok=True)

print(f"Archive directory (storage): {ARCHIVES_DIR}")
print(f"Extracted directory (browsing): {EXTRACTED_DIR_ON_DRIVE}")

# --- 3. Define Local VM Paths (Temporary Workspace) ---
# Scratch space on the Colab VM's own disk, used for the temporary
# archive copy and the unpacked files.
LOCAL_DATA_ROOT = '/content/data_temp'
LOCAL_TAR_FILE = os.path.join(LOCAL_DATA_ROOT, 'temp.tar.gz')
LOCAL_UNPACK_DIR = os.path.join(LOCAL_DATA_ROOT, 'unpacked')

# Create the scratch directories on the VM.
for _local_dir in (LOCAL_DATA_ROOT, LOCAL_UNPACK_DIR):
    os.makedirs(_local_dir, exist_ok=True)
print(f"Temporary VM workspace created at: {LOCAL_DATA_ROOT}")

Connecting Google Drive...
Mounted at /content/drive
...Google Drive connected.
Archive directory (storage): /content/drive/MyDrive/Deep Learning/datasets/inaturalist/archives
Extracted directory (browsing): /content/drive/MyDrive/Deep Learning/datasets/inaturalist/extracted
Temporary VM workspace created at: /content/data_temp


In [5]:
#################################################################
#  PHASE 1: DOWNLOAD ARCHIVES TO GDRIVE
#################################################################
print("\n--- PHASE 1: Downloading Archives to Google Drive ---")

# --- 1a. Define Download URLs and Target Paths ---
urls = {
    "2021_train_mini": "https://ml-inat-competition-datasets.s3.amazonaws.com/2021/train_mini.tar.gz",
    "2021_valid": "https://ml-inat-competition-datasets.s3.amazonaws.com/2021/val.tar.gz"
}

files_to_download = {
    "2021_train_mini": os.path.join(ARCHIVES_DIR, '2021_train_mini.tar.gz'),
    "2021_valid": os.path.join(ARCHIVES_DIR, '2021_valid.tar.gz')
}

# --- 1b. Download Files using wget ---
for name, path in files_to_download.items():
    if not os.path.exists(path):
        print(f"\nDownloading '{name}.tar.gz'...")
        print(f"Source: {urls[name]}")
        print(f"Destination: {path}")
        print("This may take a long time. Colab Pro session recommended.")
        !wget -O "{path}" "{urls[name]}"
        print(f"...Download '{name}' complete.")
    else:
        print(f"'{name}.tar.gz' already found at {path}. Skipping download.")

print("\n--- PHASE 1 Complete: Archives are on Google Drive. ---")
!ls -lh "$ARCHIVES_DIR"


--- PHASE 1: Downloading Archives to Google Drive ---

Downloading '2021_train_mini.tar.gz'...
Source: https://ml-inat-competition-datasets.s3.amazonaws.com/2021/train_mini.tar.gz
Destination: /content/drive/MyDrive/Deep Learning/datasets/inaturalist/archives/2021_train_mini.tar.gz
This may take a long time. Colab Pro session recommended.
--2025-10-25 11:05:38--  https://ml-inat-competition-datasets.s3.amazonaws.com/2021/train_mini.tar.gz
Resolving ml-inat-competition-datasets.s3.amazonaws.com (ml-inat-competition-datasets.s3.amazonaws.com)... 52.219.103.36, 52.219.109.60, 52.219.177.20, ...
Connecting to ml-inat-competition-datasets.s3.amazonaws.com (ml-inat-competition-datasets.s3.amazonaws.com)|52.219.103.36|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 44636137542 (42G) [application/gzip]
Saving to: ‘/content/drive/MyDrive/Deep Learning/datasets/inaturalist/archives/2021_train_mini.tar.gz’


2025-10-25 11:37:10 (22.5 MB/s) - ‘/content/drive/MyDrive/Deep

In [6]:
#################################################################
#  PHASE 2: UNPACK ARCHIVES TO GDRIVE (using VM as temp)
#################################################################
print("\n--- PHASE 2: Unpacking Archives to GDrive (for browsing) ---")
print("This phase uses the local VM for fast unpacking.")

# --- 2a. List of files to process ---
files_to_process = {
    "2021_train_mini": {
        "archive_path": files_to_download["2021_train_mini"],
        "final_check_path": os.path.join(EXTRACTED_DIR_ON_DRIVE, '2021_train_mini')
    },
    "2021_valid": {
        "archive_path": files_to_download["2021_valid"],
        "final_check_path": os.path.join(EXTRACTED_DIR_ON_DRIVE, '2021_valid')
    }
}

# --- 2b. Process each file ---
for name, paths in files_to_process.items():
    print(f"\n--- Processing {name} ---")

    # Check if it's already done
    if os.path.exists(paths["final_check_path"]):
        print(f"'{name}' already exists in GDrive 'extracted' folder. Skipping.")
        continue

    # Copy archive from GDrive to local VM (Fast)
    print(f"Copying '{name}.tar.gz' from Drive to local VM...")
    start_time = time.time()
    !cp "{paths['archive_path']}" "{LOCAL_TAR_FILE}"
    print(f"...Copy complete. Took {time.time() - start_time:.2f}s.")

    # Unpack locally on VM (Fast)
    print(f"Unpacking '{name}' on local VM...")
    start_time = time.time()
    !tar -xzf "{LOCAL_TAR_FILE}" -C "{LOCAL_UNPACK_DIR}"
    print(f"...Unpack complete. Took {time.time() - start_time:.2f}s.")

    # Identify the folder name (e.g., '2021_train_mini')
    unpacked_folder_name = os.listdir(LOCAL_UNPACK_DIR)[0]
    local_path_to_copy = os.path.join(LOCAL_UNPACK_DIR, unpacked_folder_name)

    # Copy the *unpacked folder* from VM back to GDrive (Slow)
    print(f"Copying unpacked folder '{unpacked_folder_name}' from VM to GDrive...")
    print("WARNING: This will take MANY HOURS for 'train_mini'.")
    start_time = time.time()
    !cp -r "{local_path_to_copy}" "{EXTRACTED_DIR_ON_DRIVE}"
    print(f"...Copy to GDrive complete. Took {time.time() - start_time:.2f}s.")

    # Clean up local VM
    print("Cleaning up local VM workspace...")
    !rm "{LOCAL_TAR_FILE}"
    !rm -r "{local_path_to_copy}"
    print("...VM workspace clean.")

print("\n--- PHASE 2 Complete: Setup script finished. ---")
print(f"Contents of {EXTRACTED_DIR_ON_DRIVE} (for browsing):")
!ls -lh "$EXTRACTED_DIR_ON_DRIVE"


--- PHASE 2: Unpacking Archives to GDrive (for browsing) ---
This phase uses the local VM for fast unpacking.

--- Processing 2021_train_mini ---
Copying '2021_train_mini.tar.gz' from Drive to local VM...
...Copy complete. Took 416.10s.
Unpacking '2021_train_mini' on local VM...
...Unpack complete. Took 674.85s.
Copying unpacked folder 'train_mini' from VM to GDrive...
...Copy to GDrive complete. Took 6444.88s.
Cleaning up local VM workspace...
...VM workspace clean.

--- Processing 2021_valid ---
Copying '2021_valid.tar.gz' from Drive to local VM...
...Copy complete. Took 174.03s.
Unpacking '2021_valid' on local VM...
...Unpack complete. Took 158.23s.
Copying unpacked folder 'val' from VM to GDrive...
...Copy to GDrive complete. Took 1431.32s.
Cleaning up local VM workspace...
...VM workspace clean.

--- PHASE 2 Complete: Setup script finished. ---
Contents of /content/drive/MyDrive/Deep Learning/datasets/inaturalist/extracted (for browsing):
total 8.0K
drwx------ 10002 root root 4.0