# MP-100 CAPE Training on Google Colab

This notebook trains Category-Agnostic Pose Estimation (CAPE) on the MP-100 dataset using Google Colab's GPU.

## Setup Instructions
1. Enable GPU: Runtime → Change runtime type → GPU (T4 or better)
2. Run all cells in order
3. The notebook will:
   - Clone code from GitHub
   - Install dependencies
   - Authenticate to GCP
   - Mount GCS bucket with data
   - Run training with "tiny" mode


## 1. Check GPU Availability


In [None]:
# Report whether PyTorch can see a CUDA device and, when it can, print its basic specs
import torch

cuda_ok = torch.cuda.is_available()
print(f"CUDA available: {cuda_ok}")
if not cuda_ok:
    # Training on CPU is not the intended path; tell the user how to switch runtimes
    print("⚠️  No GPU detected! Please enable GPU in Runtime > Change runtime type > GPU")
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"CUDA version: {torch.version.cuda}")
    # total_memory is reported in bytes; show it in decimal gigabytes
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"GPU Memory: {total_gb:.2f} GB")


In [None]:
# Clone repository from GitHub
import os
from getpass import getpass

REPO_URL = "https://github.com/nkkrnkl/category-agnostic-pose-estimation.git"
BRANCH = "teo-branch-copy"
PROJECT_ROOT = "/content/category-agnostic-pose-estimation"

# Remove existing directory if it exists
if os.path.exists(PROJECT_ROOT):
    print(f"Removing existing directory: {PROJECT_ROOT}")
    !rm -rf {PROJECT_ROOT}

# For private repositories, you need to authenticate
# Option 1: Use Personal Access Token (recommended)
# Get token from: https://github.com/settings/tokens
# Create a token with 'repo' scope
print("For private repositories, you need to authenticate.")
print("Option 1: Enter your GitHub Personal Access Token")
print("  (Get one from: https://github.com/settings/tokens)")
print("Option 2: Press Enter to try without token (will fail if repo is private)")
print()

GITHUB_TOKEN = getpass("Enter GitHub Personal Access Token (or press Enter to skip): ")

if GITHUB_TOKEN.strip():
    # Use token in URL
    # Format: https://TOKEN@github.com/username/repo.git
    AUTH_REPO_URL = REPO_URL.replace("https://github.com/", f"https://{GITHUB_TOKEN}@github.com/")
    print(f"Cloning repository from {REPO_URL} (branch: {BRANCH})...")
    !git clone -b {BRANCH} {AUTH_REPO_URL} {PROJECT_ROOT}
else:
    # Try without token (will work if repo is public)
    print(f"Cloning repository from {REPO_URL} (branch: {BRANCH})...")
    !git clone -b {BRANCH} {REPO_URL} {PROJECT_ROOT}

# Verify clone
if os.path.exists(PROJECT_ROOT) and os.path.exists(os.path.join(PROJECT_ROOT, ".git")):
    print(f"✅ Repository cloned successfully to {PROJECT_ROOT}")
    !cd {PROJECT_ROOT} && git branch
else:
    print("❌ Failed to clone repository")
    print("\nIf the repository is private, you need to:")
    print("1. Create a Personal Access Token at: https://github.com/settings/tokens")
    print("2. Select 'repo' scope")
    print("3. Run this cell again and paste the token when prompted")


# Clone repository from GitHub
import os

REPO_URL = "https://github.com/nkkrnkl/category-agnostic-pose-estimation.git"
BRANCH = "teo-branch-copy"
PROJECT_ROOT = "/content/category-agnostic-pose-estimation"

# Remove existing directory if it exists
if os.path.exists(PROJECT_ROOT):
    print(f"Removing existing directory: {PROJECT_ROOT}")
    !rm -rf {PROJECT_ROOT}

# Clone the repository
print(f"Cloning repository from {REPO_URL} (branch: {BRANCH})...")
!git clone -b {BRANCH} {REPO_URL} {PROJECT_ROOT}

# Verify clone
if os.path.exists(PROJECT_ROOT):
    print(f"✅ Repository cloned successfully to {PROJECT_ROOT}")
    !cd {PROJECT_ROOT} && git branch
else:
    print("❌ Failed to clone repository")


## 3. Install Requirements


In [None]:
# Install additional dependencies needed for plot_utils and other utilities
# (descartes, shapely, etc. - these are in requirements.txt but not requirements_cape.txt)
print("Installing additional dependencies (descartes, shapely, etc.)...")
!pip install -q descartes shapely>=1.8.0
print("✅ Additional dependencies installed!")


In [None]:
# Install requirements
import os

PROJECT_ROOT = "/content/category-agnostic-pose-estimation"
REQUIREMENTS_FILE = os.path.join(PROJECT_ROOT, "requirements_cape.txt")

print("Installing requirements from requirements_cape.txt...")
!cd {PROJECT_ROOT} && pip install -q -r {REQUIREMENTS_FILE}

# Install detectron2 for CUDA 11.8 (Colab typically has CUDA 11.8)
print("\nInstalling detectron2...")
!pip install -q 'git+https://github.com/facebookresearch/detectron2.git'

print("✅ All dependencies installed!")


## 4. Authenticate to GCP


In [None]:
# Authenticate to GCP
from google.colab import auth

print("Authenticating to GCP...")
auth.authenticate_user()

# Set GCP project
GCP_PROJECT = "dl-category-agnostic-pose-est"
!gcloud config set project {GCP_PROJECT}

print(f"✅ Authenticated to GCP project: {GCP_PROJECT}")


## 5. Mount GCS Bucket


In [None]:
# Verify data access before training.
# This check only passes once the GCS bucket has been mounted and the data symlink
# has been created, so re-run it after those cells.
import os
from pathlib import Path

PROJECT_ROOT = "/content/category-agnostic-pose-estimation"
DATA_DIR = os.path.join(PROJECT_ROOT, "data")
IMAGE_EXTENSIONS = ('.jpg', '.png', '.jpeg')


def is_image_file(filename):
    """Return True if ``filename`` ends with a known image extension.

    The comparison is case-insensitive so that names such as ``IMG_01.JPG``
    are recognised as images too.
    """
    return filename.lower().endswith(IMAGE_EXTENSIONS)


print("Verifying data access...")
print(f"Data directory: {DATA_DIR}")
print(f"Exists: {os.path.exists(DATA_DIR)}")
print(f"Is symlink: {os.path.islink(DATA_DIR)}")

if os.path.exists(DATA_DIR):
    # Check if we can list directories
    try:
        # Sorted so the sampled category is the same on every run
        categories = sorted(
            d for d in os.listdir(DATA_DIR) if os.path.isdir(os.path.join(DATA_DIR, d))
        )
        print(f"✅ Found {len(categories)} category directories")
        if categories:
            print(f"   First 5 categories: {categories[:5]}")

            # Try to access a file in the first category
            first_cat = categories[0]
            cat_dir = os.path.join(DATA_DIR, first_cat)
            files = [f for f in os.listdir(cat_dir) if is_image_file(f)]
            if files:
                test_file = os.path.join(cat_dir, files[0])
                print(f"   Test file exists: {os.path.exists(test_file)}")
                print(f"   Test file: {test_file}")
            else:
                print(f"   ⚠️  No image files found in {first_cat}")
    except OSError as e:
        # Filesystem errors are what a broken mount produces; anything else is a real bug
        print(f"❌ Error accessing data directory: {e}")
        print("   This might indicate the GCS mount is not working properly")
else:
    print(f"❌ Data directory does not exist: {DATA_DIR}")
    print("   Please check:")
    print("   1. GCS bucket is mounted")
    print("   2. Data symlink is created")
    print("   3. This cell was re-run after the mount and symlink cells")


In [None]:
# Mount GCS bucket using gcsfuse
import os

PROJECT_ROOT = "/content/category-agnostic-pose-estimation"
BUCKET_NAME = "dl-category-agnostic-pose-mp100-data"
MOUNT_POINT = os.path.join(PROJECT_ROOT, "Raster2Seq_internal-main", "data")

# Install gcsfuse if not already installed
!apt-get update -qq
!apt-get install -qq gcsfuse

# Create mount point directory
os.makedirs(MOUNT_POINT, exist_ok=True)

# Mount the bucket
print(f"Mounting gs://{BUCKET_NAME} to {MOUNT_POINT}...")
!gcsfuse --implicit-dirs {BUCKET_NAME} {MOUNT_POINT}

# Verify mount
if os.path.exists(MOUNT_POINT):
    print(f"✅ GCS bucket mounted successfully!")
    print(f"Mount point: {MOUNT_POINT}")
    # List a few items to verify
    !ls {MOUNT_POINT} | head -10
else:
    print("❌ Failed to mount GCS bucket")


## 6. Create Data Symlink


In [None]:
# Create symlink from data to mounted GCS bucket (as expected by START_TRAINING.sh)
import os

PROJECT_ROOT = "/content/category-agnostic-pose-estimation"
MOUNTED_DATA = os.path.join(PROJECT_ROOT, "Raster2Seq_internal-main", "data")
DATA_SYMLINK = os.path.join(PROJECT_ROOT, "data")


def link_data_dir(target, link_path):
    """Point ``link_path`` at ``target`` with a symlink, replacing any old symlink.

    Args:
        target: Directory the link should resolve to; must already exist.
        link_path: Location of the symlink to create.

    Returns:
        True if the symlink was created. False if ``link_path`` is occupied by
        something that is not a symlink (left untouched) or ``target`` is missing.
    """
    # islink() is checked first because exists() follows links and reports False
    # for a dangling one, which would then make os.symlink fail with FileExistsError.
    if os.path.islink(link_path):
        os.unlink(link_path)
    elif os.path.exists(link_path):
        # A real file or directory is never deleted here; creating the link would
        # raise, so report and stop.
        print(f"Warning: {link_path} exists and is not a symlink")
        return False

    if not os.path.exists(target):
        print(f"⚠️  Mounted data not found at {target}")
        return False

    os.symlink(target, link_path)
    print(f"✅ Created symlink: {link_path} -> {target}")
    return True


data_linked = link_data_dir(MOUNTED_DATA, DATA_SYMLINK)


In [None]:
## 7. Run Training


# Run training using START_TRAINING.sh with "tiny" mode


In [None]:
# Run START_TRAINING.sh with "tiny" mode
import os

PROJECT_ROOT = "/content/category-agnostic-pose-estimation"
TRAINING_SCRIPT = os.path.join(PROJECT_ROOT, "START_TRAINING.sh")

# Make script executable
!chmod +x {TRAINING_SCRIPT}

# Change to project directory and run training
print("Starting training with 'tiny' mode...")
print("This will run 5 epochs with batch_size 8 (~30-60 min)")
print("=" * 80)

!cd {PROJECT_ROOT} && bash {TRAINING_SCRIPT} tiny


In [None]:
## 8. Monitor Training (Optional)


In [None]:
# Check training logs: print a short summary of the last few entries of log.txt
import json
import os
from pathlib import Path

PROJECT_ROOT = "/content/category-agnostic-pose-estimation"
OUTPUT_DIR = os.path.join(PROJECT_ROOT, "output", "tiny_test", "tiny_test")
LOG_FILE = os.path.join(OUTPUT_DIR, "log.txt")


def summarize_log_line(line):
    """Format one log line (expected to hold a JSON object) for display.

    Args:
        line: Raw text of a single line from the log file.

    Returns:
        A list of three display strings (epoch, train loss, validation loss),
        with 'N/A' for missing keys, or None when the line is not a JSON object.
    """
    try:
        stats = json.loads(line.strip())
    except json.JSONDecodeError:
        return None
    if not isinstance(stats, dict):
        # Valid JSON that is not an object (e.g. a bare number) has no fields to show
        return None
    return [
        f"  Epoch {stats.get('epoch', 'N/A')}: ",
        f"    Train Loss: {stats.get('train_loss', stats.get('loss', 'N/A'))}",
        f"    Val Loss: {stats.get('test_loss', 'N/A')}",
    ]


if os.path.exists(LOG_FILE):
    print(f"Reading log file: {LOG_FILE}")
    with open(LOG_FILE, 'r') as f:
        # Ignore blank lines so a trailing newline does not take up one of the
        # "last 3" slots
        entries = [ln for ln in f if ln.strip()]
    print(f"Total log entries: {len(entries)}")
    if entries:
        print("\nLast 3 entries:")
        for entry in entries[-3:]:
            summary = summarize_log_line(entry)
            if summary is None:
                # Surface malformed lines instead of silently dropping them
                print(f"  (skipped unparseable line: {entry.strip()[:80]!r})")
            else:
                print("\n".join(summary))
else:
    print(f"Log file not found: {LOG_FILE}")
    print("\nAvailable output directories:")
    output_base = os.path.join(PROJECT_ROOT, "output")
    if os.path.exists(output_base):
        for d in os.listdir(output_base):
            print(f"  - {os.path.join(output_base, d)}")


In [None]:
## 9. Download Results (Optional)


In [None]:
# Download checkpoints and logs as a single zip archive
from google.colab import files
from pathlib import Path
import zipfile
import os

PROJECT_ROOT = "/content/category-agnostic-pose-estimation"
OUTPUT_BASE = os.path.join(PROJECT_ROOT, "output")

# Find all checkpoints, plus the log.txt files written next to them.
# Sorted so the archive layout is the same on every run.
checkpoints = sorted(Path(OUTPUT_BASE).rglob("checkpoint*.pth"))
log_files = sorted(Path(OUTPUT_BASE).rglob("log.txt"))

if checkpoints:
    print(f"Found {len(checkpoints)} checkpoint(s)")
    print(f"Found {len(log_files)} log file(s)")

    # Create zip with all checkpoints and logs
    zip_path = "/content/checkpoints.zip"
    with zipfile.ZipFile(zip_path, 'w') as zipf:
        for artifact in checkpoints + log_files:
            # Preserve relative path structure
            rel_path = os.path.relpath(artifact, PROJECT_ROOT)
            zipf.write(artifact, rel_path)

    print(f"\nDownloading {zip_path}...")
    files.download(zip_path)
    print("✅ Download complete!")
else:
    print("No checkpoints found yet.")
    print(f"Output directory: {OUTPUT_BASE}")
