In [13]:
# Verify GPU is available
import torch
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("‚ö†Ô∏è WARNING: No GPU detected! Change Runtime > Change runtime type > GPU")

PyTorch version: 2.9.0+cu126
CUDA available: True
GPU: NVIDIA A100-SXM4-40GB
GPU Memory: 42.47 GB


In [14]:
# Authenticate with Google Cloud
from google.colab import auth
auth.authenticate_user()

# Install gcsfuse for mounting GCS bucket
!echo "deb https://packages.cloud.google.com/apt gcsfuse-bionic main" | sudo tee /etc/apt/sources.list.d/gcsfuse.list
!curl https://packages.cloud.google.com/apt/doc/apt-key.gpg | sudo apt-key add -
!sudo apt-get update
!sudo apt-get install -y gcsfuse

# Create mount point and mount the GCS bucket
!mkdir -p /content/gcs_data
# Unmount first in case it is already mounted to avoid errors on re-run
!fusermount -u /content/gcs_data 2>/dev/null || true
# Mount with --implicit-dirs to show virtual directories
!gcsfuse --implicit-dirs dl-category-agnostic-pose-mp100-data /content/gcs_data

# Verify mount
!ls -lh /content/gcs_data/

deb https://packages.cloud.google.com/apt gcsfuse-bionic main
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  1022  100  1022    0     0  14109      0 --:--:-- --:--:-- --:--:-- 14194
OK
Hit:1 https://cli.github.com/packages stable InRelease
Hit:2 https://cloud.r-project.org/bin/linux/ubuntu jammy-cran40/ InRelease
Hit:3 https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64  InRelease
Hit:4 https://packages.cloud.google.com/apt gcsfuse-bionic InRelease
Hit:5 http://security.ubuntu.com/ubuntu jammy-security InRelease
Hit:6 http://archive.ubuntu.com/ubuntu jammy InRelease
Hit:7 http://archive.ubuntu.com/ubuntu jammy-updates InRelease
Hit:8 http://archive.ubuntu.com/ubuntu jammy-backports InRelease
Hit:9 https://ppa.launchpadcontent.net/deadsnakes/ppa/ubuntu jammy InRelease
Hit:10 https://ppa.launchpadcontent.net/graphics-drivers/ppa/ubuntu jammy InReleas

In [15]:
import os

# path to the mounted data
data_path = '/content/gcs_data'

if os.path.exists(data_path):
    # List directories in the mount point
    dirs = [d for d in os.listdir(data_path) if os.path.isdir(os.path.join(data_path, d))]
    print(f"Found {len(dirs)} directories in {data_path}.")
    print("First 5 directories:", dirs[:5])
else:
    print(f"Path {data_path} does not exist. The mount might have failed.")

Found 100 directories in /content/gcs_data.
First 5 directories: ['amur_tiger_body', 'annotations', 'antelope_body', 'arcticwolf_face', 'beaver_body']


In [16]:
# Clone your repository (use the branch with your latest changes)
import os
from getpass import getpass

repo_name = 'category-agnostic-pose-estimation'
user_name = 'nkkrnkl'

if not os.path.exists(repo_name):
    print("Repository not found locally. Attempting to clone...")
    print("If the repository is private, you will need a Personal Access Token (PAT).")

    # Prompt for token securely
    token = getpass('Enter your GitHub PAT (leave empty to try public clone): ')

    if token.strip():
        !git clone https://{token}@github.com/{user_name}/{repo_name}.git
    else:
        !git clone https://github.com/{user_name}/{repo_name}.git
else:
    print("Repository already exists.")

if os.path.exists(repo_name):
    %cd {repo_name}

    # Checkout specific branch if needed (e.g., pavlos-topic)
    !git fetch origin pavlos-topic
    !git checkout pavlos-topic

    # Verify the clone
    !ls -lh
else:
    print("Failed to clone repository. Please check the URL or your permissions.")

Repository already exists.
/content/category-agnostic-pose-estimation/category-agnostic-pose-estimation
From https://github.com/nkkrnkl/category-agnostic-pose-estimation
 * branch            pavlos-topic -> FETCH_HEAD
Already on 'pavlos-topic'
Your branch is up to date with 'origin/pavlos-topic'.
total 164K
drwxr-xr-x 2 root root 4.0K Nov 25 23:17 annotations
-rw-r--r-- 1 root root  16K Nov 25 23:17 AUXILIARY_LOSS_EXPLANATION.md
-rw-r--r-- 1 root root  112 Nov 25 23:17 category-agnostic-pose-estimation.code-workspace
-rw-r--r-- 1 root root 3.3K Nov 25 23:17 category_splits.json
-rwxr-xr-x 1 root root 1.9K Nov 25 23:17 commit_and_push.sh
drwxr-xr-x 2 root root 4.0K Nov 25 23:17 datasets
drwxr-xr-x 2 root root 4.0K Nov 25 23:17 docs
-rw-r--r-- 1 root root 9.8K Nov 25 23:17 EOS_FIX_COMPLETE.md
-rw-r--r-- 1 root root  12K Nov 25 23:17 EOS_FIX_IMPLEMENTATION_SUMMARY.md
-rw-r--r-- 1 root root 5.2K Nov 25 23:17 FIXES_APPLIED_README.md
-rw-r--r-- 1 root root  12K Nov 25 23:17 KEYPOINT_COUNT_DI

In [18]:
# Install dependencies with relaxed constraints for Python 3.12
# We remove strict pinning for libraries that cause conflicts on this runtime

# Install project dependencies
!pip install -r requirements_cape.txt

# Install additional dependencies
# Using >= to allow compatible newer versions on Python 3.12
!pip install opencv-python "matplotlib>=3.6.0" "imageio>=2.19.0"
!pip install "scipy>=1.10.0" fvcore cloudpickle==2.1.0
!pip install omegaconf==2.2.2 fairscale==0.4.6 "timm>=0.5.4"
!pip install shapely==1.8.2 tqdm==4.64.0 pycocotools
!pip install "scikit-image>=0.19.0"

# Check for MultiScaleDeformableAttention compilation
import os
import sys

ops_dir = os.path.join(os.getcwd(), 'models', 'ops')
if os.path.exists(os.path.join(ops_dir, 'setup.py')):
    print(f"Compiling MultiScaleDeformableAttention in {ops_dir}...")
    !cd {ops_dir} && python setup.py build install
else:
    print(f"‚ÑπÔ∏è No setup.py found in {ops_dir}. Assuming pure Python implementation or pre-compiled.")

# Verify installations
import torch
import torchvision
import numpy as np
print(f"‚úì PyTorch: {torch.__version__}")
print(f"‚úì CUDA: {torch.version.cuda}")
print("‚úì Dependencies installed")

Collecting tqdm>=4.65.0 (from -r requirements_cape.txt (line 31))
  Using cached tqdm-4.67.1-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.67.1-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.64.0
    Uninstalling tqdm-4.64.0:
      Successfully uninstalled tqdm-4.64.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spopt 0.7.0 requires shapely>=2.1.0, but you have shapely 1.8.2 which is incompatible.
momepy 0.10.0 requires shapely>=2, but you have shapely 1.8.2 which is incompatible.
pysal 25.7 requires shapely>=2.0.1, but you have shapely 1.8.2 which is incompatible.[0m[31m
[0mSuccessfully installed tqdm-4.67.1


Collecting tqdm==4.64.0
  Using cached tqdm-4.64.0-py2.py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.64.0-py2.py3-none-any.whl (78 kB)
Installing collected packages: tqdm
  Attempting uninstall: tqdm
    Found existing installation: tqdm 4.67.1
    Uninstalling tqdm-4.67.1:
      Successfully uninstalled tqdm-4.67.1
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
spopt 0.7.0 requires shapely>=2.1.0, but you have shapely 1.8.2 which is incompatible.
spopt 0.7.0 requires tqdm>=4.66.0, but you have tqdm 4.64.0 which is incompatible.
dataproc-spark-connect 0.8.3 requires tqdm>=4.67, but you have tqdm 4.64.0 which is incompatible.
momepy 0.10.0 requires shapely>=2, but you have shapely 1.8.2 which is incompatible.
momepy 0.10.0 requires tqdm>=4.65, but you have tqdm 4.64.0 which is incompatible.
pysal 25.7 requires shapely>=2.0.1, but you have shapely 1

‚ÑπÔ∏è No setup.py found in /content/category-agnostic-pose-estimation/category-agnostic-pose-estimation/models/ops. Assuming pure Python implementation or pre-compiled.
‚úì PyTorch: 2.9.0+cu126
‚úì CUDA: 12.6
‚úì Dependencies installed


In [21]:
import os

# 1. Verify where MultiScaleDeformableAttention is defined
# If this returns a file like 'models/deformable_transformer.py', we are good.
print("Searching for MultiScaleDeformableAttention definition...")
!grep -r "class MultiScaleDeformableAttention" .

# 2. Create symbolic links so the code can access GCS data
# The dataset expects 'data/' and 'annotations/' in the project root
os.chdir('/content/category-agnostic-pose-estimation')

print("\nSetting up data links...")
# Force create symlinks to GCS bucket
!ln -sf /content/gcs_data data
!ln -sf /content/gcs_data/annotations annotations

# Verify the links work
!ls -lh data/ | head -5
!ls -lh annotations/

Searching for MultiScaleDeformableAttention definition...

Setting up data links...
total 0
drwxr-xr-x 1 root root 0 Nov 25 23:27 amur_tiger_body
drwxr-xr-x 1 root root 0 Nov 25 23:27 annotations
drwxr-xr-x 1 root root 0 Nov 25 23:27 antelope_body
drwxr-xr-x 1 root root 0 Nov 25 23:27 arcticwolf_face
total 320M
lrwxrwxrwx 1 root root   29 Nov 25 23:28 annotations -> /content/gcs_data/annotations
-rw-r--r-- 1 root root 2.8M Nov 25 23:07 mp100_split1_test.json
-rw-r--r-- 1 root root 4.1M Nov 25 23:07 mp100_split1_test.json.backup
-rw-r--r-- 1 root root  21M Nov 25 23:07 mp100_split1_train.json
-rw-r--r-- 1 root root  32M Nov 25 23:07 mp100_split1_train.json.backup
-rw-r--r-- 1 root root 2.1M Nov 25 23:07 mp100_split1_val.json
-rw-r--r-- 1 root root 3.9M Nov 25 23:07 mp100_split1_val.json.backup
-rw-r--r-- 1 root root 3.5M Nov 25 23:07 mp100_split2_test.json
-rw-r--r-- 1 root root 5.3M Nov 25 23:07 mp100_split2_test.json.backup
-rw-r--r-- 1 root root  18M Nov 25 23:07 mp100_split2_train.j

In [26]:
# Set training configuration
import os

# Ensure we are in the repository root so python can find 'models'
repo_path = '/content/category-agnostic-pose-estimation'
if os.path.exists(repo_path):
    os.chdir(repo_path)
    print(f"Current working directory: {os.getcwd()}")
else:
    print(f"‚ö†Ô∏è Warning: Repository not found at {repo_path}")

# Output directory (save to Google Drive or GCS)
OUTPUT_DIR = "/content/output_cape_colab"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# Training parameters
EPOCHS = 300  # Use 5 for quick test, 300 for full training
EPISODES_PER_EPOCH = 1000  # Use 100 for quick test
BATCH_SIZE = 2
NUM_QUERIES = 2
DEVICE = "cuda:0"

print(f"Configuration:")
print(f"  Epochs: {EPOCHS}")
print(f"  Episodes/epoch: {EPISODES_PER_EPOCH}")
print(f"  Batch size: {BATCH_SIZE}")
print(f"  Device: {DEVICE}")
print(f"  Output: {OUTPUT_DIR}")

Current working directory: /content/category-agnostic-pose-estimation
Configuration:
  Epochs: 300
  Episodes/epoch: 1000
  Batch size: 2
  Device: cuda:0
  Output: /content/output_cape_colab


In [23]:
# Mount Google Drive to save checkpoints (recommended for persistence)
from google.colab import drive
drive.mount('/content/drive')

# Update output directory to save in Drive
OUTPUT_DIR = "/content/drive/MyDrive/cape_training_output"
os.makedirs(OUTPUT_DIR, exist_ok=True)
print(f"Checkpoints will be saved to: {OUTPUT_DIR}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Checkpoints will be saved to: /content/drive/MyDrive/cape_training_output


In [28]:
# Inspect the training script to see how vocab_size is passed to the model
!grep -C 5 "vocab_size" models/train_cape_episodic.py

# Also check the model definition to see what vocab_size it expects
!grep -C 5 "Embedding" models/roomformer_v2.py | head -20

    parser.add_argument('--poly2seq', action='store_true', default=True,
                        help='Enable poly2seq mode (sequence-to-sequence for keypoints)')
    parser.add_argument('--num_queries', default=200, type=int)
    parser.add_argument('--seq_len', default=200, type=int)
    parser.add_argument('--num_polys', default=1, type=int)
    parser.add_argument('--vocab_size', default=2000, type=int)
    parser.add_argument('--masked_attn', action='store_true', default=False)

    parser.add_argument('--dec_n_points', default=4, type=int)
    parser.add_argument('--enc_n_points', default=4, type=int)
    parser.add_argument('--query_pos_type', default='sine', type=str)
--
    val_dataset = build_mp100_cape('val', args)
    
    # Get tokenizer from dataset
    tokenizer = train_dataset.get_tokenizer()
    print(f"Tokenizer: {tokenizer}")
    print(f"  vocab_size: {len(tokenizer) if tokenizer else 'N/A'}")
    print(f"  num_bins: {tokenizer.num_bins if tokenizer else 'N/A'}")
   

In [29]:
# Read the training script to find where the model is built
with open('models/train_cape_episodic.py', 'r') as f:
    lines = f.readlines()

print("Searching for model initialization in train_cape_episodic.py...")
for i, line in enumerate(lines):
    if "build_model" in line or "Raster2Seq" in line or "vocab_size" in line:
        # Print context around the match
        start = max(0, i - 5)
        end = min(len(lines), i + 5)
        print(f"--- Lines {start}-{end} ---")
        print("".join(lines[start:end]))
        print("---------------------------")

Searching for model initialization in train_cape_episodic.py...
--- Lines 35-45 ---
import numpy as np
import torch
from torch.utils.data import DataLoader
import util.misc as utils
from datasets import build_dataset
from models import build_model
from models.cape_model import build_cape_model
from models.cape_losses import build_cape_criterion

# Will create these functions below

---------------------------
--- Lines 112-122 ---
    parser.add_argument('--poly2seq', action='store_true', default=True,
                        help='Enable poly2seq mode (sequence-to-sequence for keypoints)')
    parser.add_argument('--num_queries', default=200, type=int)
    parser.add_argument('--seq_len', default=200, type=int)
    parser.add_argument('--num_polys', default=1, type=int)
    parser.add_argument('--vocab_size', default=2000, type=int)
    parser.add_argument('--masked_attn', action='store_true', default=False)

    parser.add_argument('--dec_n_points', default=4, type=int)
    parser.ad

In [27]:
import sys
import os

# --- FIX: Patch vocab_size mismatch ---
# The dataset has 1940 tokens, but model likely defaults to less.
# We patch the script to force vocab_size = 2000 before model build.
script_path = 'models/train_cape_episodic.py'
if os.path.exists(script_path):
    with open(script_path, 'r') as f:
        content = f.read()

    # Only patch if not already patched
    if 'args.vocab_size = 2000' not in content:
        print("üîß Patching vocab_size in training script...")
        # Inject the fix before the model is initialized
        new_content = content.replace(
            'model = build_model(args)',
            'if hasattr(args, "vocab_size"): args.vocab_size = 2000\n    model = build_model(args)'
        )

        if new_content != content:
            with open(script_path, 'w') as f:
                f.write(new_content)
            print("‚úÖ Script patched: Set vocab_size to 2000 to match dataset.")
        else:
            print("‚ö†Ô∏è Could not auto-patch script. Please check 'build_model' call.")
    else:
        print("‚úÖ Script already patched.")
# --------------------------------------

# Check if variables are defined (in case user skipped config cell)
try:
    # Verify critical variables exist
    _ = [EPOCHS, BATCH_SIZE, NUM_QUERIES, EPISODES_PER_EPOCH, OUTPUT_DIR, DEVICE]

    print(f"üöÄ Starting training with configuration:")
    print(f"  Epochs: {EPOCHS}")
    print(f"  Batch Size: {BATCH_SIZE}")
    print(f"  Device: {DEVICE}")
    print(f"  Output Dir: {OUTPUT_DIR}")

except NameError as e:
    print(f"‚ùå Configuration missing. Please run the 'Set training configuration' cell above first.\nError: {e}")
    sys.exit(1)

# Run training script
!python3 -m models.train_cape_episodic \
    --epochs {EPOCHS} \
    --batch_size {BATCH_SIZE} \
    --accumulation_steps 4 \
    --num_queries_per_episode {NUM_QUERIES} \
    --episodes_per_epoch {EPISODES_PER_EPOCH} \
    --early_stopping_patience 20 \
    --output_dir "{OUTPUT_DIR}" \
    --dataset_root /content/category-agnostic-pose-estimation \
    --device {DEVICE}

üöÄ Starting training with configuration:
  Epochs: 300
  Batch Size: 2
  Device: cuda:0
  Output Dir: /content/output_cape_colab
Category-Agnostic Pose Estimation (CAPE) - Episodic Training

Mode: Episodic meta-learning with support pose graphs
Support encoder layers: 3
Fusion method: cross_attention
Queries per episode: 2
Episodes per epoch: 1000

Using device: cuda:0
  GPU: NVIDIA A100-SXM4-40GB
  CUDA Version: 12.6
  GPU Memory: 39.56 GB

  A.GaussNoise(
loading annotations into memory...
Done (t=0.27s)
creating index...
index created!
üìä Multi-instance statistics:
   - Images with multiple instances: 615/12816 (4.8%)
   - Total instances available: 13712
   - Instances actually used: 12816 (93.5%)
   - Instances skipped: 896 (6.5%)
   - Max instances in single image: 9
   ‚ö†Ô∏è  Note: Currently using only first instance per image
Loaded MP-100 train dataset: 12816 images
loading annotations into memory...
Done (t=0.03s)
creating index...
index created!
üìä Multi-instance stat