# 🎬 YOWO Multi-Task Training on Google Colab

**Model**: `yowo_v2_x3d_m_yolo11m_multitask`  
**Dataset**: Charades + Action Genome (288K keyframes, 219 classes)

### Optimized Batch Sizes (with AMP)

| GPU | VRAM | Batch | Accum | Effective | Est. Time/Epoch |
|-----|------|-------|-------|-----------|-----------------|
| T4 | 16GB | 8 | 4 | 32 | ~4 hours |
| L4 | 24GB | 12 | 4 | 48 | ~2.5 hours |
| V100 | 16GB | 10 | 4 | 40 | ~2 hours |
| A100 | 40GB | 32 | 2 | 64 | ~50 min |
| A100 | 80GB | 64 | 2 | 128 | ~30 min |
| H100 | 80GB | 80 | 2 | 160 | ~20 min |

**Features**: AMP (FP16), Multi-head (Objects + Actions + Relationships)


In [None]:
# Cell 1: Check GPU & Auto-Configure Batch Size
import re

import torch

print("=" * 70)
print("üîç GPU Detection & Configuration")
print("=" * 70)

if not torch.cuda.is_available():
    raise RuntimeError("‚ùå No GPU! Go to Runtime > Change runtime type > GPU")

gpu_name = torch.cuda.get_device_name(0)
gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9

print(f"‚úÖ GPU: {gpu_name}")
print(f"‚úÖ VRAM: {gpu_memory_gb:.1f} GB")


# =============================================================================
# OPTIMIZED BATCH SIZES FOR YOWO V2 + X3D-M + YOLO11m WITH AMP
# Based on empirical testing of video action detection models
# AMP reduces memory by ~40%, allowing larger batches
# =============================================================================
def _select_batch_config(name, memory_gb):
    """Return ``(batch_size, accumulate)`` for a GPU name and its VRAM in GB.

    Model names are matched case-insensitively and as whole tokens, so "L4"
    matches "NVIDIA L4" but not "NVIDIA L40S"; GPUs that match no preset are
    sized from ``memory_gb`` alone.

    Args:
        name: Device name as reported by ``torch.cuda.get_device_name``.
        memory_gb: Total device memory in units of 1e9 bytes.

    Returns:
        A ``(batch_size, accumulate)`` tuple of ints.
    """
    upper_name = name.upper()

    def is_model(model):
        # Reject matches that are glued to other letters/digits ("L40", "RTX4...").
        pattern = rf"(?<![A-Z0-9]){model}(?![A-Z0-9])"
        return re.search(pattern, upper_name) is not None

    if is_model("A100"):
        # The 40GB and 80GB cards share a model name; tell them apart by VRAM.
        if memory_gb > 45:  # A100 80GB
            return 64, 2    # Effective: 128 (can try 80 if stable)
        return 32, 2        # A100 40GB - Effective: 64 (can try 40-48)

    presets = (
        ("H100", (80, 2)),  # Effective: 160 (can try 96)
        ("L4", (12, 4)),    # Effective: 48
        ("T4", (8, 4)),     # Effective: 32 (can try 10)
        ("V100", (10, 4)),  # Effective: 40
        ("P100", (6, 4)),   # Effective: 24
    )
    for model, config in presets:
        if is_model(model):
            return config

    # Unknown GPU - use conservative settings based on memory
    if memory_gb >= 40:
        return 32, 2
    if memory_gb >= 20:
        return 12, 4
    return 8, 4


BATCH_SIZE, ACCUMULATE = _select_batch_config(gpu_name, gpu_memory_gb)

effective = BATCH_SIZE * ACCUMULATE
print(f"\nüì¶ Optimized for {gpu_name}:")
print(f"   batch_size = {BATCH_SIZE}")
print(f"   accumulate = {ACCUMULATE}")
print(f"   effective_batch = {effective}")
print(f"\nüí° If OOM: reduce BATCH_SIZE by 2, increase ACCUMULATE proportionally")
print("=" * 70)


In [None]:
# Cell 2: Mount Google Drive
import os

from google.colab import drive

# Attach Drive, then report whether the frames archive is present on it.
drive.mount('/content/drive')

TAR_PATH = "/content/drive/MyDrive/yooowo/frames.tar"
if not os.path.exists(TAR_PATH):
    print(f"‚ùå frames.tar not found at {TAR_PATH}")
else:
    size_gb = os.path.getsize(TAR_PATH) / 1e9
    print(f"‚úÖ Found frames.tar ({size_gb:.2f} GB)")


In [None]:
# Cell 3: Clone Repository & Install Dependencies
# Work from /content and delete any earlier checkout before cloning, so
# re-running this cell starts from a fresh copy of the repository.
%cd /content
!rm -rf yowo
!git clone https://github.com/michelsedgh/yowo.git
%cd yowo
# Quiet (-q) install of the Python packages listed on the command line.
!pip install -q torch torchvision opencv-python thop scipy matplotlib numpy imageio pytorchvideo ultralytics tensorboard
# NOTE(review): this message is printed unconditionally; it does not check
# whether the clone or install above succeeded - confirm from their output.
print("‚úÖ Repository cloned and dependencies installed!")


In [None]:
# Cell 4: Download Annotations & Extract Frames
import os, time, requests, zipfile

# Dataset layout: frames and annotations live side by side under DATA_ROOT.
DATA_ROOT = "/content/yowo/data/ActionGenome"
FRAMES_DIR, ANN_DIR = (
    os.path.join(DATA_ROOT, "frames"),
    os.path.join(DATA_ROOT, "annotations"),
)
TAR_PATH = "/content/drive/MyDrive/yooowo/frames.tar"

# The annotation directory must exist before anything is downloaded into it.
os.makedirs(ANN_DIR, exist_ok=True)

# =============================================================================
# STEP 1: Download Action Genome annotations (PKL files NOT in git repo!)
# =============================================================================
print(
    "=" * 60,
    "üì• STEP 1: Downloading Action Genome Annotations",
    "=" * 60,
    sep="\n",
)

def download_file(url, filepath):
    """Download ``url`` to ``filepath`` unless ``filepath`` already exists.

    The body is streamed into ``filepath + ".part"`` and renamed into place
    only after the transfer completes, so an interrupted download never
    leaves a truncated file that a later run would report as already present.

    Args:
        url: HTTP(S) address of the file.
        filepath: Destination path on local disk.

    Returns:
        True if the file already existed or was downloaded successfully;
        False if the server did not answer 200 or any error occurred.
    """
    name = os.path.basename(filepath)
    if os.path.exists(filepath):
        size = os.path.getsize(filepath) / 1e6
        print(f"   ‚úÖ {name} exists ({size:.1f} MB)")
        return True

    print(f"   Downloading {name}...")
    partial_path = filepath + ".part"
    try:
        # The context manager releases the connection even on early exit.
        with requests.get(url, stream=True, timeout=120) as response:
            if response.status_code != 200:
                print(f"   ‚ùå Failed: HTTP {response.status_code}")
                return False
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        os.replace(partial_path, filepath)
    except Exception as e:
        # Best-effort download: report the problem and let the caller decide.
        print(f"   ‚ùå Failed: {e}")
        try:
            if os.path.exists(partial_path):
                os.remove(partial_path)
        except OSError:
            pass  # a leftover .part file is harmless; it is overwritten next run
        return False

    size = os.path.getsize(filepath) / 1e6
    print(f"   ‚úÖ Downloaded ({size:.1f} MB)")
    return True

import shutil

# Action Genome annotations from STAR Benchmark S3
ag_files = {
    'object_bbox_and_relationship.pkl': 'https://star-benchmark.s3.us-east.cloud-object-storage.appdomain.cloud/Annotations/object_bbox_and_relationship.pkl',
    'person_bbox.pkl': 'https://star-benchmark.s3.us-east.cloud-object-storage.appdomain.cloud/Annotations/person_bbox.pkl',
    'classes.zip': 'https://star-benchmark.s3.us-east.cloud-object-storage.appdomain.cloud/Annotations/classes.zip'
}

# download_file() returns False on failure; collect those names so a failed
# download is reported here instead of surfacing later as a missing file.
failed_downloads = []
for filename, url in ag_files.items():
    if not download_file(url, os.path.join(ANN_DIR, filename)):
        failed_downloads.append(filename)
if failed_downloads:
    print(f"   WARNING: could not download: {', '.join(failed_downloads)}")

# Extract classes.zip if needed
classes_zip = os.path.join(ANN_DIR, 'classes.zip')
if os.path.exists(classes_zip) and not os.path.exists(os.path.join(ANN_DIR, 'object_classes.txt')):
    print("   Extracting classes.zip...")
    try:
        with zipfile.ZipFile(classes_zip, 'r') as z:
            z.extractall(ANN_DIR)
    except zipfile.BadZipFile as e:
        # Remove the unreadable archive so the next run downloads it again
        # rather than skipping it as "already exists".
        print(f"   ERROR: classes.zip is not a valid zip file ({e}); "
              "removed it - re-run this cell to download it again")
        os.remove(classes_zip)
    else:
        # Move files from classes/ subdirectory if needed
        classes_subdir = os.path.join(ANN_DIR, 'classes')
        if os.path.exists(classes_subdir):
            for f in os.listdir(classes_subdir):
                shutil.move(os.path.join(classes_subdir, f), os.path.join(ANN_DIR, f))
            shutil.rmtree(classes_subdir)
        print("   ‚úÖ Extracted class files")



In [None]:
import os
import subprocess
import google.auth  # <--- FIXED: Added missing import
from google.colab import auth
from google.auth.transport.requests import Request
from google.oauth2 import credentials

# ==============================================================================
# CONFIGURATION
# ==============================================================================
# Google Drive file ID of the frames archive; it is inserted into the Drive
# API download URL.
FILE_ID = "1GuRdUMP5qrqyYN0gg8C2B6tLwJeigyFd"  # Your 154GB File ID
# Local path the archive is downloaded to.
LOCAL_FILE = "/content/frames.tar"
# Directory where the archive is mounted so its contents appear as files.
MOUNT_POINT = "/content/yowo/data/ActionGenome/frames"
# ==============================================================================

def install_tools():
    """Install aria2 (via apt-get) and ratarmount (via pip).

    Each installer runs with ``check=True``, so a non-zero exit status raises
    ``subprocess.CalledProcessError``.
    """
    print("üõ†Ô∏è Installing aria2 and ratarmount...")
    install_commands = (
        # -qq suppresses the wall of text
        ["apt-get", "install", "-y", "-qq", "aria2"],
        ["pip", "install", "-q", "ratarmount"],
    )
    for argv in install_commands:
        subprocess.run(argv, check=True)

def get_token():
    """Authenticate the Colab user and return a refreshed access token string.

    The token is what gets passed to aria2c as a Bearer token for the Drive
    API download.
    """
    print("üîë Authenticating (to bypass Quota limits)...")
    auth.authenticate_user()
    # google.auth.default() yields (credentials, project); only the
    # credentials are needed, and they must be refreshed to carry a token.
    default_creds = google.auth.default()[0]
    default_creds.refresh(Request())
    return default_creds.token

def download_fast(token):
    """Download the Drive file ``FILE_ID`` to ``LOCAL_FILE`` using aria2c.

    A finished archive is left alone. An unfinished one - recognised by the
    ``<file>.aria2`` control file that aria2c keeps beside a download in
    progress - is handed back to aria2c, which resumes it, instead of being
    mistaken for a complete file.

    Args:
        token: OAuth access token sent as an ``Authorization: Bearer`` header.

    Raises:
        RuntimeError: if aria2c exits with a non-zero status.
    """
    control_file = LOCAL_FILE + ".aria2"
    if os.path.exists(LOCAL_FILE) and not os.path.exists(control_file):
        print(f"‚úÖ File already exists at {LOCAL_FILE}. Skipping download.")
        return

    print("\nüöÄ STARTING MULTI-THREADED DOWNLOAD (16x Streams)...")
    print("   This uses aria2c with your auth token. Max speed.")

    # URL for Drive API download
    url = f"https://www.googleapis.com/drive/v3/files/{FILE_ID}?alt=media"

    # aria2c command:
    # -x 16: 16 connections
    # -s 16: Split file into 16 parts
    # -j 16: Max concurrent downloads
    # --header: Pass the OAuth token
    # NOTE(review): the token is part of the command line and therefore
    # visible to other processes on this machine - acceptable on a
    # single-user Colab VM, confirm before reusing this elsewhere.
    cmd = [
        "aria2c",
        "-x", "16",
        "-s", "16",
        "-j", "16",
        "--file-allocation=none",
        "--summary-interval=10",
        "--header", f"Authorization: Bearer {token}",
        "-o", os.path.basename(LOCAL_FILE),
        "-d", os.path.dirname(LOCAL_FILE),
        url
    ]

    # stdout/stderr are inherited, so aria2c's progress streams to the console.
    completed = subprocess.run(cmd)

    if completed.returncode != 0:
        raise RuntimeError(
            f"Download failed (aria2c exit code {completed.returncode})! "
            "Check quota or network."
        )
    print("\n‚úÖ Download Complete.")

def mount_archive():
    """Mount ``LOCAL_FILE`` at ``MOUNT_POINT`` with ratarmount and report the result.

    Any previous mount at ``MOUNT_POINT`` is released first. Success is judged
    by whether the mount point lists at least one entry afterwards; a failed
    mount is reported on the console rather than raised.
    """
    print(f"\nüîó Mounting {LOCAL_FILE} to {MOUNT_POINT}...")

    # Clean up previous mount
    subprocess.run(["fusermount", "-u", MOUNT_POINT], stderr=subprocess.DEVNULL)
    os.makedirs(MOUNT_POINT, exist_ok=True)

    # Mount with 4 threads for read speed
    index_file = LOCAL_FILE + ".index.sqlite"
    # An argv list avoids building a quoted shell string from the paths.
    mount_cmd = [
        "ratarmount", "-P", "4",
        "--index-file", index_file,
        LOCAL_FILE, MOUNT_POINT,
    ]
    try:
        mount_status = subprocess.run(mount_cmd).returncode
    except OSError as e:
        # e.g. ratarmount is not installed; the check below reports the failure.
        print(f"   Could not run ratarmount: {e}")
    else:
        if mount_status != 0:
            print(f"   ratarmount exited with status {mount_status}")

    # Verify
    visible = os.listdir(MOUNT_POINT)
    if visible:
        print(f"üéâ SUCCESS! {len(visible)} items visible.")
        print(f"üëâ Data is ready at: {MOUNT_POINT}")
    else:
        print("‚ùå Mount failed. Folder is empty.")

# --- EXECUTION ---
# Runs the pipeline end to end: unmount Drive, install tools, authenticate,
# download the archive, mount it, then show disk usage.
try:
    # 1. Clear any existing Drive mounts
    # NOTE: after this step /content/drive is no longer backed by Drive;
    # anything that later writes there must mount Drive again first.
    try:
        from google.colab import drive
        drive.flush_and_unmount()
    except Exception as unmount_error:
        # Best effort: a Drive that was never mounted is not a problem.
        print(f"   (Drive unmount skipped: {unmount_error})")

    install_tools()

    # 2. Get Token & Download
    token = get_token()
    download_fast(token)

    # 3. Mount
    mount_archive()

    # 4. Final Space Check
    print("\nüìä Storage Status:")
    subprocess.run(["df", "-h", "/content"])

except Exception as e:
    print(f"\n‚ùå CRITICAL ERROR: {e}")
    # Re-raise so the cell is marked as failed and "Run all" stops here
    # instead of continuing to training without data.
    raise

In [None]:
# Cell 5: Verify Dataset Structure
import os, pickle

ANN_DIR = "/content/yowo/data/ActionGenome/annotations"
FRAMES_DIR = "/content/yowo/data/ActionGenome/frames"

print("=" * 60)
print("üîç Dataset Verification")
print("=" * 60)

# Check required files
required_files = {
    'person_bbox.pkl': 'Person bounding boxes + keyframes',
    'object_bbox_and_relationship.pkl': 'Objects + relationships',
    'Charades_v1_train.csv': 'Training action labels',
    'Charades_v1_test.csv': 'Test action labels',
    'Charades_v1_classes.txt': '157 action classes',
    'object_classes.txt': '36 object classes',
    'relationship_classes.txt': '26 relationship classes',
    'video_fps.json': 'FPS for each video'
}

print("\nüìã Required Annotation Files:")
all_ok = True
for f, desc in required_files.items():
    path = os.path.join(ANN_DIR, f)
    if os.path.exists(path):
        size = os.path.getsize(path) / 1e6
        print(f"   ‚úÖ {f} ({size:.1f} MB) - {desc}")
    else:
        print(f"   ‚ùå {f} - MISSING! ({desc})")
        all_ok = False

# Check frames
print("\nüìÇ Frames Directory:")
if os.path.isdir(FRAMES_DIR):
    # List once; an empty directory (e.g. a mount point whose mount failed)
    # must be reported as an error rather than crash on entries[0].
    frame_entries = os.listdir(FRAMES_DIR)
    num_videos = len(frame_entries)
    if frame_entries:
        print(f"   ‚úÖ {num_videos} video directories")
        # Sample a video
        sample_vid = frame_entries[0]
        sample_path = os.path.join(FRAMES_DIR, sample_vid)
        if os.path.isdir(sample_path):
            sample_frames = len(os.listdir(sample_path))
            print(f"   üìÅ Sample: {sample_vid} has {sample_frames} frames")
        else:
            print(f"   NOTE: sample entry {sample_vid} is not a directory")
    else:
        print("   ERROR: Frames directory is empty (archive not mounted or extracted?)")
        all_ok = False
else:
    print("   ‚ùå Frames directory missing!")
    all_ok = False

# Verify PKL files are valid
# NOTE(review): pickle.load can execute arbitrary code while loading; these
# files come from an external download, so only use a source you trust.
print("\nüî¨ Validating PKL Files:")
try:
    with open(os.path.join(ANN_DIR, 'person_bbox.pkl'), 'rb') as f:
        person_data = pickle.load(f)
    print(f"   ‚úÖ person_bbox.pkl: {len(person_data)} keyframes")

    with open(os.path.join(ANN_DIR, 'object_bbox_and_relationship.pkl'), 'rb') as f:
        obj_data = pickle.load(f)
    print(f"   ‚úÖ object_bbox_and_relationship.pkl: {len(obj_data)} entries")
except Exception as e:
    # Missing, truncated, or otherwise unreadable files all end up here.
    print(f"   ‚ùå Error reading PKL files: {e}")
    all_ok = False

if all_ok:
    print("\n" + "=" * 60)
    print("‚úÖ DATASET READY FOR TRAINING!")
    print("=" * 60)
else:
    print("\n" + "=" * 60)
    print("‚ö†Ô∏è DATASET INCOMPLETE - Check errors above")
    print("=" * 60)


In [None]:
# Cell 6: Understanding Training Output
# The training script reports progress on the console every 10 iterations.
# A multi-task log entry looks like this:
#
# [Epoch: 1/10][Iter: 100/288782][lr: 0.0001]
# [loss_conf: 8.64][loss_obj: 3.56][loss_act: 24.52][loss_rel: 17.26]
# [loss_interact: 4.76][loss_box: 1.04][losses: 63.97][time: 4.71]

rule = "=" * 70
print(
    rule,
    "üìä MULTI-TASK TRAINING OUTPUT GUIDE",
    rule,
    """
The multi-task model outputs 6 separate losses:

| Loss | Description | Typical Start | Target |
|------|-------------|---------------|--------|
| loss_conf | Confidence/objectness | ~10-15 | ~3-5 |
| loss_obj | Object classification (36 classes) | ~3-4 | ~1-2 |
| loss_act | Action classification (157 classes) | ~15-25 | ~5-10 |
| loss_rel | Relationship classification (26 classes) | ~15-20 | ~3-5 |
| loss_interact | Interaction detection | ~4-5 | ~1-2 |
| loss_box | Bounding box regression | ~0.5-1.0 | ~0.3-0.5 |
| losses | TOTAL (sum of above) | ~50-70 | ~15-25 |

Note: loss_act may be 0.00 for some batches - this is NORMAL!
      Actions only apply to Person boxes, and some frames have no person.

Training time estimates (with AMP):
- T4 (Colab): bs=8, ~0.4 sec/iter, ~4 hours/epoch
- L4: bs=12, ~0.25 sec/iter, ~2.5 hours/epoch
- A100-40GB: bs=32, ~0.1 sec/iter, ~50 min/epoch
- A100-80GB: bs=64, ~0.05 sec/iter, ~30 min/epoch
""",
    rule,
    sep="\n",
)


## 🚀 Ready to Train!

**Model Architecture: `yowo_v2_x3d_m_yolo11m_multitask`**

| Component | Description |
|-----------|-------------|
| 2D Backbone | YOLO11m (pretrained on COCO) |
| 3D Backbone | X3D-M (pretrained on Kinetics-400) |
| Object Head | 36 classes (person + 35 objects) |
| Action Head | 157 Charades action classes |
| Relation Head | 26 relationship classes |
| Interaction Head | Binary (is object interacted with?) |

**Dataset: Charades + Action Genome**
- 288,782 annotated keyframes
- 9,601 videos
- Multi-task: Objects + Actions + Relationships

**Note:** Model checkpoints are saved after each epoch to `/content/yowo/weights/charades_ag/yowo_v2_x3d_m_yolo11m_multitask/`.


In [None]:
# Cell 8: üöÄ TRAIN! (Main training cell)
# Batch size and accumulation are auto-configured from Cell 1
# AMP (Automatic Mixed Precision) enabled for ~1.5-2x faster training!
#
# Requires BATCH_SIZE and ACCUMULATE to exist already (run Cell 1 first);
# this cell builds the train.py command line, prints it, and executes it.

import os
# train.py is invoked by relative path, so run from the repository root.
os.chdir('/content/yowo')

# Training configuration
MAX_EPOCHS = 10          # Number of epochs (1 epoch = 288,782 iterations at bs=1)
LEARNING_RATE = 0.0001   # Base learning rate
LEN_CLIP = 16            # Number of frames per clip (temporal window)
NUM_WORKERS = 2          # DataLoader workers

# Build command with auto-configured batch size + AMP
# The trailing backslashes are line continuations inside the string literal,
# so `cmd` ends up as one long line with no embedded newlines.
cmd = f"""python train.py \
    -d charades_ag \
    -v yowo_v2_x3d_m_yolo11m_multitask \
    --cuda \
    --amp \
    -bs {BATCH_SIZE} \
    -accu {ACCUMULATE} \
    --max_epoch {MAX_EPOCHS} \
    --root /content/yowo/data \
    -K {LEN_CLIP} \
    -lr {LEARNING_RATE} \
    --num_workers {NUM_WORKERS} \
    --save_folder /content/yowo/weights"""

# Echo the effective settings so the run is self-describing in the cell output.
print("=" * 70)
print("üöÄ STARTING TRAINING")
print("=" * 70)
print(f"üì¶ Batch size: {BATCH_SIZE} (effective: {BATCH_SIZE*ACCUMULATE})")
print(f"üìä Epochs: {MAX_EPOCHS}")
print(f"üìà Learning rate: {LEARNING_RATE}")
print(f"üé¨ Clip length: {LEN_CLIP} frames")
print(f"‚ö° AMP: Enabled")
print(f"\nüìã Full command:\n{cmd}\n")
print("=" * 70 + "\n")

# IPython shell escape: the value of `cmd` is substituted and run in a shell.
!{cmd}


In [None]:
# Cell 9: Save Weights to Google Drive (after training)
import shutil, os

# The frames-download cell calls drive.flush_and_unmount(). Without a remount,
# the path below would be an ordinary local directory and the "saved" weights
# would be lost together with the runtime.
DRIVE_MOUNT_POINT = "/content/drive"
if not os.path.ismount(DRIVE_MOUNT_POINT):
    from google.colab import drive
    drive.mount(DRIVE_MOUNT_POINT)

DRIVE_SAVE_PATH = "/content/drive/MyDrive/yooowo/weights"
os.makedirs(DRIVE_SAVE_PATH, exist_ok=True)

weights_dir = "/content/yowo/weights/charades_ag/yowo_v2_x3d_m_yolo11m_multitask"
# Treat a missing directory and a directory without .pth files the same way,
# so the cell never finishes silently when there was nothing to copy.
checkpoints = []
if os.path.isdir(weights_dir):
    checkpoints = sorted(w for w in os.listdir(weights_dir) if w.endswith('.pth'))

for w in checkpoints:
    shutil.copy2(os.path.join(weights_dir, w), os.path.join(DRIVE_SAVE_PATH, w))
    print(f"‚úÖ Saved {w} to Drive")

if not checkpoints:
    print("‚ö†Ô∏è No weights found yet")


## 🧪 Optional: Quick 1-Epoch Test

Run this first to verify everything works before full training:


In [None]:
# Quick test - run ~100 iterations to verify everything works
# Uses small batch to ensure it fits, includes AMP
# Uncomment the line below to run:

# !python train.py -d charades_ag -v yowo_v2_x3d_m_yolo11m_multitask --cuda --amp -bs 4 --max_epoch 1 --root /content/yowo/data -K 16 --num_workers 2 2>&1 | head -80

# If it works, you should see losses decreasing every 10 iterations.
# Then run Cell 8 for full training.


## 📈 Resume Training from Checkpoint


In [None]:
# Resume from checkpoint (uncomment and modify path)
# CHECKPOINT = "/content/yowo/weights/charades_ag/yowo_v2_x3d_m_yolo11m_multitask/yowo_v2_x3d_m_yolo11m_multitask_epoch_5.pth"
# !python train.py -d charades_ag -v yowo_v2_x3d_m_yolo11m_multitask --cuda -bs {BATCH_SIZE} -accu {ACCUMULATE} --max_epoch 20 --root /content/yowo/data -K 16 -r {CHECKPOINT} --eval


## 🔧 Troubleshooting

| Problem | Solution |
|---------|----------|
| **OOM Error** | Halve `BATCH_SIZE` and double `ACCUMULATE` (this keeps the effective batch size the same) |
| **Training slow** | Increase batch size if GPU memory allows. L4/A100 can go higher. |
| **Loss not decreasing** | Try lr=0.0005 (higher) or lr=0.00005 (lower) |
| **`loss is NAN !!`** | Reduce learning rate to 0.00005, or check for bad data samples |
| **Loss stuck high** | Verify dataset extracted correctly, check annotations |
| **loss_act = 0.00** | This is NORMAL - some frames have no person, so no action loss |

## 📁 Output Files

After training:
- **Weights**: `/content/yowo/weights/charades_ag/yowo_v2_x3d_m_yolo11m_multitask/`
- **Checkpoints**: `yowo_v2_x3d_m_yolo11m_multitask_epoch_N.pth`

**⚠️ IMPORTANT:** Run Cell 9 to copy weights to Google Drive before the runtime disconnects!
