# 🎬 YOWO Multi-Task Training on Google Colab

**Model**: `yowo_v2_x3d_m_yolo11m_multitask`  
**Dataset**: Charades + Action Genome (288K keyframes, 219 classes)

### ✨ New Improvements (Dec 2024)
- **Soft-Argmax**: Differentiable position extraction for location-aware context
- **Action-Object Co-occurrence**: Learns which objects predict which actions
- **Learnable Temporal Attention**: Task-specific timestep weighting in X3D
- **Label Smoothing**: Optional regularization for rare classes
- **All Backbones Trainable**: YOLO11 + X3D fully fine-tuned (26.9M params)

### Optimized Batch Sizes (with AMP)

| GPU | VRAM | Batch | Accum | Effective | Est. Time/Epoch |
|-----|------|-------|-------|-----------|-----------------|
| T4 | 16GB | 8 | 4 | 32 | ~2.5 hours |
| L4 | 24GB | 14 | 4 | 56 | ~1.5 hours |
| V100 | 16GB | 10 | 4 | 40 | ~1.5 hours |
| A100 | 40GB | 28 | 2 | 56 | ~45 min |
| A100 | 80GB | 56 | 2 | 112 | ~25 min |
| H100 | 80GB | 80 | 2 | 160 | ~15 min |

**Features**: AMP (FP16), Multi-head (Objects + Actions + Relationships)

In [None]:
# Cell 1: Check GPU & Auto-Configure Batch Size
import torch
print("=" * 70)
print("üîç GPU Detection & Configuration")
print("=" * 70)

# The rest of the notebook trains with --cuda, so stop right away without a GPU.
if not torch.cuda.is_available():
    raise RuntimeError("‚ùå No GPU! Go to Runtime > Change runtime type > GPU")

gpu_name = torch.cuda.get_device_name(0)
gpu_memory_gb = torch.cuda.get_device_properties(0).total_memory / 1e9

print(f"‚úÖ GPU: {gpu_name}")
print(f"‚úÖ VRAM: {gpu_memory_gb:.1f} GB")

# =============================================================================
# BATCH-SIZE PRESETS FOR YOWO V2 + X3D-M + YOLO11m WITH AMP
# Each preset is (BATCH_SIZE, ACCUMULATE); effective batch = product of the two.
# The table is scanned in order and the first marker found in the GPU name wins.
# =============================================================================
_GPU_PRESETS = (
    ("H100", (80, 2)),   # Effective: 160 (can try 96)
    ("L4", (12, 4)),     # Effective: 48
    ("T4", (8, 4)),      # Effective: 32 (can try 10)
    ("V100", (10, 4)),   # Effective: 40
    ("P100", (6, 4)),    # Effective: 24
)

if "A100" in gpu_name.upper():
    # A100 ships in two memory sizes; the 45 GB threshold separates 80GB from 40GB.
    # 80GB -> effective 128 (can try 80 if stable); 40GB -> effective 64 (can try 40-48)
    BATCH_SIZE, ACCUMULATE = (64, 2) if gpu_memory_gb > 45 else (32, 2)
else:
    for _marker, _preset in _GPU_PRESETS:
        if _marker in gpu_name:
            BATCH_SIZE, ACCUMULATE = _preset
            break
    else:
        # Unknown GPU - fall back to conservative settings based on memory alone.
        if gpu_memory_gb >= 40:
            BATCH_SIZE, ACCUMULATE = 32, 2
        elif gpu_memory_gb >= 20:
            BATCH_SIZE, ACCUMULATE = 12, 4
        else:
            BATCH_SIZE, ACCUMULATE = 8, 4

effective = BATCH_SIZE * ACCUMULATE
print(f"\nüì¶ Optimized for {gpu_name}:")
print(f"   batch_size = {BATCH_SIZE}")
print(f"   accumulate = {ACCUMULATE}")
print(f"   effective_batch = {effective}")
print(f"\nüí° If OOM: reduce BATCH_SIZE by 2, increase ACCUMULATE proportionally")
print("=" * 70)

In [None]:
# Cell 3: Clone Repository & Install Dependencies
# Start from a clean checkout: any existing /content/yowo directory is deleted
# before the repository is cloned again.
%cd /content
!rm -rf yowo
!git clone https://github.com/michelsedgh/yowo.git
# Subsequent commands in this cell run from inside the fresh checkout.
%cd yowo
# -q keeps pip's output short.
!pip install -q torch torchvision opencv-python thop scipy matplotlib numpy imageio pytorchvideo ultralytics tensorboard
# NOTE(review): this message is printed unconditionally; the exit status of the
# git/pip commands above is not checked.
print("‚úÖ Repository cloned and dependencies installed!")

In [None]:
# Mount Google Drive under /content/drive using the google.colab helper.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Cell 4: Download Annotations & Extract Frames
import os, time, requests, zipfile

# Dataset layout inside the cloned repository.
DATA_ROOT = "/content/yowo/data/ActionGenome"
FRAMES_DIR = os.path.join(DATA_ROOT, "frames")
ANN_DIR = os.path.join(DATA_ROOT, "annotations")
# Path of the frames archive under the Drive mount point.
# NOTE(review): not referenced anywhere else in the visible part of this cell.
TAR_PATH = "/content/drive/MyDrive/yooowo/frames.tar"

# Make sure the annotation directory exists before anything is downloaded into it.
os.makedirs(ANN_DIR, exist_ok=True)

# =============================================================================
# STEP 1: Download Action Genome annotations (PKL files NOT in git repo!)
# =============================================================================
print("=" * 60)
print("üì• STEP 1: Downloading Action Genome Annotations")
print("=" * 60)

def download_file(url, filepath):
    """Download ``url`` to ``filepath`` unless the file is already present.

    The body is streamed into ``filepath + ".part"`` and only renamed to
    ``filepath`` once the transfer has finished, so an interrupted or failed
    download never leaves a truncated file that a later run would mistake
    for a complete one.

    Args:
        url: HTTP(S) address of the file.
        filepath: Destination path on the local filesystem.

    Returns:
        True if ``filepath`` already existed or was downloaded successfully,
        False otherwise (the reason is printed).
    """
    if os.path.exists(filepath):
        size = os.path.getsize(filepath) / 1e6
        print(f"   ‚úÖ {os.path.basename(filepath)} exists ({size:.1f} MB)")
        return True

    print(f"   Downloading {os.path.basename(filepath)}...")
    partial_path = filepath + ".part"
    try:
        # The context manager releases the streamed connection in every case.
        with requests.get(url, stream=True, timeout=120) as response:
            if response.status_code != 200:
                # Report the failure instead of returning False silently.
                raise RuntimeError(f"HTTP {response.status_code}")
            with open(partial_path, 'wb') as f:
                for chunk in response.iter_content(chunk_size=8192):
                    f.write(chunk)
        # Publish the file only after every chunk has been written.
        os.replace(partial_path, filepath)
        size = os.path.getsize(filepath) / 1e6
        print(f"   ‚úÖ Downloaded ({size:.1f} MB)")
        return True
    except Exception as e:
        # Best-effort boundary: callers only look at the boolean result.
        print(f"   ‚ùå Failed: {e}")
        if os.path.exists(partial_path):
            os.remove(partial_path)
    return False

# Action Genome annotation files (STAR Benchmark object storage), keyed by the
# filename each one is stored under inside ANN_DIR.
ag_files = {
    'object_bbox_and_relationship.pkl': 'https://star-benchmark.s3.us-east.cloud-object-storage.appdomain.cloud/Annotations/object_bbox_and_relationship.pkl',
    'person_bbox.pkl': 'https://star-benchmark.s3.us-east.cloud-object-storage.appdomain.cloud/Annotations/person_bbox.pkl',
    'classes.zip': 'https://star-benchmark.s3.us-east.cloud-object-storage.appdomain.cloud/Annotations/classes.zip'
}

for filename in ag_files:
    url = ag_files[filename]
    download_file(url, os.path.join(ANN_DIR, filename))

# Unpack classes.zip once: the presence of object_classes.txt is taken as
# the sign that an earlier run already extracted it.
classes_zip = os.path.join(ANN_DIR, 'classes.zip')
classes_marker = os.path.join(ANN_DIR, 'object_classes.txt')
if os.path.exists(classes_zip) and not os.path.exists(classes_marker):
    print("   Extracting classes.zip...")
    with zipfile.ZipFile(classes_zip, 'r') as z:
        z.extractall(ANN_DIR)
    # If the archive unpacked into a classes/ subdirectory, lift its contents
    # up into ANN_DIR and drop the now-empty folder.
    classes_subdir = os.path.join(ANN_DIR, 'classes')
    if os.path.exists(classes_subdir):
        import shutil
        for f in os.listdir(classes_subdir):
            shutil.move(os.path.join(classes_subdir, f), os.path.join(ANN_DIR, f))
        shutil.rmtree(classes_subdir)
    print("   ‚úÖ Extracted class files")


In [None]:
import os
import subprocess
import google.auth
from google.colab import auth
from google.auth.transport.requests import Request
from google.oauth2 import credentials

# ==============================================================================
# CONFIGURATION
# ==============================================================================
# 1. The main frames archive: Google Drive file ID and where it is stored locally.
TAR_FILE_ID = "1GuRdUMP5qrqyYN0gg8C2B6tLwJeigyFd"  
LOCAL_TAR = "/content/frames.tar"

# 2. The pre-made ratarmount index for that archive (downloading it saves
#    the time it would take to build one).
INDEX_FILE_ID = "1ecTAlWCWWSfSavneBwlALjhocl3LKXoa"
LOCAL_INDEX = "/content/frames.tar.index.sqlite"

# 3. Paths
# The raw tar is mounted here first ...
TEMP_MOUNT_POINT = "/content/raw_mount" 
# ... and the frames folder inside it is then symlinked to this location.
FINAL_TARGET_DIR = "/content/yowo/data/ActionGenome/frames"
# ==============================================================================

def install_tools():
    """Install aria2 (via apt-get) and ratarmount (via pip).

    Raises:
        subprocess.CalledProcessError: if either installer exits non-zero.
    """
    print("üõ†Ô∏è Installing aria2 and ratarmount...")
    install_commands = (
        ["apt-get", "install", "-y", "-qq", "aria2"],
        ["pip", "install", "-q", "ratarmount"],
    )
    for command in install_commands:
        subprocess.run(command, check=True)

def get_token():
    """Authenticate the Colab user and return a refreshed access token.

    Returns:
        The ``token`` attribute of the default Google credentials after
        they have been refreshed.
    """
    print("üîë Authenticating...")
    auth.authenticate_user()
    # google.auth.default() returns (credentials, project); only the first is needed.
    default_creds = google.auth.default()[0]
    default_creds.refresh(Request())
    return default_creds.token

def download_file(token, file_id, output_path):
    """Download a Google Drive file to ``output_path`` with aria2c.

    A file that already exists is reused, unless an aria2 control file
    (``output_path + ".aria2"``) sits next to it. aria2c keeps that control
    file for as long as a download is unfinished, so its presence means the
    existing file is only a partial download and aria2c must be run again.

    Args:
        token: OAuth access token sent as a Bearer ``Authorization`` header.
        file_id: Google Drive file ID.
        output_path: Destination path of the downloaded file.

    Raises:
        Exception: if aria2c exits with a non-zero status.
    """
    control_file = output_path + ".aria2"
    if os.path.exists(output_path) and not os.path.exists(control_file):
        print(f"‚úÖ Found existing file: {output_path}")
        return

    print(f"‚¨áÔ∏è Downloading {os.path.basename(output_path)}...")
    url = f"https://www.googleapis.com/drive/v3/files/{file_id}?alt=media"

    cmd = [
        "aria2c", "-x", "16", "-s", "16", "-j", "16",
        "--file-allocation=none", "--summary-interval=10",
        "--header", f"Authorization: Bearer {token}",
        "-o", os.path.basename(output_path),
        "-d", os.path.dirname(output_path),
        url
    ]

    # aria2c writes its progress straight to the cell output.
    completed = subprocess.run(cmd)

    if completed.returncode != 0:
        raise Exception(f"Failed to download {output_path}")

def mount_and_link():
    """Mount the frames archive with ratarmount and symlink its frames folder.

    Steps: unmount any previous mount at TEMP_MOUNT_POINT, clear an old
    link or empty directory at FINAL_TARGET_DIR, mount LOCAL_TAR (using
    LOCAL_INDEX) at TEMP_MOUNT_POINT, then point FINAL_TARGET_DIR at the
    ``data/ActionGenome/frames`` folder inside the mount.

    Raises:
        Exception: if FINAL_TARGET_DIR is a directory that cannot be removed
            (e.g. it is not empty), or if ratarmount exits non-zero.
        FileNotFoundError: if the expected frames folder is missing from the
            mounted archive.
    """
    print(f"\nüîó Mounting archive to temp location: {TEMP_MOUNT_POINT}")

    # 1. Cleanup (an unmount failure is expected when nothing was mounted)
    subprocess.run(["fusermount", "-u", TEMP_MOUNT_POINT], stderr=subprocess.DEVNULL)
    if os.path.islink(FINAL_TARGET_DIR):
        os.unlink(FINAL_TARGET_DIR)
    elif os.path.exists(FINAL_TARGET_DIR):
        # Only an empty directory can be swapped for a link. Fail here with a
        # clear message instead of letting os.symlink() fail after mounting.
        try:
            os.rmdir(FINAL_TARGET_DIR)
        except OSError as err:
            raise Exception(
                f"Cannot replace {FINAL_TARGET_DIR} with a symlink: {err}"
            ) from err

    os.makedirs(TEMP_MOUNT_POINT, exist_ok=True)

    # 2. Ratarmount using the downloaded index (passed explicitly).
    # An argument list avoids shell quoting problems with the paths.
    mount_cmd = [
        "ratarmount", "-P", "4",
        "--index-file", LOCAL_INDEX,
        LOCAL_TAR, TEMP_MOUNT_POINT,
    ]
    if subprocess.run(mount_cmd).returncode != 0:
        raise Exception("Ratarmount failed!")

    # 3. Locate the frames folder nested inside the archive
    nested_path = os.path.join(TEMP_MOUNT_POINT, "data/ActionGenome/frames")
    if not os.path.exists(nested_path):
        # Show what the archive actually contains, then stop: linking to a
        # path that does not exist would only create a dangling symlink.
        print(f"‚ö†Ô∏è Could not find expected path: {nested_path}")
        print(f"üìÇ Contents of root mount: {os.listdir(TEMP_MOUNT_POINT)}")
        raise FileNotFoundError(f"Frames folder not found in archive: {nested_path}")

    # 4. Create the final destination link (its parent directory must exist)
    parent_dir = os.path.dirname(FINAL_TARGET_DIR)
    os.makedirs(parent_dir, exist_ok=True)

    print(f"üîó Linking '{nested_path}' --> '{FINAL_TARGET_DIR}'")
    os.symlink(nested_path, FINAL_TARGET_DIR)

    # 5. Verify (list the target once and reuse the result)
    entries = os.listdir(FINAL_TARGET_DIR) if os.path.exists(FINAL_TARGET_DIR) else []
    if entries:
        count = len(entries)
        print(f"üéâ SUCCESS! {count} items visible at {FINAL_TARGET_DIR}")
    else:
        print("‚ùå Something went wrong. The target folder is empty.")

# --- EXECUTION ---
# Runs the whole pipeline. Any failure is reported as a single message and
# the cell itself does not raise.
try:
    # Best effort: flush and unmount Google Drive before downloading.
    try:
        from google.colab import drive
        drive.flush_and_unmount()
    except Exception:
        # Deliberately ignored so a failed unmount never blocks the download.
        # `Exception` (not a bare except) keeps Ctrl-C / kernel interrupts working.
        pass

    install_tools()
    token = get_token()

    # Download Tar AND Index
    download_file(token, TAR_FILE_ID, LOCAL_TAR)
    download_file(token, INDEX_FILE_ID, LOCAL_INDEX)

    mount_and_link()

except Exception as e:
    print(f"\n‚ùå CRITICAL ERROR: {e}")

In [None]:
# Cell 5: Verify Dataset Structure
import os, pickle

ANN_DIR = "/content/yowo/data/ActionGenome/annotations"
FRAMES_DIR = "/content/yowo/data/ActionGenome/frames"

print("=" * 60)
print("üîç Dataset Verification")
print("=" * 60)

# Annotation files expected in ANN_DIR, each mapped to a short description
# that is echoed in the report below.
required_files = {
    'person_bbox.pkl': 'Person bounding boxes + keyframes',
    'object_bbox_and_relationship.pkl': 'Objects + relationships',
    'Charades_v1_train.csv': 'Training action labels',
    'Charades_v1_test.csv': 'Test action labels',
    'Charades_v1_classes.txt': '157 action classes',
    'object_classes.txt': '36 object classes',
    'relationship_classes.txt': '26 relationship classes',
    'video_fps.json': 'FPS for each video'
}

print("\nüìã Required Annotation Files:")
# Stays True only while every check in this cell passes.
all_ok = True
for f, desc in required_files.items():
    path = os.path.join(ANN_DIR, f)
    if not os.path.exists(path):
        print(f"   ‚ùå {f} - MISSING! ({desc})")
        all_ok = False
        continue
    size = os.path.getsize(path) / 1e6
    print(f"   ‚úÖ {f} ({size:.1f} MB) - {desc}")

# Check frames: the directory must exist and contain at least one entry.
print(f"\nüìÇ Frames Directory:")
if os.path.isdir(FRAMES_DIR):
    # List once and reuse the result instead of scanning the directory twice.
    video_dirs = os.listdir(FRAMES_DIR)
    num_videos = len(video_dirs)
    if video_dirs:
        print(f"   ‚úÖ {num_videos} video directories")
        # Sample a video (skipped when the first entry is not a directory)
        sample_vid = video_dirs[0]
        sample_path = os.path.join(FRAMES_DIR, sample_vid)
        if os.path.isdir(sample_path):
            sample_frames = len(os.listdir(sample_path))
            print(f"   üìÅ Sample: {sample_vid} has {sample_frames} frames")
    else:
        # An empty directory previously crashed with IndexError on [0].
        print("   ‚ùå Frames directory is empty!")
        all_ok = False
else:
    print("   ‚ùå Frames directory missing!")
    all_ok = False

# Verify PKL files are valid by unpickling each one and reporting its length.
# SECURITY NOTE(review): pickle.load can execute arbitrary code while loading.
# These files are downloaded from a remote server by an earlier cell, so only
# run this against a source you trust.
print(f"\nüî¨ Validating PKL Files:")
try:
    with open(os.path.join(ANN_DIR, 'person_bbox.pkl'), 'rb') as f:
        person_data = pickle.load(f)
    # presumably one entry per keyframe, as the message says - TODO confirm
    print(f"   ‚úÖ person_bbox.pkl: {len(person_data)} keyframes")

    with open(os.path.join(ANN_DIR, 'object_bbox_and_relationship.pkl'), 'rb') as f:
        obj_data = pickle.load(f)
    print(f"   ‚úÖ object_bbox_and_relationship.pkl: {len(obj_data)} entries")
except Exception as e:
    # One handler covers both files: if the first load fails, the second
    # file is not checked at all.
    print(f"   ‚ùå Error reading PKL files: {e}")
    all_ok = False

# Final verdict: the banner lines are the same in both cases, only the
# middle line depends on whether every check above passed.
print("\n" + "=" * 60)
if all_ok:
    print("‚úÖ DATASET READY FOR TRAINING!")
else:
    print("‚ö†Ô∏è DATASET INCOMPLETE - Check errors above")
print("=" * 60)

## 🚀 Ready to Train!

**Model Architecture: `yowo_v2_x3d_m_yolo11m_multitask`**

| Component | Description |
|-----------|-------------|
| 2D Backbone | YOLO11m (pretrained on COCO, **TRAINABLE**) |
| 3D Backbone | X3D-M with **Learnable Temporal Attention** |
| Object Head | 36 classes (person + 35 objects) |
| Action Head | 157 Charades classes + **Action-Object Co-occurrence** |
| Relation Head | 26 relationship classes |
| Context | **Soft-Argmax** for differentiable positions |

**New Features (Dec 2024)**
- ✅ All 26.9M params trainable (backbones unfrozen by default)
- ✅ Soft-argmax for gradient flow through positions
- ✅ Action-Object co-occurrence learning
- ✅ Learnable temporal attention in X3D
- ✅ Optional label smoothing

**Dataset: Charades + Action Genome**
- 288,782 annotated keyframes
- 9,601 videos
- Multi-task: Objects + Actions + Relationships

**Note:** Model checkpoints saved after each epoch to `/content/yowo/weights/charades_ag/`

In [None]:
# =============================================================================
# üöÄ IMPROVED TRAINING CELL (Respects GPU Auto-Detection)
# =============================================================================
import os
# train.py is invoked by relative path below, so run from the repository root.
os.chdir('/content/yowo')

# CONFIGURABLE PARAMETERS
# -----------------------------------------------------------------------------
# Use auto-detected batch size from Cell 1 if it exists, otherwise use safe defaults.
# BATCH_SIZE / ACCUMULATE are only defined once the GPU-detection cell has run in
# this session; a NameError here means it has not.
try:
    current_bs = BATCH_SIZE
    current_acc = ACCUMULATE
    print(f"‚úÖ Using detected batch size: {current_bs} (accumulate={current_acc})")
except NameError:
    current_bs = 8
    current_acc = 4
    print(f"‚ö†Ô∏è GPU Detection not found, using conservative defaults: {current_bs}")

MAX_EPOCHS = 15           # passed as --max_epoch
LEN_CLIP = 16             # passed as -K
NUM_WORKERS = 4           # passed as --num_workers

# Learning Rate: Heads get this, Backbones get 1/10th of this (0.00003)
LEARNING_RATE = 0.0003    
# Space-separated epoch numbers, passed verbatim as --lr_epoch.
LR_DECAY_EPOCHS = "8 10 12"
LABEL_SMOOTHING = 0.05    

# Backbone Training: True adds the matching --freeze_backbone_* flag to the command.
FREEZE_2D = False         
FREEZE_3D = False         

# RESUME: Set to path or None (a non-empty value is passed as -r)
RESUME_CHECKPOINT = None

# BUILD COMMAND
# -----------------------------------------------------------------------------
# Each trailing backslash inside this (non-raw) triple-quoted f-string is a line
# continuation, so `cmd` ends up as one single-line shell command.
cmd = f"""python train.py \
    -d charades_ag \
    -v yowo_v2_x3d_m_yolo11m_multitask \
    --cuda \
    --amp \
    --eval \
    -bs {current_bs} \
    -accu {current_acc} \
    --max_epoch {MAX_EPOCHS} \
    --lr_epoch {LR_DECAY_EPOCHS} \
    --root /content/yowo/data \
    -K {LEN_CLIP} \
    -lr {LEARNING_RATE} \
    --label_smoothing {LABEL_SMOOTHING} \
    --num_workers {NUM_WORKERS} \
    --save_folder /content/yowo/weights"""

# Optional flags are appended only when the matching setting is enabled.
# NOTE(review): RESUME_CHECKPOINT is inserted unquoted; a path containing spaces
# would be split by the shell.
if FREEZE_2D: cmd += " --freeze_backbone_2d"
if FREEZE_3D: cmd += " --freeze_backbone_3d"
if RESUME_CHECKPOINT: cmd += f" -r {RESUME_CHECKPOINT}"

# Trainable params helper (YOLO11m is ~20M, X3D-M is ~3M, Heads are ~4M)
# These are hard-coded estimates for the summary only; nothing is measured
# from the model here.
trainable_m = 26.9
if FREEZE_2D: trainable_m -= 20.1
if FREEZE_3D: trainable_m -= 3.0

print("=" * 70)
print(f"üöÄ TRAINING CONFIGURATION")
print("=" * 70)
print(f"üì¶ Batch: {current_bs} √ó {current_acc} = {current_bs * current_acc} effective")
print(f"üìà LR: {LEARNING_RATE} (Heads) / {LEARNING_RATE*0.1:.6f} (Backbones)")
print(f"üß† Total Trainable Params: ~{trainable_m:.1f}M")
print(f"üéØ Features: Soft-Argmax, Co-occurrence, Temporal Attention")
print(f"\nüìã Full Command:\n{cmd}\n")
print("=" * 70 + "\n")

# Launch training through the notebook's shell escape.
!{cmd}