# 🚀 Agent Fleet — Colab GPU Runner with Guardian

**Coordinated GPU execution for the Autonomous Research Agent Fleet**

This notebook:
1. Syncs code from Google Drive (pushed by Implementer)
2. Runs Guardian pre-flight validation
3. Trains model on GPU
4. Syncs results back to Drive (for Watchdog audit)
5. Writes completion marker for Orchestrator

**Do not edit manually** — managed by agent fleet

## Step 1: Mount Drive & Read Configuration

In [None]:
# Mount Google Drive
from google.colab import drive
import os
import json
import shutil
from datetime import datetime

# Mount Google Drive at /content/drive so the project folder is reachable
# through the local filesystem.
drive.mount('/content/drive')
print("‚úÖ Google Drive mounted")

# Configuration from agent fleet
DRIVE_PROJECT_ROOT = "research-fleet"  # Must match colab_sync.sh config
DRIVE_BASE = f"/content/drive/MyDrive/{DRIVE_PROJECT_ROOT}"
LOCAL_WORKSPACE = "/content/workspace"

# Validate with an explicit exception instead of `assert`: assertions are
# removed when Python runs with -O, which would let the notebook continue and
# fail later with a far less helpful error. A directory check is used because
# every later cell joins sub-paths onto DRIVE_BASE.
if not os.path.isdir(DRIVE_BASE):
    raise FileNotFoundError(
        f"‚ùå Drive path not found: {DRIVE_BASE}\nRun: ./scripts/colab_sync.sh push"
    )
print(f"‚úÖ Project root: {DRIVE_BASE}")

## Step 2: Sync from Drive → Colab Local

In [None]:
# Create local workspace
os.makedirs(LOCAL_WORKSPACE, exist_ok=True)

# Read iteration number from orchestrator state
state_file = os.path.join(DRIVE_BASE, "logs", "orchestrator_state.json")
if os.path.exists(state_file):
    # JSON text is read as UTF-8 explicitly rather than relying on the
    # platform's default encoding.
    with open(state_file, encoding="utf-8") as f:
        state = json.load(f)
    # Coerce to int: the value is later formatted with ":03d", which raises
    # for strings ("3") and floats (3.0) that a JSON writer may emit.
    ITERATION = int(state.get("iteration", 1))
    print(f"‚úÖ Read iteration from orchestrator: {ITERATION}")
else:
    ITERATION = 1
    print(f"‚ö†Ô∏è No orchestrator_state.json, using default iteration: {ITERATION}")

# Sync src/ from Drive
src_drive = os.path.join(DRIVE_BASE, "src")
src_local = os.path.join(LOCAL_WORKSPACE, "src")
# Verify the Drive copy exists BEFORE deleting the local one; otherwise a
# missing src/ on Drive would wipe the local workspace and then fail anyway.
if not os.path.isdir(src_drive):
    raise FileNotFoundError(
        f"src/ not found on Drive: {src_drive}\nRun: ./scripts/colab_sync.sh push"
    )
if os.path.exists(src_local):
    shutil.rmtree(src_local)
shutil.copytree(src_drive, src_local)
print(f"‚úÖ Synced src/ ({len(os.listdir(src_local))} files)")

# Sync baselines/ if present (optional: silently skipped when absent on Drive)
baselines_drive = os.path.join(DRIVE_BASE, "baselines")
baselines_local = os.path.join(LOCAL_WORKSPACE, "baselines")
if os.path.exists(baselines_drive):
    if os.path.exists(baselines_local):
        shutil.rmtree(baselines_local)
    shutil.copytree(baselines_drive, baselines_local)
    print("‚úÖ Synced baselines/")

# Create results directory for this iteration (zero-padded to three digits),
# together with its checkpoints/ sub-directory.
results_dir = os.path.join(LOCAL_WORKSPACE, "results", f"iteration_{ITERATION:03d}")
os.makedirs(results_dir, exist_ok=True)
os.makedirs(os.path.join(results_dir, "checkpoints"), exist_ok=True)
print(f"‚úÖ Results dir: {results_dir}")

## Step 3: Check GPU & Install Dependencies

In [None]:
# Verify that a CUDA device is visible to this runtime, then install the
# project's Python requirements from the synced src/ directory.
import torch

print("="*80)
print("GPU VERIFICATION")
print("="*80)

if torch.cuda.is_available():
    # Report device 0 only.
    gpu_name = torch.cuda.get_device_name(0)
    # Divided by 1e9 for display in GB (presumably total_memory is in bytes —
    # confirm against the PyTorch docs).
    gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"‚úÖ GPU: {gpu_name} ({gpu_mem:.1f} GB)")
else:
    # Abort the run: the remaining cells are not executed without a GPU.
    print("‚ùå NO GPU! Runtime ‚Üí Change runtime type ‚Üí GPU")
    raise RuntimeError("GPU required")

# Install requirements
req_file = os.path.join(src_local, "requirements.txt")
if os.path.exists(req_file):
    # The next line is an IPython shell escape, not plain Python; this cell
    # only runs inside a notebook kernel.
    # NOTE(review): the success message below is printed without checking
    # pip's exit status — confirm whether a failed install should abort.
    # NOTE(review): torch is already imported at this point; if
    # requirements.txt changes the torch version, a runtime restart is
    # presumably needed — confirm.
    !pip install -q -r {req_file}
    print("‚úÖ Dependencies installed")

## Step 4: üõ°Ô∏è RUN GUARDIAN VALIDATION (Watchdog Gate #1)

In [None]:
import subprocess
import sys

# Pre-flight gate: run guardian.py from the synced sources as a child process
# and stop the notebook if it exits with a non-zero status.
print(
    "\n" + "=" * 80,
    "GUARDIAN VALIDATION (Watchdog Pre-flight Gate)",
    "=" * 80,
    sep="\n",
)

# The child inherits this process's environment, with PYTHONPATH replaced by
# the local src directory.
guardian_env = dict(os.environ)
guardian_env["PYTHONPATH"] = src_local

result = subprocess.run(
    ["python3", "workspace/src/guardian.py"],
    cwd="/content",
    capture_output=True,
    text=True,
    env=guardian_env,
)

print(result.stdout)

if result.returncode == 0:
    print("\n‚úÖ GUARDIAN PASSED ‚Äî Ready for training")
else:
    print("\n‚ùå GUARDIAN FAILED")
    print(result.stderr)
    # Still write failure marker so orchestrator knows; only the first 500
    # characters of stderr are stored in it.
    failure_marker = dict(
        iteration=ITERATION,
        status="guardian_failed",
        error=result.stderr[:500],
        timestamp=datetime.now().isoformat(),
    )
    results_drive = os.path.join(DRIVE_BASE, "results", f"iteration_{ITERATION:03d}")
    os.makedirs(results_drive, exist_ok=True)
    failure_path = os.path.join(results_drive, "_colab_guardian_failed.json")
    with open(failure_path, "w") as f:
        json.dump(failure_marker, f, indent=2)
    raise RuntimeError("Guardian validation failed. See output above.")

## Step 5: Environment Setup

In [None]:
# Put the synced sources first on the import path of this kernel.
sys.path.insert(0, src_local)

# Publish the run settings as environment variables (all values are strings).
os.environ.update(
    {
        "RESULTS_DIR": results_dir,
        "CHECKPOINT_DIR": os.path.join(results_dir, "checkpoints"),
        "ITERATION": str(ITERATION),
        "PYTHONPATH": src_local,
    }
)

# Echo the effective settings for the notebook log.
print(
    "‚úÖ Colab environment ready",
    f"   Iteration:     {ITERATION}",
    f"   Results dir:   {results_dir}",
    f"   Local src:     {src_local}",
    sep="\n",
)

## Step 6: Import & Configure Training

In [None]:
from config import DEVICE, NUM_EPOCHS, BATCH_SIZE, LAMBDA_CONSISTENCY
from model import create_encoder
from data import create_data_loaders
from train import Trainer
from evaluate import Evaluator

# Echo the values imported from config so they appear in the notebook log.
print("=" * 80, "TRAINING CONFIGURATION", "=" * 80, sep="\n")
print(
    f"Device:             {DEVICE}",
    f"Num Epochs:         {NUM_EPOCHS}",
    f"Batch Size:         {BATCH_SIZE}",
    f"Lambda Consistency: {LAMBDA_CONSISTENCY}",
    sep="\n",
)
print("\n‚úÖ All imports successful!")

## Step 7: Run Training

In [None]:
# Four-stage run: load data, build the model, train, evaluate on the test set.
print("\n" + "=" * 80, f"TRAINING ‚Äî Iteration {ITERATION}", "=" * 80, sep="\n")

# Stage 1 — data loaders
print("\n[1/4] Loading data...")
train_loader, val_loader, test_loader = create_data_loaders(
    dataset_name="synthetic",  # Will implement BETA loading later
    batch_size=BATCH_SIZE,
)

# Stage 2 — model
print("[2/4] Creating model...")
model = create_encoder(encoder_type="cnn")

# Stage 3 — training; checkpoints go under results_dir/checkpoints and logs
# are directed at results_dir itself.
print(f"[3/4] Training on {DEVICE} ({NUM_EPOCHS} epochs)...")
trainer_kwargs = dict(
    model=model,
    train_loader=train_loader,
    val_loader=val_loader,
    test_loader=test_loader,
    lambda_consistency=LAMBDA_CONSISTENCY,
    num_epochs=NUM_EPOCHS,
    device=DEVICE,
    checkpoint_dir=os.path.join(results_dir, "checkpoints"),
    log_dir=results_dir,
)
trainer = Trainer(**trainer_kwargs)

history = trainer.train()

# Stage 4 — evaluation
print("\n[4/4] Evaluating...")
evaluator = Evaluator(model, test_loader, device=DEVICE)
metrics = evaluator.evaluate()

# Headline numbers; `history` and `metrics` are indexed by the keys below.
print("\n" + "=" * 80, "TRAINING COMPLETE", "=" * 80, sep="\n")
print(f"Best Val Accuracy:  {history['best_val_accuracy']:.4f}")
print(f"Test Accuracy:      {metrics['accuracy']:.4f}")
print(f"Test F1 Score:      {metrics['f1_score']:.4f}")
print(f"Test ITR:           {metrics['itr']:.2f} bits/min")

## Step 8: Save Results (Watchdog Input)

In [None]:
# Persist the run's results and the raw training history as JSON files in the
# local results directory.
print("\n" + "=" * 80, "SAVING RESULTS", "=" * 80, sep="\n")

# Standardized results format for watchdog audit
run_timestamp = datetime.now().isoformat()
if torch.cuda.is_available():
    gpu_label = torch.cuda.get_device_name(0)
else:
    gpu_label = "none"

training_section = {
    "best_val_accuracy": float(history["best_val_accuracy"]),
    "best_epoch": int(history["best_epoch"]),
    "num_epochs": NUM_EPOCHS,
    "batch_size": BATCH_SIZE,
    "lambda_consistency": LAMBDA_CONSISTENCY,
}

# The first three metrics are required (KeyError if missing); the distance and
# ratio metrics fall back to 0 when absent.
test_section = {
    "accuracy": float(metrics["accuracy"]),
    "f1_score": float(metrics["f1_score"]),
    "itr": float(metrics["itr"]),
    **{
        optional_key: float(metrics.get(optional_key, 0))
        for optional_key in (
            "within_class_distance",
            "between_class_distance",
            "consistency_ratio",
        )
    },
}

results_data = {
    "iteration": ITERATION,
    "timestamp": run_timestamp,
    "gpu": gpu_label,
    "device": DEVICE,
    "training": training_section,
    "test_metrics": test_section,
}

# Save test results
test_results_file = os.path.join(results_dir, "test_results.json")
with open(test_results_file, "w") as out:
    json.dump(results_data, out, indent=2)
print(f"‚úÖ Test results: {test_results_file}")

# Also save training history (written as returned by the trainer, unconverted)
history_file = os.path.join(results_dir, "training_history.json")
with open(history_file, "w") as out:
    json.dump(history, out, indent=2)
print(f"‚úÖ Training history: {history_file}")

## Step 9: Sync Results Back to Drive

In [None]:
# Copy this iteration's outputs from the local workspace to the matching
# results folder on Drive.
results_drive = os.path.join(DRIVE_BASE, "results", f"iteration_{ITERATION:03d}")
os.makedirs(results_drive, exist_ok=True)

# Only top-level *.json files are copied; the checkpoints/ directory is not
# traversed, so large checkpoints stay local.
import glob
json_pattern = os.path.join(results_dir, "*.json")
for json_path in glob.glob(json_pattern):
    shutil.copy2(json_path, results_drive)
    print(f"‚úÖ Synced {os.path.basename(json_path)}")

# The single best checkpoint is copied as well, when one exists.
best_ckpt = os.path.join(results_dir, "checkpoints", "best_model.pt")
if os.path.exists(best_ckpt):
    ckpt_target = os.path.join(results_drive, "best_model.pt")
    shutil.copy2(best_ckpt, ckpt_target)
    print("‚úÖ Synced best_model.pt")

print(f"\nüìÇ Results synced to Drive: {results_drive}")

## Step 10: Write Completion Marker (for Orchestrator)

In [None]:
# Drop a small JSON marker into the Drive results folder to signal that the
# run finished, including the headline accuracy figures.
marker_file = os.path.join(results_drive, "_colab_complete.json")

completion_marker = dict(
    iteration=ITERATION,
    status="complete",
    gpu=torch.cuda.get_device_name(0) if torch.cuda.is_available() else "none",
    timestamp=datetime.now().isoformat(),
    best_val_accuracy=float(history["best_val_accuracy"]),
    test_accuracy=float(metrics["accuracy"]),
)

with open(marker_file, "w") as marker_out:
    json.dump(completion_marker, marker_out, indent=2)

print(
    f"\n‚úÖ Completion marker written: {marker_file}",
    "\nüéâ Colab run complete!",
    "   Orchestrator/Watchdog can now audit results.",
    "   Run locally: ./scripts/colab_sync.sh pull",
    sep="\n",
)

## Summary

In [None]:
# Final human-readable recap of the run: status lines, headline metrics and
# the follow-up steps to perform outside the notebook.
print("\n" + "=" * 80, "COLAB EXECUTION SUMMARY", "=" * 80, sep="\n")

summary_text = f"""
‚úÖ Guardian Validation:     PASSED
‚úÖ Training:                COMPLETE
‚úÖ Results Saved:           {results_dir}
‚úÖ Drive Sync:              COMPLETE
‚úÖ Completion Marker:       WRITTEN

üìä Final Metrics:
   Best Val Accuracy:  {history['best_val_accuracy']:.4f}
   Test Accuracy:      {metrics['accuracy']:.4f}
   Test F1 Score:      {metrics['f1_score']:.4f}
   Test ITR:           {metrics['itr']:.2f} bits/min

üîÑ Next Steps:
   1. Close this notebook
   2. Run locally: ./scripts/colab_sync.sh pull
   3. Orchestrator will run Watchdog audits
   4. Check: workspace/logs/watchdog_report_*.json

üìÇ All files on Drive: {results_drive}
"""
print(summary_text)
print("=" * 80)