# 🔬 Autonomous Research Fleet — Colab GPU Runner

This notebook is the **GPU compute layer** for the Research Agent Fleet.
It syncs code from Google Drive, runs training/evaluation, and writes results back.

**Do NOT edit this notebook manually** — the agent fleet manages `src/` and reads `results/`.

## How it works
1. Mount Google Drive
2. Sync `src/` from Drive to Colab local
3. Install dependencies
4. Run training or evaluation
5. Sync `results/` back to Drive
6. Agent fleet picks up results from Drive

## 0. Configuration
Set your Google Drive project path here.

In [None]:
# ------------------------------------------------------------
# CONFIGURATION: point this at your project folder on Drive
# ------------------------------------------------------------
# Folder name, relative to /content/drive/MyDrive/
DRIVE_PROJECT_ROOT = "research-fleet"

# Paths derived from the setting above (do not edit)
LOCAL_WORKSPACE = "/content/workspace"
DRIVE_BASE = "/content/drive/MyDrive/" + DRIVE_PROJECT_ROOT

# Which stages to execute: 'train', 'evaluate', or 'both'
RUN_MODE = "both"

# When True, the iteration number is read from orchestrator_state.json
AUTO_DETECT_ITERATION = True

## 1. Mount Google Drive

In [None]:
from google.colab import drive
drive.mount('/content/drive')

import os
assert os.path.exists(DRIVE_BASE), f"Drive path not found: {DRIVE_BASE}\nPlease run colab_sync.sh push first."
print(f"‚úÖ Drive mounted. Project root: {DRIVE_BASE}")
!ls {DRIVE_BASE}/

## 2. Sync from Drive → Colab Local

In [None]:
import json
import shutil


def _replace_tree(source, target):
    """Replace `target` with a fresh copy of the directory tree at `source`."""
    if os.path.exists(target):
        shutil.rmtree(target)
    shutil.copytree(source, target)


# Working area on the Colab VM
os.makedirs(LOCAL_WORKSPACE, exist_ok=True)

# src/ is always mirrored from Drive
src_drive = os.path.join(DRIVE_BASE, "src")
src_local = os.path.join(LOCAL_WORKSPACE, "src")
_replace_tree(src_drive, src_local)
print(f"‚úÖ Synced src/ ({len(os.listdir(src_local))} files)")

# baselines/ is mirrored only when it exists on Drive and is non-empty
baselines_drive = os.path.join(DRIVE_BASE, "baselines")
baselines_local = os.path.join(LOCAL_WORKSPACE, "baselines")
if os.path.exists(baselines_drive) and os.listdir(baselines_drive):
    _replace_tree(baselines_drive, baselines_local)
    print(f"‚úÖ Synced baselines/")

# Work out which iteration this run belongs to: taken from the orchestrator
# state file when auto-detection is on and the file exists, otherwise 1.
state_file = os.path.join(DRIVE_BASE, "logs", "orchestrator_state.json")
if not (AUTO_DETECT_ITERATION and os.path.exists(state_file)):
    ITERATION = 1
    print(f"‚ö†Ô∏è Using default iteration: {ITERATION}")
else:
    with open(state_file) as state_fh:
        state = json.load(state_fh)
    ITERATION = state.get("iteration", 1)
    print(f"‚úÖ Auto-detected iteration: {ITERATION}")

# Per-iteration output folder; creating checkpoints/ also creates its parents.
results_dir = os.path.join(LOCAL_WORKSPACE, "results", f"iteration_{ITERATION:03d}")
os.makedirs(os.path.join(results_dir, "checkpoints"), exist_ok=True)
print(f"‚úÖ Results dir: {results_dir}")

## 3. Install Dependencies

In [None]:
req_file = os.path.join(src_local, "requirements.txt")
if os.path.exists(req_file):
    !pip install -q -r {req_file}
    print("‚úÖ Dependencies installed")
else:
    print("‚ö†Ô∏è No requirements.txt found, using Colab defaults")

# Verify GPU
import torch
if torch.cuda.is_available():
    gpu_name = torch.cuda.get_device_name(0)
    gpu_mem = torch.cuda.get_device_properties(0).total_mem / 1e9
    print(f"‚úÖ GPU: {gpu_name} ({gpu_mem:.1f} GB)")
else:
    print("‚ùå No GPU! Go to Runtime ‚Üí Change runtime type ‚Üí GPU")

## 4. Environment Setup

In [None]:
import sys

# Put the synced src/ directory at the front of the import path.
sys.path.insert(0, src_local)

# Publish the run locations as environment variables; the commands launched
# with `!` in later cells inherit them (presumably read by train.py /
# evaluate.py -- confirm against those scripts).
os.environ.update({
    "RESULTS_DIR": results_dir,
    "CHECKPOINT_DIR": os.path.join(results_dir, "checkpoints"),
    "DATA_DIR": os.path.join(LOCAL_WORKSPACE, "baselines", "data"),
    "ITERATION": str(ITERATION),
})

# Summary of what was configured
print(f"‚úÖ Environment ready")
print(f"   src:        {src_local}")
print(f"   results:    {results_dir}")
print(f"   iteration:  {ITERATION}")

## 5. Run Training

In [None]:
if RUN_MODE in ("train", "both"):
    print("="*60)
    print(f"  TRAINING ‚Äî Iteration {ITERATION}")
    print("="*60)
    !cd {src_local} && python train.py
    print("\n‚úÖ Training complete")
else:
    print("‚è≠Ô∏è Skipping training (RUN_MODE={RUN_MODE})")

## 6. Run Evaluation

In [None]:
if RUN_MODE in ("evaluate", "both"):
    print("="*60)
    print(f"  EVALUATION ‚Äî Iteration {ITERATION}")
    print("="*60)
    !cd {src_local} && python evaluate.py
    print("\n‚úÖ Evaluation complete")
else:
    print("‚è≠Ô∏è Skipping evaluation (RUN_MODE={RUN_MODE})")

## 7. Run Baselines (if baseline scripts exist)

In [None]:
baseline_runner = os.path.join(baselines_local, "run_baselines.py") if os.path.exists(baselines_local) else None
if baseline_runner and os.path.exists(baseline_runner):
    print("="*60)
    print(f"  BASELINES ‚Äî Iteration {ITERATION}")
    print("="*60)
    !cd {baselines_local} && python run_baselines.py
    print("\n‚úÖ Baselines complete")
else:
    print("‚è≠Ô∏è No baseline runner found")

## 8. Sync Results Back to Drive

In [None]:
import glob

# Destination folder on Drive for this iteration's outputs
results_drive = os.path.join(DRIVE_BASE, "results", f"iteration_{ITERATION:03d}")
os.makedirs(results_drive, exist_ok=True)

# Copy the regular files sitting directly in results_dir, leaving out model
# weight files (.pt / .pth / .ckpt). Sub-directories are not descended into.
synced = 0
for candidate in glob.glob(os.path.join(results_dir, "*")):
    if not os.path.isfile(candidate) or candidate.endswith(('.pt', '.pth', '.ckpt')):
        continue
    shutil.copy2(candidate, results_drive)
    synced += 1

# Baseline outputs, when a local baselines/results folder exists, replace the
# copy on Drive wholesale.
baseline_results = None
if os.path.exists(baselines_local):
    baseline_results = os.path.join(baselines_local, "results")
if baseline_results is not None and os.path.exists(baseline_results):
    baseline_results_drive = os.path.join(DRIVE_BASE, "baselines", "results")
    if os.path.exists(baseline_results_drive):
        shutil.rmtree(baseline_results_drive)
    shutil.copytree(baseline_results, baseline_results_drive)
    print(f"‚úÖ Baseline results synced to Drive")

# Drop a small JSON marker next to the results describing this run.
gpu_label = torch.cuda.get_device_name(0) if torch.cuda.is_available() else "none"
marker = {
    "iteration": ITERATION,
    "status": "complete",
    "gpu": gpu_label,
    "files_synced": synced,
}
marker_path = os.path.join(results_drive, "_colab_complete.json")
with open(marker_path, "w") as marker_fh:
    marker_fh.write(json.dumps(marker, indent=2))

print(f"\n‚úÖ Results synced to Drive ({synced} files)")
print(f"   Path: {results_drive}")
print(f"\nüéâ Colab run complete! Agent fleet can now pull results.")

## 9. Quick Results Preview

In [None]:
# Print the metrics from test_results.json, if this run produced one.
test_results = os.path.join(results_dir, "test_results.json")
if not os.path.exists(test_results):
    print("‚ö†Ô∏è No test_results.json found")
    print("   Check training output above for errors.")
else:
    with open(test_results) as results_fh:
        results = json.load(results_fh)
    print("üìä Test Results:")
    for metric, value in results.items():
        # Floats are shown with four decimals; everything else is printed as-is.
        shown = f"{value:.4f}" if isinstance(value, float) else f"{value}"
        print(f"   {metric}: {shown}")