In [None]:
# Process ALL AssistantBench browser agent files
import os, json, zipfile, io, tempfile, re, base64
from typing import Iterator, Dict, Any, Optional, List, Tuple
from huggingface_hub import HfFileSystem
from tqdm import tqdm
import ijson
from cryptography.fernet import Fernet
from cryptography.hazmat.primitives import hashes
from cryptography.hazmat.primitives.kdf.pbkdf2 import PBKDF2HMAC

from pydantic import Field

from docent.data_models import AgentRun, Transcript
from docent.data_models.chat import ChatMessage, ToolCall, parse_chat_message
from docent import Docent

# Hugging Face repository (addressed under "datasets/" below) and the revision to read from.
REPO_ID  = "agent-evals/hal_traces"
REVISION = "main"

# Set token from environment variable
# Fail fast here when the token is absent rather than on the first repository access.
hf_token = os.getenv("HF_TOKEN")
if not hf_token:
    raise ValueError("HF_TOKEN environment variable is required")
# Writes back the value that was just read, so this does not change the process environment.
os.environ["HF_TOKEN"] = hf_token

In [5]:
# Helper functions from your working demo (FIXED to include all roles)
_CODE_FENCE_RE = re.compile(r"^\s*```[\w+-]*\n(.*?)\n```$", re.DOTALL)

def _canon_text(s: str) -> str:
    """Strip code fences, normalize whitespace so identical answers hash the same."""
    if not s:
        return ""
    s = s.strip()
    m = _CODE_FENCE_RE.match(s)
    if m:
        s = m.group(1)
    lines = [ln.rstrip() for ln in s.splitlines()]
    out, prev_blank = [], False
    for ln in lines:
        blank = (ln == "")
        if blank and prev_blank:
            continue
        out.append(ln)
        prev_blank = blank
    return "\n".join(out).strip()

def _tc_canon(tc):
    typ = (tc.get("type") or "function")
    fn  = tc.get("function")
    if isinstance(fn, dict):
        fn = fn.get("name")
    if fn is None:
        fn = ""
    args = tc.get("arguments", {})
    if isinstance(args, (dict, list)):
        args_canon = json.dumps(args, sort_keys=True, separators=(",", ":"), default=str)
    else:
        args_canon = str(args)
    return (typ, str(fn), args_canon)

def _msg_fingerprint(m: Dict[str, Any]) -> tuple:
    """Build the ``(role, canonical_content, canonical_tool_calls)`` key used for dedup.

    Tool calls that are not dicts are ignored, and the remaining ones are
    sorted so their order does not affect the fingerprint.
    """
    role = m.get("role")
    content = _canon_text(m.get("content") or "")
    canon_calls = [
        _tc_canon(call)
        for call in (m.get("tool_calls") or [])
        if isinstance(call, dict)
    ]
    canon_calls.sort()
    return (role, content, tuple(canon_calls))

def dedupe_messages(messages, mode: str = "consecutive"):
    """Return ``messages`` without duplicates, keeping first occurrences in order.

    With ``mode="consecutive"`` only a message whose fingerprint equals that of
    the immediately preceding message is dropped. Any other ``mode`` value
    drops every message whose fingerprint has been seen before.
    """
    consecutive = (mode == "consecutive")
    kept = []
    previous_fp = None
    seen_fps = set()
    for msg in messages:
        fp = _msg_fingerprint(msg)
        if consecutive:
            is_duplicate = (fp == previous_fp)
            previous_fp = fp
        else:
            is_duplicate = fp in seen_fps
            seen_fps.add(fp)
        if not is_duplicate:
            kept.append(msg)
    return kept

def _collapse_content_to_text(content) -> str:
    if isinstance(content, str):
        return content
    if isinstance(content, list):
        parts = []
        for seg in content:
            if isinstance(seg, dict) and isinstance(seg.get("text"), str):
                parts.append(seg["text"])
        return "\n".join(p for p in parts if p)
    return str(content)

def _map_role(raw_type, raw__type):
    t = (raw_type or raw__type or "").lower()
    if t in ("ai", "assistant"): return "assistant"
    if t in ("human", "user"):   return "user"
    if t == "system":            return "system"
    return None

def normalize_weave_log_item(item: Dict[str, Any]):
    """Normalize a log row that carries a single message under ``inputs.raw``.

    Returns ``None`` when ``inputs`` or ``inputs.raw`` is not a dict, or when
    the raw message type does not map to a known role. Otherwise returns a
    dict with ``role``, ``content`` (flattened to text), ``tool_calls`` and
    ``ts`` (``started_at``, falling back to ``created_timestamp``).

    Tool calls are flattened to ``{"id", "function", "arguments", "type"}``.
    The function name is taken from ``name``, then from ``function`` (either a
    plain string or a dict with ``name``); arguments from ``args``, then from
    ``function.arguments``.
    """
    # ``inputs`` may be present but null; guard like the model lookup elsewhere does.
    inputs = item.get("inputs")
    raw = inputs.get("raw") if isinstance(inputs, dict) else None
    if not isinstance(raw, dict):
        return None

    role = _map_role(raw.get("type"), raw.get("_type"))
    if role is None:
        return None

    tool_calls = []
    for tc in raw.get("tool_calls") or []:
        if not isinstance(tc, dict):
            continue
        fn = tc.get("function")
        # ``function`` may be a nested dict or already just the name.
        fn_dict = fn if isinstance(fn, dict) else {}
        fn_name = fn if isinstance(fn, str) else None
        tool_calls.append({
            "id": tc.get("id"),
            "function": tc.get("name") or fn_name or fn_dict.get("name"),
            "arguments": tc.get("args") or fn_dict.get("arguments", {}),
            "type": tc.get("type") or "function",
        })

    return {
        "role": role,
        "content": _collapse_content_to_text(raw.get("content")),
        "tool_calls": tool_calls,
        "ts": item.get("started_at") or item.get("created_timestamp"),
    }

def normalize_assistant_output(item: Dict[str, Any]):
    """Extract the assistant reply from an OpenAI-style ``output.choices`` payload.

    Only the first choice is considered. Returns ``None`` when there are no
    choices or the message has no (truthy) content; otherwise a normalized
    assistant message stamped with ``ended_at`` (or ``created_timestamp``).
    Tool calls on the output side are not carried over.
    """
    choices = (item.get("output") or {}).get("choices") or []
    if not choices:
        return None

    message = choices[0].get("message") or {}
    content = message.get("content")
    if not content:
        return None

    if isinstance(content, str):
        text = content
    else:
        text = _collapse_content_to_text(content)

    return {
        "role": "assistant",
        "content": text,
        "tool_calls": [],
        "ts": item.get("ended_at") or item.get("created_timestamp"),
    }

def _maybe_take_initial_system_msgs(item: Dict[str, Any]):
    """Capture SYSTEM messages from inputs.messages once per task."""
    msgs = item.get("inputs", {}).get("messages")
    out = []
    if isinstance(msgs, list):
        for m in msgs:
            if isinstance(m, dict) and m.get("role") == "system":
                out.append({
                    "role": "system",
                    "content": _collapse_content_to_text(m.get("content")),
                    "tool_calls": [],
                    "ts": item.get("started_at") or item.get("created_timestamp"),
                })
    return out

def _build_agent_run_from_bucket(tid: str, bucket, model, eval_blob=None):
    """Assemble the agent-run dict for one task from its collected messages.

    Messages without a role are dropped; messages with empty content are kept
    (content defaults to ``""``). The rest are ordered by ``ts`` (missing
    timestamps sort as ``""``), stripped of ``ts``, and consecutive duplicates
    are removed. When ``eval_blob`` is truthy an ``"eval"`` entry with
    ``reward`` and ``task`` (falling back to ``info``) is attached.
    """
    def _timestamp(message):
        return message.get("ts") or ""

    with_role = [message for message in bucket if message.get("role")]
    with_role.sort(key=_timestamp)

    # Rebuild each message without its timestamp; only attach tool calls when non-empty.
    cleaned = []
    for message in with_role:
        entry = {"role": message["role"], "content": message.get("content") or ""}
        calls = message.get("tool_calls")
        if calls:
            entry["tool_calls"] = calls
        cleaned.append(entry)

    agent_run = {
        "weave_task_id": tid,
        "model": model,
        "messages": cleaned,
    }
    if eval_blob:
        task_info = eval_blob.get("task", eval_blob.get("info", {})) or {}
        agent_run["eval"] = {
            "reward": eval_blob.get("reward"),
            "task": task_info,
        }

    agent_run["messages"] = dedupe_messages(cleaned)
    return agent_run

print("FIXED helper functions loaded - now preserves all message roles including empty content!")

FIXED helper functions loaded - now preserves all message roles including empty content!


In [6]:
# Decryption functions (FIXED for proper message ordering)
def _derive_key(password: str, salt: bytes) -> bytes:
    """Derive a 32-byte key from ``password`` and ``salt`` and return it urlsafe-base64 encoded.

    Uses PBKDF2-HMAC with SHA-256 and 480000 iterations.
    """
    raw_key = PBKDF2HMAC(
        algorithm=hashes.SHA256(),
        length=32,
        salt=salt,
        iterations=480000,
    ).derive(password.encode())
    return base64.urlsafe_b64encode(raw_key)

def decrypt_token_bytes(encrypted_data_b64: str, salt_b64: str, password: str = "hal1234") -> bytes:
    """Base64-decode the payload and salt, derive the Fernet key, and return the decrypted bytes."""
    ciphertext = base64.b64decode(encrypted_data_b64)
    salt_bytes = base64.b64decode(salt_b64)
    key = _derive_key(password, salt_bytes)
    return Fernet(key).decrypt(ciphertext)

def _decrypt_container_to_tempfile(container: Dict[str, Any]) -> str:
    """Decrypt ``container`` and write the plaintext to a new ``.json`` temp file.

    ``container`` must provide ``"encrypted_data"`` and ``"salt"``. The file is
    created with ``delete=False``; the caller owns it and must remove it.

    Returns:
        Path of the temp file holding the decrypted bytes.

    If writing fails (for example on a full disk) the partially written file
    is removed before the exception propagates, so no orphan is left behind.
    """
    plaintext = decrypt_token_bytes(container["encrypted_data"], container["salt"])
    tf = tempfile.NamedTemporaryFile(delete=False, suffix=".json")
    try:
        # The context manager flushes and closes the handle on both paths.
        with tf:
            tf.write(plaintext)
    except BaseException:
        try:
            os.remove(tf.name)
        except OSError:
            pass
        raise
    return tf.name

def extract_contextual_messages_from_item(item: Dict[str, Any]) -> List[Dict[str, Any]]:
    """Extract the user, system and assistant messages from ``inputs.messages``.

    Messages are returned in the order they appear in the log item, so the
    conversation turn order is preserved (all of them share the item's
    ``started_at`` / ``created_timestamp`` and therefore cannot be re-ordered
    by timestamp later).

    Inclusion rules:
      * system messages are always kept, even with empty content;
      * user and assistant messages are kept only when their content is non-empty;
      * other roles and non-dict entries are skipped.

    Assistant messages carry their ``tool_calls`` through; other roles get an
    empty list. Returns ``[]`` when ``inputs`` or ``inputs.messages`` is missing
    or not of the expected type.
    """
    inputs = item.get("inputs")
    input_messages = inputs.get("messages") if isinstance(inputs, dict) else None
    if not isinstance(input_messages, list):
        return []

    ts = item.get("started_at") or item.get("created_timestamp")
    messages = []
    for msg in input_messages:
        if not isinstance(msg, dict):
            continue
        role = msg.get("role")
        if role not in ("user", "system", "assistant"):
            continue

        content = _collapse_content_to_text(msg.get("content"))
        if not content and role != "system":
            continue

        tool_calls = (msg.get("tool_calls") or []) if role == "assistant" else []
        messages.append({
            "role": role,
            "content": content,
            "tool_calls": tool_calls,
            "ts": ts,
        })

    return messages

def stream_agent_runs_by_task(repo_id: str, zip_name: str, *, revision: str = "main", 
                             repo_kind: str = "datasets", member_name=None,
                             require_model=None, include_eval: bool = False, 
                             limit=None, aggregate_all: bool = True):
    """Yield ``(weave_task_id, agent_run)`` pairs for one encrypted trace ZIP.

    The ZIP is opened from the Hugging Face filesystem, the chosen member
    (``member_name``, or the first non-directory entry) is parsed as the
    encrypted JSON container, decrypted to a temp file, and its
    ``raw_logging_results`` items are streamed and grouped per task.

    Args:
        repo_id, revision, repo_kind: Locate the repository on the HF filesystem.
        zip_name: Archive name inside the repository.
        member_name: Explicit archive member to read; defaults to the first file.
        require_model: When set, only items whose ``inputs.model`` equals this
            value contribute messages.
        include_eval: Attach the task's eval blob (an item carrying ``reward``,
            ``task`` or ``info``) to the emitted run.
        limit: Stop after emitting this many tasks. Note the whole log is still
            parsed before anything is emitted.
        aggregate_all: Accepted for compatibility; not used by this implementation.

    Raises:
        ValueError: If the archive contains no file members.
        KeyError: If ``member_name`` is not in the archive.

    The temp file holding the plaintext is removed when the generator finishes
    or is closed.
    """
    fs = HfFileSystem()
    zip_path = f"{repo_kind}/{repo_id}@{revision}/{zip_name}"

    # Context managers guarantee both handles are released even when member
    # lookup or JSON parsing fails.
    with fs.open(zip_path, "rb") as hf_file, zipfile.ZipFile(hf_file) as zf:
        if member_name:
            info = zf.getinfo(member_name)
        else:
            info = next((i for i in zf.infolist() if not i.filename.endswith("/")), None)
            if info is None:
                raise ValueError(f"No file members found in {zip_path}")
        with zf.open(info, "r") as member:
            container = json.load(member)

    plaintext_path = _decrypt_container_to_tempfile(container)
    # The container holds the full base64 ciphertext; release it before the long parse.
    del container

    tasks_bucket = {}
    model_by_tid = {}
    eval_by_tid = {}
    produced = 0

    try:
        with open(plaintext_path, "rb") as f:
            for item in ijson.items(f, "raw_logging_results.item"):
                tid = item.get("weave_task_id")
                if not tid:
                    continue

                inputs = item.get("inputs") or {}
                output = item.get("output") or {}

                # Model discovery (recorded even for items filtered out below)
                mdl = inputs.get("model") or output.get("model")
                if mdl:
                    model_by_tid[tid] = mdl

                # Eval blob: stored separately and never contributes messages
                if ("reward" in item) or ("task" in item) or ("info" in item):
                    eval_by_tid[tid] = item
                    continue

                # Filter by model if requested
                if require_model and inputs.get("model") != require_model:
                    continue

                bucket = tasks_bucket.setdefault(tid, [])

                # Contextual messages carried in inputs.messages for this item
                bucket.extend(extract_contextual_messages_from_item(item))

                # Single-message rows from inputs.raw
                nm = normalize_weave_log_item(item)
                if nm:
                    bucket.append(nm)

                # Assistant reply from the output side
                ao = normalize_assistant_output(item)
                if ao:
                    bucket.append(ao)

        # Emit once per task
        for tid, bucket in tasks_bucket.items():
            run = _build_agent_run_from_bucket(
                tid=tid,
                bucket=bucket,
                model=model_by_tid.get(tid),
                eval_blob=eval_by_tid.get(tid) if include_eval else None,
            )
            yield tid, run
            produced += 1
            if limit and produced >= limit:
                break
    finally:
        try:
            os.remove(plaintext_path)
        except OSError:
            pass

print("FIXED message processing - now extracts contextual messages per item for proper ordering!")

FIXED message processing - now extracts contextual messages per item for proper ordering!


In [None]:
# Convert loaded_results to docent ChatMessage format
def normalize_message_for_docent(msg):
    """Return a shallow copy of ``msg`` with ``tool_calls`` in docent's flat format.

    Each tool call becomes ``{"id", "type", "function", "arguments"}``:

    * ``id``: the existing id, or ``tool_<index>`` when it is missing *or None*.
    * ``type``: always ``"function"``.
    * ``function``: a string name (taken from a string ``function`` or from
      ``function["name"]``), else ``"unknown_function"``.
    * ``arguments``: top-level ``arguments``, else ``function["arguments"]``,
      else ``{}``. A JSON string that decodes to an object is decoded to a
      dict and a blank string becomes ``{}``; any other string is passed
      through unchanged.

    The input message and its tool-call dicts are not modified.
    """
    normalized = msg.copy()

    raw_tool_calls = normalized.get('tool_calls')
    if not raw_tool_calls:
        return normalized

    fixed_tool_calls = []
    for tool_call in raw_tool_calls:
        fn = tool_call.get('function')

        # Function name - should be a string, not a dict
        if isinstance(fn, str):
            fn_name = fn
        elif isinstance(fn, dict):
            fn_name = fn.get('name') or 'unknown_function'
        else:
            fn_name = 'unknown_function'

        # Arguments - prefer the top level, then the nested function dict
        arguments = tool_call.get('arguments')
        if arguments is None and isinstance(fn, dict):
            arguments = fn.get('arguments')
        if isinstance(arguments, str):
            if not arguments.strip():
                arguments = {}
            else:
                try:
                    decoded = json.loads(arguments)
                except ValueError:
                    decoded = None
                if isinstance(decoded, dict):
                    arguments = decoded
        if arguments is None:
            arguments = {}

        fixed_tool_calls.append({
            # `or` (not a .get default) so an explicit None id is replaced too
            'id': tool_call.get('id') or f"tool_{len(fixed_tool_calls)}",
            'type': 'function',  # docent expects 'function'
            'function': fn_name,
            'arguments': arguments,
        })

    normalized['tool_calls'] = fixed_tool_calls
    return normalized

def convert_to_docent_messages(loaded_results):
    """Convert every task's messages into docent chat-message objects.

    ``loaded_results`` maps a ZIP name to either ``{"error": ...}`` (passed
    through untouched) or ``{task_id: agent_run}``. Each message is normalized
    and parsed; messages that fail to parse are skipped and counted, with the
    first five failures printed as warnings.

    Returns:
        ``(docent_results, conversion_stats)`` where ``docent_results`` mirrors
        the input layout with per-task conversion records, and
        ``conversion_stats`` holds totals, per-exception-type counts and a list
        of ``(task_id, failed_count)`` for tasks with failures.
    """
    print("Converting to docent ChatMessage format...")

    stats = {
        'total_messages': 0,
        'successful': 0,
        'failed': 0,
        'failed_tasks': [],
        'error_types': {}
    }
    converted = {}

    for zip_name, tasks in loaded_results.items():
        # Files that failed upstream are carried through with their error info.
        if "error" in tasks:
            converted[zip_name] = tasks
            continue

        per_zip = {}
        converted[zip_name] = per_zip

        for task_id, agent_run in tasks.items():
            source_messages = agent_run.get('messages', [])
            stats['total_messages'] += len(source_messages)

            parsed_messages = []
            failures_in_task = 0

            for index, raw_msg in enumerate(source_messages):
                try:
                    parsed = parse_chat_message(normalize_message_for_docent(raw_msg))
                    parsed_messages.append(parsed)
                    stats['successful'] += 1
                except Exception as exc:
                    kind = type(exc).__name__
                    by_type = stats['error_types']
                    by_type[kind] = by_type.get(kind, 0) + 1

                    # Cap the warnings so a systematic failure does not flood the output.
                    if stats['failed'] < 5:
                        print(f"Warning: Failed to parse message {index} in task {task_id[:12]}...: {exc}")

                    stats['failed'] += 1
                    failures_in_task += 1

            if failures_in_task > 0:
                stats['failed_tasks'].append((task_id, failures_in_task))

            per_zip[task_id] = {
                'weave_task_id': agent_run.get('weave_task_id'),
                'model': agent_run.get('model'),
                'eval': agent_run.get('eval'),
                'original_message_count': len(source_messages),
                'docent_message_count': len(parsed_messages),
                'failed_message_count': failures_in_task,
                'docent_messages': parsed_messages,
                'original_messages': source_messages,
            }

    return converted, stats

In [None]:
def _build_docent_agent_run(zip_name, task_id, agent_run_data):
    """Build one AgentRun (with a single "default" transcript) from a converted task record."""
    metadata = {
        "benchmark_id": "assistantbench",
        "task_id": task_id,
        # `or` rather than a .get default: the key is present but None when no model was found
        "model": agent_run_data.get('model') or 'unknown',
        "run_id": zip_name,
        "weave_task_id": agent_run_data.get('weave_task_id'),
        "eval": agent_run_data.get('eval'),
        "original_message_count": agent_run_data['original_message_count'],
        "docent_message_count": agent_run_data['docent_message_count'],
        "failed_message_count": agent_run_data['failed_message_count']
    }

    transcript = Transcript(
        messages=agent_run_data['docent_messages'],
        metadata=metadata
    )

    # AgentRun expects a dict of transcripts (plural)
    return AgentRun(
        transcripts={"default": transcript},
        metadata=metadata
    )


def upload_all_transcripts_to_transluce(docent_results, collection_id, client, batch_by_model=True):
    """
    Upload all transcripts from docent_results to a transluce collection.
    
    Args:
        docent_results: Dictionary containing converted transcript data
            (zip name -> {task_id -> record}, or zip name -> {"error": ...})
        collection_id: The collection ID in transluce to upload to
        client: The docent client instance (only `add_agent_runs` is called)
        batch_by_model: If True, upload runs one model at a time instead of all at once
    
    Returns:
        Dictionary with upload statistics: total_runs, successful_uploads,
        failed_uploads, skipped_runs (ZIP entries skipped because they carry
        an error) and failed_runs (one dict per failure).

    Failures while building an individual run or while uploading a batch are
    recorded in the statistics and do not abort the remaining work.
    """
    
    upload_stats = {
        'total_runs': 0,
        'successful_uploads': 0,
        'failed_uploads': 0,
        'skipped_runs': 0,
        'failed_runs': []
    }

    def _prepare(zip_name, task_id, agent_run_data):
        """Count the run and build its AgentRun; record the failure and return None on error."""
        upload_stats['total_runs'] += 1
        try:
            return _build_docent_agent_run(zip_name, task_id, agent_run_data)
        except Exception as e:
            print(f"‚ùå Failed to create AgentRun for task {task_id[:12]}...: {e}")
            upload_stats['failed_uploads'] += 1
            upload_stats['failed_runs'].append({
                'task_id': task_id,
                'zip_name': zip_name,
                'error': str(e)
            })
            return None

    def _upload(agent_runs, start_msg, success_msg, failure_prefix, failure_extra):
        """Upload one batch and update the statistics; a failure marks every run in the batch."""
        if not agent_runs:
            return
        try:
            print(start_msg)
            client.add_agent_runs(collection_id, agent_runs)
            upload_stats['successful_uploads'] += len(agent_runs)
            print(success_msg)
        except Exception as e:
            print(f"{failure_prefix}: {e}")
            upload_stats['failed_uploads'] += len(agent_runs)
            for agent_run in agent_runs:
                upload_stats['failed_runs'].append({
                    'agent_run_id': agent_run.id,
                    **failure_extra,
                    'error': str(e)
                })

    def _iter_valid_zips():
        """Yield (zip_name, tasks) for entries without an error, counting the skipped ones."""
        for zip_name, tasks in docent_results.items():
            if "error" in tasks:
                print(f"‚ö†Ô∏è  Skipping {zip_name}: contains error")
                upload_stats['skipped_runs'] += 1
                continue
            yield zip_name, tasks
    
    print("üöÄ Processing docent_results for upload to transluce...")
    
    if batch_by_model:
        # Group agent runs by model so each model is uploaded as its own batch
        model_groups = {}
        for zip_name, tasks in _iter_valid_zips():
            for task_id, agent_run_data in tasks.items():
                model = agent_run_data.get('model') or 'unknown'
                model_groups.setdefault(model, []).append((zip_name, task_id, agent_run_data))
        
        print(f"üìä Found {len(model_groups)} models:")
        for model, runs in model_groups.items():
            print(f"   {model}: {len(runs)} runs")
        
        # Process each model group separately
        for model, runs in model_groups.items():
            print(f"\nüîÑ Processing model: {model} ({len(runs)} runs)")
            agent_runs = []
            for zip_name, task_id, agent_run_data in runs:
                agent_run = _prepare(zip_name, task_id, agent_run_data)
                if agent_run is not None:
                    agent_runs.append(agent_run)
            
            _upload(
                agent_runs,
                f"   üîÑ Uploading {len(agent_runs)} runs for {model}...",
                f"   ‚úÖ Successfully uploaded {len(agent_runs)} runs for {model}!",
                f"   ‚ùå Failed to upload runs for {model}",
                {'model': model},
            )
    
    else:
        # Original behavior - upload all at once
        agent_runs = []
        for zip_name, tasks in _iter_valid_zips():
            print(f"üìÅ Processing {zip_name}...")
            for task_id, agent_run_data in tasks.items():
                agent_run = _prepare(zip_name, task_id, agent_run_data)
                if agent_run is None:
                    continue
                agent_runs.append(agent_run)
                if len(agent_runs) % 10 == 0:
                    print(f"   ‚úÖ Prepared {len(agent_runs)} agent runs...")
        
        print(f"üìä Prepared {len(agent_runs)} agent runs for upload")
        
        _upload(
            agent_runs,
            f"üîÑ Uploading {len(agent_runs)} agent runs to collection {collection_id}...",
            f"‚úÖ Successfully uploaded {len(agent_runs)} agent runs!",
            "‚ùå Failed to upload agent runs",
            {},
        )
    
    # Print final statistics
    print(f"\nüìà Upload Statistics:")
    print(f"   Total runs processed: {upload_stats['total_runs']}")
    print(f"   Successfully uploaded: {upload_stats['successful_uploads']}")
    print(f"   Failed uploads: {upload_stats['failed_uploads']}")
    print(f"   Skipped runs: {upload_stats['skipped_runs']}")
    
    if upload_stats['failed_runs']:
        print(f"   Failed runs: {len(upload_stats['failed_runs'])}")
        
    return upload_stats

In [8]:
# Find all matching ZIP files

# Only archives whose (lower-cased) file name starts with this prefix are considered.
ASSISTANTBENCH_ZIP_PREFIX = 'assistantbench_assistantbench_browser_agent'

# Archives excluded by exact file name. Defined once here instead of being
# rebuilt for every listed file; a set gives constant-time membership checks.
EXCLUDED_ZIP_FILES = frozenset({
    'assistantbench_assistantbench_browser_agent_claude37sonnet20250219_low_1748711087_UPLOAD.zip',
    'assistantbench_assistantbench_browser_agent_deepseekr1_1755121049_UPLOAD.zip',
    'assistantbench_assistantbench_browser_agent_gemini20flash_1746393958_UPLOAD.zip',
    'assistantbench_assistantbench_browser_agent_gpt5_1754598271_UPLOAD.zip',
    'assistantbench_assistantbench_browser_agent_o320250416_1746376643_UPLOAD.zip',
    'assistantbench_assistantbench_browser_agent_o4mini20250416_1746227177_UPLOAD.zip',
})

fs = HfFileSystem()
repo_path = f"datasets/{REPO_ID}@{REVISION}"
files = fs.ls(repo_path, detail=True)

# Filter for AssistantBench browser agent files
print("\nLooking for AssistantBench browser agent ZIP files...")
browser_agent_zip_files = []
for file_info in files:
    file_path = file_info['name']
    if not file_path.endswith('.zip'):
        continue

    file_name = file_path.split('/')[-1]
    if not file_name.lower().startswith(ASSISTANTBENCH_ZIP_PREFIX):
        continue
    if file_name in EXCLUDED_ZIP_FILES:
        continue

    browser_agent_zip_files.append({
        'name': file_name,
        'size': file_info.get('size', 0),
        'path': file_path,
    })

# Sort by name for consistent processing
browser_agent_zip_files.sort(key=lambda x: x['name'])

print(f"Found {len(browser_agent_zip_files)} AssistantBench browser agent ZIP files:")
for i, zf in enumerate(browser_agent_zip_files):
    size_mb = zf['size'] / (1024 * 1024)
    print(f"  {i+1:2d}. {zf['name']:<80} ({size_mb:>8.1f} MB)")

print(f"\nReady to process {len(browser_agent_zip_files)} files...")


Looking for AssistantBench browser agent ZIP files...
Found 12 AssistantBench browser agent ZIP files:
   1. assistantbench_assistantbench_browser_agent_claude37sonnet20250219_1746383806_UPLOAD.zip (   826.7 MB)
   2. assistantbench_assistantbench_browser_agent_claude37sonnet20250219_high_1748638033_UPLOAD.zip (   774.7 MB)
   3. assistantbench_assistantbench_browser_agent_claudeopus4120250514_1754678912_UPLOAD.zip (  1522.9 MB)
   4. assistantbench_assistantbench_browser_agent_claudeopus41_high_1755199872_UPLOAD.zip (  1464.0 MB)
   5. assistantbench_assistantbench_browser_agent_deepseekaideepseekr1_1748894163_UPLOAD.zip (    17.7 MB)
   6. assistantbench_assistantbench_browser_agent_deepseekaideepseekv3_1746233269_UPLOAD.zip (    58.5 MB)
   7. assistantbench_assistantbench_browser_agent_gemini20flash_1755192000_UPLOAD.zip  (   635.9 MB)
   8. assistantbench_assistantbench_browser_agent_gpt4120250414_1746225570_UPLOAD.zip  (   443.8 MB)
   9. assistantbench_assistantbench_browser_ag

In [10]:
# Process all browser agent files and collect results

# Maximum number of tasks collected per ZIP file. With the value 10 only the
# first ten tasks of each archive are kept; set to None to process every task.
MAX_TASKS_PER_ZIP = 10

all_results = {}  # zip_name -> {task_id -> agent_run}, or zip_name -> {"error": message}
overall_progress = tqdm(browser_agent_zip_files, desc="Processing ZIP files")

for file_info in overall_progress:
    zip_name = file_info['name']
    overall_progress.set_description(f"Processing {zip_name[:50]}...")
    
    try:
        # Process this ZIP file
        zip_results = {}
        
        for tid, agent_run in stream_agent_runs_by_task(
            repo_id=REPO_ID,
            zip_name=zip_name,
            revision=REVISION,
            repo_kind="datasets",
            member_name=None,
            require_model=None,
            include_eval=True,
            limit=MAX_TASKS_PER_ZIP,
            aggregate_all=True,
        ):
            zip_results[tid] = agent_run
        
        all_results[zip_name] = zip_results
        
        print(f"\n‚úÖ {zip_name}: {len(zip_results)} tasks processed")
        
        # Show sample info
        if zip_results:
            sample_task_id = next(iter(zip_results))
            sample_run = zip_results[sample_task_id]
            print(f"   Sample: Task {sample_task_id[:12]}... has {len(sample_run.get('messages', []))} messages")
            if sample_run.get('model'):
                print(f"   Model: {sample_run['model']}")
        
    except Exception as e:
        # Record the failure for this archive and move on to the next one.
        print(f"\n‚ùå Error processing {zip_name}: {e}")
        all_results[zip_name] = {"error": str(e)}

overall_progress.close()

print(f"\nüéâ Processing complete!")
print(f"Total files processed: {len(all_results)}")
successful_files = sum(1 for v in all_results.values() if "error" not in v)
print(f"Successful files: {successful_files}")
total_tasks = sum(len(v) for v in all_results.values() if "error" not in v)
print(f"Total tasks extracted: {total_tasks:,}")

Processing assistantbench_assistantbench_browser_agent_claude...:   8%|‚ñä         | 1/12 [00:32<05:56, 32.40s/it]


‚úÖ assistantbench_assistantbench_browser_agent_claude37sonnet20250219_1746383806_UPLOAD.zip: 10 tasks processed
   Sample: Task efc0f3a47e9e... has 217 messages
   Model: claude-3-7-sonnet-20250219


Processing assistantbench_assistantbench_browser_agent_claude...:  17%|‚ñà‚ñã        | 2/12 [00:59<04:51, 29.18s/it]


‚úÖ assistantbench_assistantbench_browser_agent_claude37sonnet20250219_high_1748638033_UPLOAD.zip: 10 tasks processed
   Sample: Task efc0f3a47e9e... has 203 messages
   Model: claude-3-7-sonnet-20250219


Processing assistantbench_assistantbench_browser_agent_claude...:  25%|‚ñà‚ñà‚ñå       | 3/12 [01:54<06:11, 41.24s/it]


‚úÖ assistantbench_assistantbench_browser_agent_claudeopus4120250514_1754678912_UPLOAD.zip: 10 tasks processed
   Sample: Task efc0f3a47e9e... has 284 messages
   Model: anthropic/claude-opus-4.1


Processing assistantbench_assistantbench_browser_agent_deepse...:  33%|‚ñà‚ñà‚ñà‚ñé      | 4/12 [02:43<05:54, 44.25s/it]


‚úÖ assistantbench_assistantbench_browser_agent_claudeopus41_high_1755199872_UPLOAD.zip: 10 tasks processed
   Sample: Task 8ad84bd6fe38... has 361 messages
   Model: anthropic/claude-opus-4.1


Processing assistantbench_assistantbench_browser_agent_deepse...:  42%|‚ñà‚ñà‚ñà‚ñà‚ñè     | 5/12 [02:45<03:23, 29.04s/it]


‚úÖ assistantbench_assistantbench_browser_agent_deepseekaideepseekr1_1748894163_UPLOAD.zip: 10 tasks processed
   Sample: Task efc0f3a47e9e... has 28 messages
   Model: deepseek-ai/DeepSeek-R1


Processing assistantbench_assistantbench_browser_agent_gemini...:  50%|‚ñà‚ñà‚ñà‚ñà‚ñà     | 6/12 [02:49<02:01, 20.33s/it]


‚úÖ assistantbench_assistantbench_browser_agent_deepseekaideepseekv3_1746233269_UPLOAD.zip: 10 tasks processed
   Sample: Task efc0f3a47e9e... has 86 messages
   Model: deepseek-ai/DeepSeek-V3


Processing assistantbench_assistantbench_browser_agent_gpt412...:  58%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñä    | 7/12 [03:10<01:43, 20.75s/it]


‚úÖ assistantbench_assistantbench_browser_agent_gemini20flash_1755192000_UPLOAD.zip: 10 tasks processed
   Sample: Task 929b45f34805... has 180 messages
   Model: google/gemini-2.0-flash-001


Processing assistantbench_assistantbench_browser_agent_gpt5_1...:  67%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñã   | 8/12 [03:25<01:15, 18.77s/it]


‚úÖ assistantbench_assistantbench_browser_agent_gpt4120250414_1746225570_UPLOAD.zip: 10 tasks processed
   Sample: Task efc0f3a47e9e... has 119 messages
   Model: gpt-4.1


Processing assistantbench_assistantbench_browser_agent_o32025...:  75%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñå  | 9/12 [04:00<01:11, 23.81s/it]


‚úÖ assistantbench_assistantbench_browser_agent_gpt5_1754633901_UPLOAD.zip: 10 tasks processed
   Sample: Task efc0f3a47e9e... has 282 messages
   Model: gpt-5


Processing assistantbench_assistantbench_browser_agent_o4mini...:  83%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñé | 10/12 [04:19<00:44, 22.27s/it]


‚úÖ assistantbench_assistantbench_browser_agent_o320250416_1750104946_UPLOAD.zip: 10 tasks processed
   Sample: Task 6b06d186921b... has 152 messages
   Model: o3


Processing assistantbench_assistantbench_browser_agent_o4mini...:  92%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñè| 11/12 [04:41<00:22, 22.29s/it]


‚úÖ assistantbench_assistantbench_browser_agent_o4mini20250416_high_1746297478_UPLOAD.zip: 10 tasks processed
   Sample: Task efc0f3a47e9e... has 248 messages
   Model: o4-mini


Processing assistantbench_assistantbench_browser_agent_o4mini...: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 12/12 [05:00<00:00, 25.08s/it]


‚úÖ assistantbench_assistantbench_browser_agent_o4mini20250416_low_1746374633_UPLOAD.zip: 10 tasks processed
   Sample: Task 4e615af6f034... has 26 messages
   Model: o4-mini

üéâ Processing complete!
Total files processed: 12
Successful files: 12
Total tasks extracted: 120





In [None]:
# Convert the extracted runs; `stats` holds the tallies reported below.
docent_results, stats = convert_to_docent_messages(all_results)

print("\n‚úÖ Conversion complete!")
print("üìä Message conversion stats:")
for label, key in (
    ("Total messages", "total_messages"),
    ("Successfully converted", "successful"),
    ("Failed to convert", "failed"),
):
    print(f"   {label}: {stats[key]}")

# Skip the percentage when there were no messages, to avoid dividing by zero.
if stats['total_messages'] > 0:
    success_pct = stats['successful'] / stats['total_messages'] * 100
    print(f"   Success rate: {success_pct:.1f}%")

if stats['error_types']:
    print("\n‚ö†Ô∏è  Error types:")
    for kind, n_errors in stats['error_types'].items():
        print(f"   {kind}: {n_errors}")

if stats['failed_tasks']:
    print(f"\nüìã Tasks with failed messages: {len(stats['failed_tasks'])}")
    # Only the first five entries are listed to keep the output short.
    for failed_task_id, n_failed in stats['failed_tasks'][:5]:
        print(f"   {failed_task_id[:20]}...: {n_failed} failed messages")

# Entries containing an 'error' key are excluded from the run count.
converted_run_count = sum(
    len(runs) for runs in docent_results.values() if 'error' not in runs
)
print(f"\nüìÅ Converted {converted_run_count} agent runs")

In [None]:
# Create a Docent collection and upload the converted runs, batched by model.
# The API key must come from the environment; it is never hardcoded here.
docent_api_key = os.environ.get("DOCENT_API_KEY")
if not docent_api_key:
    raise ValueError("DOCENT_API_KEY environment variable is required")

client = Docent(api_key=docent_api_key)

collection_id = client.create_collection(
    description="Testing integration of Hal and Docent with AssistantBench (batched by model)",
    name="assistantbench_batched",
)

upload_stats = upload_all_transcripts_to_transluce(
    docent_results,
    collection_id,
    client,
    batch_by_model=True,
)

In [12]:
# Save results to JSON file
import json, base64
from datetime import date, datetime
from decimal import Decimal

def _json_default(o):
    if isinstance(o, Decimal):
        return format(o, "f")  # keep exact value as a string
    if isinstance(o, (datetime, date)):
        return o.isoformat()
    if isinstance(o, bytes):
        return base64.b64encode(o).decode("ascii")
    # optional: numpy support
    try:
        import numpy as np
        if isinstance(o, (np.integer, np.floating, np.bool_)):
            return o.item()
        if isinstance(o, np.ndarray):
            return o.tolist()
    except Exception:
        pass
    return str(o)

output_file = "assistantbench_browser_agent_runs.json"

# Write the raw per-ZIP results; values json cannot encode natively are
# routed through _json_default. Keys are sorted in the output.
with open(output_file, mode="w") as f:
    json.dump(
        all_results,
        f,
        sort_keys=True,
        indent=2,
        default=_json_default,
    )

bytes_per_mb = 1024 * 1024
file_size_mb = os.path.getsize(output_file) / bytes_per_mb
print(f"‚úÖ Saved complete results to: {output_file}")
print(f"üìä File size: {file_size_mb:.1f} MB")

# Entries containing an 'error' key are excluded from the task count.
saved_task_count = sum(len(v) for v in all_results.values() if 'error' not in v)
print(f"üìã Contains {len(all_results)} ZIP files with {saved_task_count} total tasks")

‚úÖ Saved complete results to: assistantbench_browser_agent_runs.json
üìä File size: 17.5 MB
üìã Contains 12 ZIP files with 120 total tasks
