In [None]:
from docent import Docent
import pandas as pd
from tqdm import tqdm
import os


DOCENT_API_KEY = os.getenv("DOCENT_API_KEY")

client = Docent(
    api_key=DOCENT_API_KEY,
)

collections = client.list_collections()

In [6]:
collections

[{'id': 'b64b3811-7210-46f1-84fb-c97693ac4f56',
  'name': 'scicode_scicode_tool_calling_agent_all_tasks',
  'description': 'HAL paper analysis: scicode_scicode_tool_calling_agent_all_tasks',
  'created_by': 'b684fd99-8fa8-4134-9699-9b6cde426cb7',
  'created_at': '2025-09-15T21:20:29.554919'},
 {'id': '02879d7f-5e50-4bca-8dd7-33a39da4762d',
  'name': 'assistantbench_assistantbench_browser_agent_all_tasks',
  'description': 'HAL paper analysis: assistantbench_assistantbench_browser_agent_all_tasks',
  'created_by': 'b684fd99-8fa8-4134-9699-9b6cde426cb7',
  'created_at': '2025-09-14T20:02:41.987507'},
 {'id': 'e80ca13a-028c-4cc8-9ea5-1977aa04297d',
  'name': 'taubench_airline_taubench_fewshot_all_tasks',
  'description': 'HAL paper analysis: taubench_airline_taubench_fewshot_all_tasks',
  'created_by': 'b684fd99-8fa8-4134-9699-9b6cde426cb7',
  'created_at': '2025-09-14T19:16:56.519991'},
 {'id': 'db38b00b-3097-4e63-abc6-b36a7c58a7e8',
  'name': 'corebench_hard_coreagent_all_tasks',
  'des

In [7]:
collection_id = "b64b3811-7210-46f1-84fb-c97693ac4f56"

In [8]:
rubrics = client.list_rubrics(collection_id)

In [9]:
rubrics

[{'id': '83762c94-012f-4648-bec5-5a6f24582f03',
  'version': 3,
  'rubric_text': 'Instruction Violation: SciCode is evaluated by running the final cell of the agent output against a set of unit tests. As a result, the test will automatically fail if there are any syntax errors or text in the final cell. Is the final cell in the log something other than correctly formatted Python code?\n\nLabel as a match all runs with a Python syntax error in the final cell which would trigger an error if the full cell was run in a Python interpreter. Label as no match all other runs.',
  'judge_model': {'provider': 'openai',
   'model_name': 'gpt-5',
   'reasoning_effort': 'medium'},
  'output_schema': {'type': 'object',
   'properties': {'label': {'enum': ['match', 'no match'], 'type': 'string'},
    'explanation': {'type': 'string', 'citations': True}}}},
 {'id': 'e9d7dd28-2454-41d5-b493-7c8e9ed431c0',
  'version': 3,
  'rubric_text': 'Tool use: The agent scaffold in SciCode gives the agent access t

In [None]:
rubric_id = "31002fcf-8750-4615-9cdd-28206aad0636"
run_state = client.get_rubric_run_state(collection_id, rubric_id)

In [None]:
run_state_df = pd.DataFrame(run_state['results'])

# Collect all run data and metadata
run_data = []

print(f"Processing {len(run_state['results'])} runs...")

for run in tqdm(run_state['results']):
    run_id = run.get('agent_run_id')
    
    try:
        # Get the run info from docent
        run_info = client.get_agent_run(collection_id, run_id)
        metadata = run_info.metadata
        
        # Extract basic run info
        row_data = {
            'agent_run_id': run_id,
            'benchmark_id': metadata.get('benchmark_id'),
            'model': metadata.get('model'),
            'task_id': metadata.get('task_id'),
            'weave_task_id': metadata.get('weave_task_id'),
            'reasoning_effort': metadata.get('reasoning_effort'),
            'eval_is_successful': metadata.get('eval_is_successful'),
            'original_message_count': metadata.get('original_message_count'),
            'docent_message_count': metadata.get('docent_message_count'),
            'failed_message_count': metadata.get('failed_message_count'),
        }
        
        # Add all other metadata fields with prefix
        for key, value in metadata.items():
            if key not in row_data:  # Don't duplicate already extracted fields
                # Handle lists/arrays by converting to string or counting
                if isinstance(value, list):
                    if key.endswith('_tasks'):
                        row_data[f"{key}_count"] = len(value)
                        # Only store first few items to avoid huge cells
                        if len(value) <= 5:
                            row_data[key] = str(value)
                    else:
                        row_data[key] = str(value)
                else:
                    row_data[key] = value
        
        # Add transcript info - handle both list and dict formats
        if run_info.transcripts:
            if isinstance(run_info.transcripts, dict):
                # Dictionary format - look for 'default' key
                if 'default' in run_info.transcripts:
                    transcript = run_info.transcripts['default']
                    row_data['transcript_id'] = transcript.id
                    row_data['transcript_created_at'] = transcript.created_at
                    row_data['transcript_message_count'] = len(transcript.messages)
            elif isinstance(run_info.transcripts, list) and len(run_info.transcripts) > 0:
                # List format - take the first transcript
                transcript = run_info.transcripts[0]
                row_data['transcript_id'] = transcript.id
                row_data['transcript_created_at'] = transcript.created_at
                row_data['transcript_message_count'] = len(transcript.messages)
        
        run_data.append(row_data)
        
    except Exception as e:
        print(f"Error processing run {run_id}: {e}")
        # Add error row with basic info
        run_data.append({
            'agent_run_id': run_id,
            'error': str(e)
        })

# Create DataFrame
df = pd.DataFrame(run_data)

df.head()

# join run_state_df with run_data
merged_df = pd.merge(run_state_df, df, on='agent_run_id', how='left')

Processing 616 runs...


100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 616/616 [02:45<00:00,  3.72it/s]

âœ… Created DataFrame with 616 rows and 19 columns
ðŸ“Š Columns: ['agent_run_id', 'benchmark_id', 'model', 'task_id', 'weave_task_id', 'reasoning_effort', 'eval_is_successful', 'original_message_count', 'docent_message_count', 'failed_message_count', 'run_id', 'eval_failed_tasks_count', 'eval_successful_tasks_count', 'eval_successful_tasks', 'eval_successful_subtasks', 'eval_has_successful_subtasks', 'transcript_id', 'transcript_created_at', 'transcript_message_count']
ðŸ“Š Successful runs: 25
ðŸ“Š Models: {'openai/o4-mini-2025-04-16_low': 62, 'openai/gemini-2.0-flash': 61, 'gpt-4.1-2025-04-14': 60, 'openai/o4-mini-2025-04-16_high': 60, 'openai/claude-3-7-sonnet-20250219_high': 58, 'openai/claude-3-7-sonnet-20250219': 58, 'openai/o3-2025-04-16': 58, 'openai/gpt-5': 57, 'openai/deepseek-ai/DeepSeek-V3': 55, 'openrouter/anthropic/claude-opus-4.1': 45, 'openai/deepseek-ai/DeepSeek-R1': 42}





Unnamed: 0,agent_run_id,benchmark_id,model,task_id,weave_task_id,reasoning_effort,eval_is_successful,original_message_count,docent_message_count,failed_message_count,run_id,eval_failed_tasks_count,eval_successful_tasks_count,eval_successful_tasks,eval_successful_subtasks,eval_has_successful_subtasks,transcript_id,transcript_created_at,transcript_message_count
0,c99750eb-391c-4416-af0e-37e1d88c3249,scicode,openai/claude-3-7-sonnet-20250219_high,27,27,high,False,85,85,0,scicode_scicode_tool_calling_agent_claude37son...,62,3,"['74', '25', '30']",[],False,1bf5f9a9-c7dd-4b64-b9c1-8331684187cb,2025-09-15 21:20:46.883184,85
1,193fa9e6-f306-4884-aacc-f8b46b451af9,scicode,openai/claude-3-7-sonnet-20250219,35,35,,False,88,88,0,scicode_scicode_tool_calling_agent_claude37son...,63,2,"['74', '36']",['35.1'],True,3108978d-b4db-4632-9a95-c93ddd3c2f69,2025-09-15 21:20:37.011008,88
2,9d7383f6-e278-4fb9-aeb2-3620cc1b13b7,scicode,openai/claude-3-7-sonnet-20250219_high,35,35,high,False,85,85,0,scicode_scicode_tool_calling_agent_claude37son...,62,3,"['74', '25', '30']",[],False,b9deaa65-5a5e-48ce-b7a7-a1124dfda0c3,2025-09-15 21:20:46.883193,85
3,9ebb1c86-9296-47ec-9386-8d4897ae28ba,scicode,openai/claude-3-7-sonnet-20250219_high,15,15,high,False,58,58,0,scicode_scicode_tool_calling_agent_claude37son...,62,3,"['74', '25', '30']",['15.1'],True,0d5487f6-f4da-4d09-8065-ffb0b42f5a2c,2025-09-15 21:20:46.883170,58
4,2223fd62-676e-46af-867e-6f19d7f5df10,scicode,openai/claude-3-7-sonnet-20250219_high,31,31,high,False,84,84,0,scicode_scicode_tool_calling_agent_claude37son...,62,3,"['74', '25', '30']",['31.1'],True,bc0819d1-388b-44f9-bc7f-6b840e95eae2,2025-09-15 21:20:46.883188,84


In [68]:
# Filter out rows where output['label'] == 'no match' and expand output keys into columns
def expand_output_column(df, output_col='output'):
    """
    Filter out 'no match' labels and expand output dictionary keys into separate columns.
    """
    # Create a copy to avoid modifying original
    expanded_df = df.copy()
    
    # Extract label for filtering
    expanded_df['label'] = expanded_df[output_col].apply(
        lambda x: x.get('label') if isinstance(x, dict) else None
    )
    
    # Expand all keys from the output dictionary
    output_expanded = pd.json_normalize(expanded_df[output_col])
    
    # Add prefix to avoid column name conflicts
    output_expanded.columns = [f'output_{col}' for col in output_expanded.columns]
    
    # Reset indices to align properly
    filtered_df_reset = expanded_df.reset_index(drop=True)
    output_expanded_reset = output_expanded.reset_index(drop=True)
    
    # Concatenate the original columns (minus output) with expanded columns
    final_df = pd.concat([
        filtered_df_reset.drop(columns=[output_col]),
        output_expanded_reset
    ], axis=1)
    
    return final_df

# Apply the function
expanded_df = expand_output_column(merged_df)

print(f"Original shape: {merged_df.shape}")
print(f"Filtered & expanded shape: {expanded_df.shape}")
print(f"Rows with 'no match' removed: {merged_df.shape[0] - expanded_df.shape[0]}")
print(f"\nNew columns from output expansion:")
new_cols = [col for col in expanded_df.columns if col.startswith('output_')]
print(new_cols)

# Show the first few rows
expanded_df.head()

Original shape: (616, 25)
Filtered & expanded shape: (616, 28)
Rows with 'no match' removed: 0

New columns from output expansion:
['output_label', 'output_explanation.text', 'output_explanation.citations']


Unnamed: 0,id,agent_run_id,rubric_id,rubric_version,value,result_type,benchmark_id,model,task_id,weave_task_id,...,eval_successful_tasks,eval_successful_subtasks,eval_has_successful_subtasks,transcript_id,transcript_created_at,transcript_message_count,label,output_label,output_explanation.text,output_explanation.citations
0,4b3c7a80-4033-4ea1-8da8-70eeb1769d02,c99750eb-391c-4416-af0e-37e1d88c3249,31002fcf-8750-4615-9cdd-28206aad0636,2,,direct_result,scicode,openai/claude-3-7-sonnet-20250219_high,27,27,...,"['74', '25', '30']",[],False,1bf5f9a9-c7dd-4b64-b9c1-8331684187cb,2025-09-15 21:20:46.883184,85,match,match,The agent encountered an environmental barrier...,"[{'start_idx': 175, 'end_idx': 179, 'agent_run..."
1,d83409ad-a78b-473d-9c0a-96188bcc3fd1,193fa9e6-f306-4884-aacc-f8b46b451af9,31002fcf-8750-4615-9cdd-28206aad0636,2,,direct_result,scicode,openai/claude-3-7-sonnet-20250219,35,35,...,"['74', '36']",['35.1'],True,3108978d-b4db-4632-9a95-c93ddd3c2f69,2025-09-15 21:20:37.011008,88,no match,no match,The agent did encounter an environmental restr...,"[{'start_idx': 208, 'end_idx': 213, 'agent_run..."
2,4501b779-18f4-4433-91a5-e78bed1f599a,9d7383f6-e278-4fb9-aeb2-3620cc1b13b7,31002fcf-8750-4615-9cdd-28206aad0636,2,,direct_result,scicode,openai/claude-3-7-sonnet-20250219_high,35,35,...,"['74', '25', '30']",[],False,b9deaa65-5a5e-48ce-b7a7-a1124dfda0c3,2025-09-15 21:20:46.883193,85,match,match,The agent encountered an environmental restric...,"[{'start_idx': 278, 'end_idx': 283, 'agent_run..."
3,5b5f4408-422e-4a05-a0d4-0355ec8a1292,9ebb1c86-9296-47ec-9386-8d4897ae28ba,31002fcf-8750-4615-9cdd-28206aad0636,2,,direct_result,scicode,openai/claude-3-7-sonnet-20250219_high,15,15,...,"['74', '25', '30']",['15.1'],True,0d5487f6-f4da-4d09-8065-ffb0b42f5a2c,2025-09-15 21:20:46.883170,58,match,match,The agent encountered environmental/tooling ba...,"[{'start_idx': 169, 'end_idx': 173, 'agent_run..."
4,d638ca0e-49b9-4959-8c41-be41bba1e714,2223fd62-676e-46af-867e-6f19d7f5df10,31002fcf-8750-4615-9cdd-28206aad0636,2,,direct_result,scicode,openai/claude-3-7-sonnet-20250219_high,31,31,...,"['74', '25', '30']",['31.1'],True,bc0819d1-388b-44f9-bc7f-6b840e95eae2,2025-09-15 21:20:46.883188,84,match,match,The run encountered clear environmental barrie...,"[{'start_idx': 209, 'end_idx': 214, 'agent_run..."


In [71]:
# keep columns benchmark, model, task_id, agent_run_id, value, citation, eval_is_successful, docent_message_count
expanded_df = expanded_df[['benchmark_id', 'model', 'task_id', 'agent_run_id', 'eval_has_successful_subtasks', 'eval_is_successful', 'label', 'output_explanation.text']]

In [73]:
expanded_df.to_csv("../results/rubrics/scicode_environmentalbarrier.csv", index=False)

In [23]:
# randomly sample 5 rows from the merged_df, stratified by model
sampled_df = expanded_df.groupby(['model', 'label']).apply(lambda x: x.sample(n=3, random_state=42) if len(x) > 4 else x)

  sampled_df = expanded_df.groupby(['model', 'label']).apply(lambda x: x.sample(n=3, random_state=42) if len(x) > 4 else x)


In [25]:
sampled_df.to_csv("../results/validations/corebench_verification.csv", index=False)

In [62]:
# compute conditional probability of eval_is_successful given label
cond_probs = expanded_df.groupby(['label'])['eval_has_successful_subtasks'].mean().reset_index()
cond_probs = cond_probs.rename(columns={'eval_has_successful_subtasks': 'cond_prob_success'})
cond_probs

Unnamed: 0,label,cond_prob_success
0,match,0.483296
1,no match,0.313953


In [74]:
# convert label to boolean with match = True and no match = False
expanded_df['label'] = expanded_df['label'].map({'match': True, 'no match': False})
expanded_df['label'] = expanded_df['label'].astype(bool)

# compute conditional probability of label given eval_is_successful
expanded_df['eval_is_successful'] = expanded_df['eval_is_successful'].astype(bool)
cond_probs = expanded_df.groupby(['eval_is_successful'])['label'].mean().reset_index()
cond_probs


Unnamed: 0,eval_is_successful,label
0,False,0.43824
1,True,0.28


In [149]:
# compute conditional probability of eval_is_successful given label
cond_probs = expanded_df.groupby(['label'])['eval_answer'].mean().reset_index()
cond_probs = cond_probs.rename(columns={'eval_answer': 'cond_prob_success'})
cond_probs

Unnamed: 0,label,cond_prob_success
0,match,0.756303
1,no match,0.515504


In [159]:
# filter to eval_score > 0
# expanded_df = expanded_df[expanded_df['eval_score'] > 0]

# Make a flag for is_successful if eval_score > 0.3
expanded_df['eval_is_successful'] = expanded_df['eval_score'] > 0.75

# convert label to boolean with match = True and no match = False
expanded_df['label'] = expanded_df['label'].map({'match': True, 'no match': False})
expanded_df['label'] = expanded_df['label'].astype(bool)

# compute conditional probability of label given eval_is_successful
cond_probs = expanded_df.groupby(['eval_answer'])['label'].mean().reset_index()
cond_probs

Unnamed: 0,eval_answer,label
0,0.0,0.564103
1,1.0,0.022936
