In [1]:
import os

# Get the absolute path of the current script
current_path = os.getcwd()

# Move up directories until we find the project's root (assumed to have a .git folder or another marker)
while (
    not os.path.exists(os.path.join(current_path, '.git'))
    and os.path.dirname(current_path) != current_path
):
    current_path = os.path.dirname(current_path)  # Move one level up

# Change the working directory to the detected root
os.chdir(current_path)

print(f'Changed working directory to: {os.getcwd()}')

Changed working directory to: /Users/mayte/GitHub/OpenHands


In [2]:
import json
import os
import re

import pandas as pd

# Define the directory to search
base_dir = 'logs/step_tasks'

In [3]:
# Regular expressions for filtering subfolders and JSON files
subfolder_pattern = re.compile(r'step_\d+_openhands_\d{4}-\d{2}-\d{2}_\d{2}-\d{2}')
json_file_pattern = re.compile(r'\d{4}-\d{2}-\d{2}_\d{2}-\d{2}_metrics\.json$')

In [4]:
# List to store data
data_list = []

# Walk through the directory
for root, dirs, files in os.walk(base_dir):
    # Check if the current directory matches the required subfolder pattern
    if not subfolder_pattern.search(os.path.basename(root)):
        continue  # Skip directories that don’t match

    for file in files:
        if json_file_pattern.match(file):  # Match JSON files
            file_path = os.path.join(root, file)

            try:
                with open(file_path, 'r', encoding='utf-8') as f:
                    json_data = json.load(f)

                    # Flatten JSON and add metadata
                    # json_data["source_file"] = file
                    # json_data["source_folder"] = os.path.basename(root)

                    data_list.append(json_data)

            except (json.JSONDecodeError, OSError) as e:
                print(f'Error reading {file_path}: {e}')

In [5]:
# Create DataFrame
df = pd.DataFrame(data_list)

In [6]:
df.head()

Unnamed: 0,timestamp,agent_name,query,folder_name,full_runtime,start_url,final_answer,shortest_time_difference,longest_time_difference,mean_time_difference,...,domain_counts,difficulty,task_id,checkpoints,checkpoint_provided_ratio,checkpoint_expected_ratio,success,text_model,vision_model,multi_agent
0,2025-02-20_16-02,openhands_visual_browsing_agent,What’s the nearest station from the following ...,logs/logs/step_tasks/step_05_openhands_2025-02...,21.065859,,The nearest station to the property is Sheffie...,3.971597,7.137622,5.554609,...,{'www.onthemarket.com': 1},step_05,0,"{'provided': [''], 'expected': ['Spring Lane']}",1.0,0.0,0,1,1,0
1,2025-02-21_19-09,openhands_visual_browsing_agent,Which of the first four listings on 'https://w...,logs/logs/step_tasks/step_06_openhands_2025-02...,140.294708,,The listings that offer off-street parking are...,4.024077,11.306044,7.269007,...,{'www.primelocation.com': 11},step_06,3,"{'provided': [''], 'expected': ['Semi-detached...",1.0,1.0,1,1,1,0
2,2025-02-19_22-43,openhands_visual_browsing_agent,Locate the apartment with the highest price am...,logs/logs/step_tasks/step_04_openhands_2025-02...,59.122161,,"The apartment with the highest price is £8,883...",4.08834,6.800825,5.329784,...,{'www.primelocation.com': 2},step_04,2,"{'provided': [''], 'expected': ['Holland Stree...",1.0,1.0,1,1,1,0
3,2025-02-21_15-38,openhands_visual_browsing_agent,Which is the closest outstanding school to thi...,logs/logs/step_tasks/step_05_openhands_2025-02...,136.308496,,,3.506986,10.710707,6.704902,...,"{'www.onthemarket.com': 2, 'consent.google.com...",step_05,4,"{'provided': [], 'expected': ['Chapel Break In...",0.0,0.0,0,1,1,0
4,2025-02-21_20-02,openhands_visual_browsing_agent,Which of the first four listings on 'https://w...,logs/logs/step_tasks/step_06_openhands_2025-02...,25.034784,,The listings suitable for a student are: Room ...,4.71938,9.774738,7.247059,...,{'www.primelocation.com': 1},step_06,3,"{'provided': [''], 'expected': ['Room to rent ...",1.0,1.0,1,1,1,0


In [7]:
cols = df.columns.tolist()
for col in cols:
    print(f"'{col}',")

'timestamp',
'agent_name',
'query',
'folder_name',
'full_runtime',
'start_url',
'final_answer',
'shortest_time_difference',
'longest_time_difference',
'mean_time_difference',
'median_time_difference',
'number_of_timestamps',
'errors',
'input_tokens',
'output_tokens',
'total_tokens',
'model_name',
'screenshots',
'model_calls',
'num_webpage_visits',
'visited_urls',
'domain_counts',
'difficulty',
'task_id',
'checkpoints',
'checkpoint_provided_ratio',
'checkpoint_expected_ratio',
'success',
'text_model',
'vision_model',
'multi_agent',


In [8]:
df_filtered = df[
    [
        'agent_name',
        'difficulty',
        'task_id',
        'query',
        'final_answer',
        'success',
        'checkpoint_provided_ratio',
        'checkpoint_expected_ratio',
        'checkpoints',
        'model_name',
        'screenshots',
        'model_calls',
        'input_tokens',
        'output_tokens',
        'total_tokens',
        'full_runtime',
        'text_model',
        'vision_model',
        'multi_agent',
        'timestamp',
    ]
]
# df_filtered sort by difficulty and task_id
df_filtered = df_filtered.sort_values(by=['agent_name', 'difficulty', 'task_id'])
df_filtered.head()

Unnamed: 0,agent_name,difficulty,task_id,query,final_answer,success,checkpoint_provided_ratio,checkpoint_expected_ratio,checkpoints,model_name,screenshots,model_calls,input_tokens,output_tokens,total_tokens,full_runtime,text_model,vision_model,multi_agent,timestamp
16,openhands_browsing_agent,step_02,0,Find rentals in Bristol on 'https://www.onthem...,Please provide the current URL of the search r...,1,1.0,1.0,"{'provided': ['onthemarket.com'], 'expected': ...",gpt-4o,0,7,78355,529,78884,84.337136,1,0,0,2025-02-15_21-27
32,openhands_browsing_agent,step_02,1,Look at the filters available on 'https://www....,Too many errors encountered. Task failed.,0,1.0,0.0,"{'provided': ['primelocation.com', '/to-rent',...",gpt-4o,0,12,29063,656,29719,75.2342,1,0,0,2025-02-15_19-44
23,openhands_browsing_agent,step_02,2,Filter for furnished apartments in Denver on C...,https://denver.craigslist.org/search/apa?furni...,0,1.0,0.0,"{'provided': ['denver.craigslist.org', '/apa']...",gpt-4o,0,3,16913,158,17071,38.36151,1,0,0,2025-02-15_20-36
19,openhands_browsing_agent,step_02,3,Navigate to 'https://www.onthemarket.com/to-re...,,1,1.0,1.0,"{'provided': ['onthemarket.com', 'bristol', '/...",gpt-4o,0,15,32989,1259,34248,109.258129,1,0,0,2025-02-15_21-01
13,openhands_browsing_agent,step_02,4,Look at the filters available on 'https://www....,,0,1.0,0.0,"{'provided': ['onthemarket.com', '/to-rent', '...",gpt-4o,0,14,37511,2530,40041,110.636989,1,0,0,2025-02-16_00-33


In [9]:
df_filtered.head(30)

Unnamed: 0,agent_name,difficulty,task_id,query,final_answer,success,checkpoint_provided_ratio,checkpoint_expected_ratio,checkpoints,model_name,screenshots,model_calls,input_tokens,output_tokens,total_tokens,full_runtime,text_model,vision_model,multi_agent,timestamp
16,openhands_browsing_agent,step_02,0,Find rentals in Bristol on 'https://www.onthem...,Please provide the current URL of the search r...,1,1.0,1.0,"{'provided': ['onthemarket.com'], 'expected': ...",gpt-4o,0,7,78355,529,78884,84.337136,1,0,0,2025-02-15_21-27
32,openhands_browsing_agent,step_02,1,Look at the filters available on 'https://www....,Too many errors encountered. Task failed.,0,1.0,0.0,"{'provided': ['primelocation.com', '/to-rent',...",gpt-4o,0,12,29063,656,29719,75.2342,1,0,0,2025-02-15_19-44
23,openhands_browsing_agent,step_02,2,Filter for furnished apartments in Denver on C...,https://denver.craigslist.org/search/apa?furni...,0,1.0,0.0,"{'provided': ['denver.craigslist.org', '/apa']...",gpt-4o,0,3,16913,158,17071,38.36151,1,0,0,2025-02-15_20-36
19,openhands_browsing_agent,step_02,3,Navigate to 'https://www.onthemarket.com/to-re...,,1,1.0,1.0,"{'provided': ['onthemarket.com', 'bristol', '/...",gpt-4o,0,15,32989,1259,34248,109.258129,1,0,0,2025-02-15_21-01
13,openhands_browsing_agent,step_02,4,Look at the filters available on 'https://www....,,0,1.0,0.0,"{'provided': ['onthemarket.com', '/to-rent', '...",gpt-4o,0,14,37511,2530,40041,110.636989,1,0,0,2025-02-16_00-33
38,openhands_browsing_agent,step_02,4,Look at the filters available on 'https://www....,,0,0.0,0.0,"{'provided': ['onthemarket.com', '/to-rent', '...",gpt-4o,0,15,12776,1013,13789,51.117543,1,0,0,2025-02-16_00-07
21,openhands_visual_browsing_agent,step_02,0,Find rentals in Bristol on 'https://www.onthem...,https://www.onthemarket.com/to-rent/property/b...,1,1.0,1.0,"{'provided': ['onthemarket.com'], 'expected': ...",gpt-4o,5,5,34519,359,34878,77.14643,1,1,0,2025-02-15_18-04
20,openhands_visual_browsing_agent,step_02,1,Look at the filters available on 'https://www....,https://www.primelocation.com/to-rent/property...,1,1.0,1.0,"{'provided': ['primelocation.com', '/to-rent',...",gpt-4o,7,7,78568,496,79064,133.212868,1,1,0,2025-02-17_12-27
29,openhands_visual_browsing_agent,step_02,2,Filter for furnished apartments in Denver on C...,https://denver.craigslist.org/search/apa?furni...,0,1.0,0.0,"{'provided': ['denver.craigslist.org', '/apa']...",gpt-4o,3,3,112643,226,112869,72.494177,1,1,0,2025-02-15_20-46
15,openhands_visual_browsing_agent,step_02,3,Navigate to 'https://www.onthemarket.com/to-re...,,1,1.0,1.0,"{'provided': ['onthemarket.com', 'bristol', '/...",gpt-4o,14,14,179310,850,180160,129.443707,1,1,0,2025-02-15_21-19


In [10]:
df_filtered.tail()

Unnamed: 0,agent_name,difficulty,task_id,query,final_answer,success,checkpoint_provided_ratio,checkpoint_expected_ratio,checkpoints,model_name,screenshots,model_calls,input_tokens,output_tokens,total_tokens,full_runtime,text_model,vision_model,multi_agent,timestamp
35,openhands_visual_browsing_agent,step_07,1,"For the following listings, extract 'Name', 'P...","Listing 3: Name: 2 bedroom apartment to rent, ...",0,1.0,0.0,"{'provided': [], 'expected': [{'Name': '2 bedr...",gpt-4o,8,8,31238,920,32158,75.146308,1,1,0,2025-02-21_22-21
17,openhands_visual_browsing_agent,step_07,2,"For the following three listings, ('https://ww...","Data collected: \n1. Listing 1: Price: £7,150 ...",1,1.0,1.0,"{'provided': [], 'expected': ['£7,150 pcm', '1...",gpt-4o,4,4,27485,602,28087,43.13796,1,1,0,2025-02-21_22-41
14,openhands_visual_browsing_agent,step_07,3,"For the following three listings, ('https://ww...","{""listings"": [{""url"": ""https://www.primelocati...",0,1.0,0.66,"{'provided': [], 'expected': ['{""listings"": [{...",gpt-4o,8,8,49791,864,50655,134.353584,1,1,0,2025-02-21_22-47
6,openhands_visual_browsing_agent,step_07,4,For the following overview site 'https://orego...,"{""listings"": [{""price"": ""$795"", ""bedrooms"": ""N...",0,1.0,0.73,"{'provided': [], 'expected': ['{""listings"": [{...",gpt-4o,2,2,47156,401,47557,37.132548,1,1,0,2025-02-21_23-08
30,openhands_visual_browsing_agent,step_07,5,For the following overview site 'https://orego...,Too many errors encountered. Task failed.,0,1.0,0.92,"{'provided': [], 'expected': ['{""listings"": [{...",gpt-4o,22,22,767805,8422,776227,448.411798,1,1,0,2025-02-21_23-27


In [11]:
# df_filtered to csv
df_filtered.to_csv('logs/step_tasks/step_tasks.csv', index=False, sep=';')

In [12]:
df_filtered.columns

Index(['agent_name', 'difficulty', 'task_id', 'query', 'final_answer',
       'success', 'checkpoint_provided_ratio', 'checkpoint_expected_ratio',
       'checkpoints', 'model_name', 'screenshots', 'model_calls',
       'input_tokens', 'output_tokens', 'total_tokens', 'full_runtime',
       'text_model', 'vision_model', 'multi_agent', 'timestamp'],
      dtype='object')

### Summary of stepwise tasks

In [13]:
# aggregate per 'difficulty'

# add column "number of tasks" how many tasks per difficulty were used to calculate the mean
df_summary = (
    df_filtered.groupby('difficulty')
    .agg(
        success=('success', 'mean'),
        checkpoint_provided_ratio=('checkpoint_provided_ratio', 'mean'),
        checkpoint_expected_ratio=('checkpoint_expected_ratio', 'mean'),
        model_calls=('model_calls', 'mean'),
        total_tokens=('total_tokens', 'mean'),
        full_runtime=('full_runtime', 'mean'),
        number_of_tasks=('task_id', 'size'),
    )
    .round(2)
)
# rename "success" column to "success_rate"
df_summary.rename(columns={'success': 'success_rate'}, inplace=True)
# append "_mean" to all columns that are mean values
df_summary.columns = [
    f'{col}_mean' if col != 'number_of_tasks' else col for col in df_summary.columns
]
df_summary.head(10)

Unnamed: 0_level_0,success_rate_mean,checkpoint_provided_ratio_mean,checkpoint_expected_ratio_mean,model_calls_mean,total_tokens_mean,full_runtime_mean,number_of_tasks
difficulty,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
step_02,0.55,0.91,0.55,9.82,77283.82,92.96,11
step_03,1.0,1.0,1.0,6.5,33228.25,84.66,4
step_04,1.0,1.0,1.0,3.0,45188.75,51.64,4
step_05,0.67,0.89,0.67,5.89,33426.22,64.49,9
step_06,0.4,1.0,0.4,14.4,230474.4,250.24,5
step_07,0.17,1.0,0.6,7.67,157198.17,127.37,6


In [14]:
df_summary.to_csv('logs/step_tasks/step_tasks_summary.csv', sep=';')

### Steps

In [13]:
import json

import pandas as pd

In [None]:
with open(checkpoints_path, 'r', encoding='utf-8') as file:
    return json.load(file)

In [16]:
checkpoints_path = 'step_wise_evaluation/steps_definition.json'  # Update with actual checkpoint file path
with open(checkpoints_path, 'r', encoding='utf-8') as file:
    checkpoints_data = json.load(file)

In [17]:
checkpoints_data

{'step_01': {'name': '',
  'pages': [],
  'goal': '',
  'entries': [{'id': 0,
    'query': '',
    'checkpoints': {'provided': [''], 'expected': ['']}}]},
 'step_02': {'name': 'Search and Filter Application',
  'pages': ['real_estate_overview_page'],
  'goal': 'The agent inputs the required search criteria into the appropriate fields on the website.',
  'entries': [{'id': 0,
    'query': "Find rentals in Bristol on 'https://www.onthemarket.com/'. Return the URL of the search results page as your answer.",
    'checkpoints': {'provided': ['onthemarket.com'],
     'expected': ['bristol', '/to-rent']}},
   {'id': 1,
    'query': "Look at the filters available on 'https://www.primelocation.com/to-rent/property/manchester/' and set a price range of £1,000 to £2,000 per month. Return the URL of the search results page as your answer.",
    'checkpoints': {'provided': ['primelocation.com',
      '/to-rent',
      'manchester'],
     'expected': ['price_min=1000', 'price_max=2000']}},
   {'id'

In [None]:
# iterate through checkpoints_data and print the keys:
for key in checkpoints_data.keys():
    (print(key),)

step_01
step_02
step_03
step_04
step_05
step_06
step_07


In [33]:
# iterate through all keys and print the "entries"
for key in checkpoints_data.keys():
    print(key)
    for entry in checkpoints_data[key]['entries']:
        print(f"{entry['query']}")

step_01

step_02
Find rentals in Bristol on 'https://www.onthemarket.com/'. Return the URL of the search results page as your answer.
Look at the filters available on 'https://www.primelocation.com/to-rent/property/manchester/' and set a price range of £1,000 to £2,000 per month. Return the URL of the search results page as your answer.
Filter for furnished apartments in Denver on Craigslist ('https://denver.craigslist.org/search/apa'). Return the URL of the search results page as your answer.
Navigate to 'https://www.onthemarket.com/to-rent/property/bristol/' and adjust the search settings to show only listings that allow pets. Return the URL of the search results page as your answer.
Look at the filters available on 'https://www.onthemarket.com/to-rent/property/manchester/' and set a price range of £1,000 to £2,000 per month. Return the URL of the search results page as your answer.
step_03
Find properties to rent in Leeds on 'https://www.rightmove.co.uk/'. If asked to, accept all co

In [34]:
# for every key in checkpoints_data, print "goals"
for key in checkpoints_data.keys():
    print(key)
    print(checkpoints_data[key]['goal'])

step_01

step_02
The agent inputs the required search criteria into the appropriate fields on the website.
step_03
The agent interacts with dynamic elements on the page, such as search buttons and filters, to execute and refine the search.
step_04
Structured data from the search results, such as property names,prices, and basic attributes (e.g., number of bedrooms, location), is extracted.
step_05
The agent navigates to individual detail pages to extract additional, more specific information, such as property descriptions or ther unstructured data not found in the search results.
step_06
Extracted information is evaluated against the task requirements. If the details meet the criteria, they are prepared for inclusion in the results table; if not, the entry is disregarded.
step_07
Relevant data is added to the results table, after which the agent decides whether to continue to the next detail page (repeating steps 5 to 7) or return to the search results for further refinement (repeating