In [1]:
import json
import pandas as pd
from loguru import logger
import glob

# Load evaluation results

In [2]:
# Run directories to analyse. `fetch` splits each path on "/" and uses the first
# component as the dataset, the second as the provider and the last as the timestamp.
# The commented-out entries below are alternative runs that can be swapped in.
evals_dirs = ["WebVoyager30/Notte/1743001170"]
# evals_dirs = ["WebVoyager30/BrowserUse/1743016360"]
# evals_dirs = ["WebVoyager30/Convergence/1743114165"]

In [3]:
def fetch(evals_dirs):
    """Load every `results_no_screenshot.json` found under the given run directories.

    Args:
        evals_dirs: iterable of directory paths. Each path is split on "/": the
            first component is used as the dataset, the second as the provider
            and the last as the timestamp.

    Returns:
        A list with one flat dict per result file that loaded successfully. Every
        dict has the run/task metadata, the agent's own verdict (`agent_score`)
        and a `fatal_crash` flag (1 when the file's `eval` entry is None, else 0).
        `eval_score` and `eval_reason` are only present when `fatal_crash` is 0.

    Files that cannot be read, parsed or that lack an expected key are logged
    with `logger.error` and skipped; they never abort the whole load.
    """
    results = []
    for _eval_dir in evals_dirs:
        # The path components are identical for every file of this directory,
        # so split once here instead of three times per file.
        path_parts = _eval_dir.split('/')
        pattern = f"{_eval_dir}/**/results_no_screenshot.json"

        for file in sorted(glob.glob(pattern, recursive=True)):
            try:
                # JSON text is UTF-8; do not rely on the platform's locale encoding.
                with open(file, "r", encoding="utf-8") as f:
                    data = json.load(f)

                # Indexing stays inside the try so a malformed directory path is
                # reported per file (as before) rather than raising out of fetch.
                dataset = path_parts[0]
                provider = path_parts[1]
                timestamp = path_parts[-1]

                d = {
                    # utils.
                    'uid': timestamp + "-" + str(data['run_id']),
                    'fatal_crash': 1 if data['eval'] is None else 0,
                    # agent perspective.
                    'agent_score': 'success' if data['success'] else 'failure', # { success, failure }
                    'agent_answer': data['agent_answer'],
                    'steps': data['steps'],
                    # metadata.
                    'dataset': dataset,
                    'provider': provider,
                    'run_id': data['run_id'],
                    'task_id': data['task']['id'],
                    'task': data['task']['question'],
                    'timestamp': timestamp,
                    'summary_file': file,
                    'webp_file': file.replace('results_no_screenshot.json', 'summary.webp'),
                    'duration': round(data['duration_in_s'], 0),
                }

                if data['eval'] is not None:
                    d.update({
                        # eval perspective.
                        'eval_score': data['eval']['eval'], # { success, failure, unknown }
                        'eval_reason': data['eval']['reason']
                    })

                results.append(d)

            except Exception as e:
                # Best effort: one broken result file must not hide all the others.
                logger.error(f"Error loading {file}: {e}")

    logger.info(f"fetched {len(results)} eval results in total")
    return results

In [4]:
# Load every run record, then separate the runs that produced an evaluation
# from the ones flagged as fatal crashes.
_evals = fetch(evals_dirs)
evals_finished, evals_crashed = [], []
for record in _evals:
    if record['fatal_crash'] == 0:
        evals_finished.append(record)
    elif record['fatal_crash'] == 1:
        evals_crashed.append(record)
# Independent copy of the finished runs, kept as the baseline for later filtering.
evals = list(evals_finished)
logger.info(f"valid evals: {len(evals_finished)} | crashed evals: {len(evals_crashed)}")

[32m2025-04-08 12:15:39.103[0m | [1mINFO    [0m | [36m__main__[0m:[36mfetch[0m:[36m44[0m - [1mfetched 240 eval results in total[0m
[32m2025-04-08 12:15:39.104[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m5[0m - [1mvalid evals: 232 | crashed evals: 8[0m


In [5]:
# Distinct task ids and run uids across all loaded records (crashed runs included).
unique_tasks = {record['task_id'] for record in _evals}
unique_uids = {record['uid'] for record in _evals}
logger.info(f"unique tasks: {len(unique_tasks)} | unique uids: {len(unique_uids)}")

[32m2025-04-08 12:15:39.108[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m3[0m - [1munique tasks: 30 | unique uids: 8[0m


In [6]:
# Run uids in sorted order; the trailing slice currently selects every one of them.
only_uids = sorted(unique_uids)[:]
# Rebuild the working set from `evals`, keeping records whose run uid was selected.
evals_finished = [record for record in evals if record['uid'] in only_uids]

# Average over runs (per task), then over tasks

In [7]:
# For each task, count the runs and how many of them the agent / the evaluator
# marked as successful.
success_rate_per_task_over_runs = {}

for ev in evals_finished:
    tally = success_rate_per_task_over_runs.setdefault(
        ev['task_id'],
        {'agent_success': 0, 'eval_success': 0, 'total_runs': 0},
    )
    tally['total_runs'] += 1
    if ev['agent_score'] == 'success':
        tally['agent_success'] += 1
    if ev['eval_score'] == 'success':
        tally['eval_success'] += 1

# Replace the raw counters with the two rates, formatted as 3-decimal strings:
# 'asr' = agent success rate, 'esr' = eval success rate.
for tally in success_rate_per_task_over_runs.values():
    runs = tally.pop('total_runs')
    agent_wins = tally.pop('agent_success')
    eval_wins = tally.pop('eval_success')
    tally['asr'] = f"{agent_wins / runs:.3f}"
    tally['esr'] = f"{eval_wins / runs:.3f}"

In [8]:
# Order the tasks from highest to lowest eval success rate and show them as a table.
r = {
    task_id: rates
    for task_id, rates in sorted(
        success_rate_per_task_over_runs.items(),
        key=lambda item: float(item[1]['esr']),
        reverse=True,
    )
}
df = pd.DataFrame.from_dict(r, orient='index')
display(df)

Unnamed: 0,asr,esr
webvoyager--Allrecipes--0,1.0,1.0
webvoyager--Amazon--10,1.0,1.0
webvoyager--Apple--0,1.0,1.0
webvoyager--ArXiv--10,1.0,1.0
webvoyager--Booking--0,1.0,1.0
webvoyager--Cambridge Dictionary--25,1.0,1.0
webvoyager--Coursera--16,1.0,1.0
webvoyager--Coursera--29,1.0,1.0
webvoyager--ESPN--0,1.0,1.0
webvoyager--GitHub--40,1.0,1.0


In [9]:
# Mean of the per-task rates (the rates are stored as strings, hence the float conversion).
asr = [rates['asr'] for rates in success_rate_per_task_over_runs.values()]
esr = [rates['esr'] for rates in success_rate_per_task_over_runs.values()]
avg_asr = f"{sum(map(float, asr)) / len(asr):.3f}"
avg_esr = f"{sum(map(float, esr)) / len(esr):.3f}"
print(f"agent: {avg_asr} | eval: {avg_esr} | {len(only_uids)}")

agent: 0.851 | eval: 0.779 | 8


# Average over tasks (per run), then over runs

In [10]:
# Per-run tallies keyed by run uid; each entry is reduced to display metrics below.
success_rate_per_run_over_tasks = {}
# Result files where the agent reported success but the evaluator did not.
mismatch_files = []

for ev in evals_finished:
    run_stats = success_rate_per_run_over_tasks.setdefault(ev['uid'], {
        'agent_success': 0,
        'eval_success': 0,
        'total_tasks': 0,
        'mismatch': 0,
        'duration': 0,
        'input_tokens': 0,
        'output_tokens': 0,
        'input_steps': 0,
        'output_steps': 0,
    })

    agent_ok = ev['agent_score'] == 'success'
    eval_ok = ev['eval_score'] == 'success'
    # A mismatch is an agent-claimed success that the evaluator did not confirm.
    msm = 1 if agent_ok and not eval_ok else 0

    run_stats['total_tasks'] += 1
    run_stats['agent_success'] += 1 if agent_ok else 0
    run_stats['eval_success'] += 1 if eval_ok else 0
    run_stats['duration'] += ev['duration']
    run_stats['mismatch'] += msm
    if msm == 1:
        mismatch_files.append(ev['summary_file'])

    # NOTE(review): only the first LLM call of each step is counted, and these
    # token/step counters are removed below without being reported — confirm
    # whether they are meant to be surfaced.
    # `or []` also tolerates a record whose 'steps' value is None.
    for step in ev.get('steps') or []:
        if 'llm_calls' not in step or len(step['llm_calls']) == 0:
            continue
        first_call = step['llm_calls'][0]
        if 'input_tokens' in first_call:
            run_stats['input_tokens'] += first_call['input_tokens']
            run_stats['input_steps'] += 1
        if 'output_tokens' in first_call:
            run_stats['output_tokens'] += first_call['output_tokens']
            run_stats['output_steps'] += 1

# Turn the counters into the displayed columns, in this order:
#   asr  - agent success rate        esr  - eval success rate
#   al   - asr / esr                 msm  - number of mismatches
#   time - mean task duration in seconds
for k, v in success_rate_per_run_over_tasks.items():
    v['asr'] = '{:.3f}'.format(v['agent_success'] / v['total_tasks'])
    v['esr'] = '{:.3f}'.format(v['eval_success'] / v['total_tasks'])

    # The ratio is taken on the rounded rates, as before. A run without any
    # eval success used to raise ZeroDivisionError here; report the ratio as
    # 'inf' (agent claimed successes) or 'nan' (no successes at all) instead.
    asr_value, esr_value = float(v['asr']), float(v['esr'])
    if esr_value > 0:
        v['al'] = '{:.3f}'.format(asr_value / esr_value)
    else:
        v['al'] = 'inf' if asr_value > 0 else 'nan'

    v['msm'] = v['mismatch']
    v['time'] = '{:.0f}s'.format(round(v['duration'] / v['total_tasks'], 0))

    # Drop the raw counters so only the display columns remain.
    for raw_key in (
        'agent_success', 'eval_success', 'total_tasks', 'mismatch', 'duration',
        'input_tokens', 'output_tokens', 'input_steps', 'output_steps',
    ):
        v.pop(raw_key, None)

In [11]:
# Order the runs from highest to lowest agent success rate and show them as a table.
r = {
    uid: metrics
    for uid, metrics in sorted(
        success_rate_per_run_over_tasks.items(),
        key=lambda item: float(item[1]['asr']),
        reverse=True,
    )
}
df = pd.DataFrame.from_dict(r, orient='index')
display(df)

Unnamed: 0,asr,esr,al,msm,time
1743001170-0,0.929,0.857,1.084,3,47s
1743001170-3,0.867,0.767,1.13,3,50s
1743001170-4,0.867,0.8,1.084,3,51s
1743001170-6,0.867,0.733,1.183,4,45s
1743001170-1,0.862,0.759,1.136,3,47s
1743001170-7,0.857,0.893,0.96,1,47s
1743001170-2,0.828,0.759,1.091,2,45s
1743001170-5,0.821,0.75,1.095,3,49s


In [12]:
# Mean of the per-run rates (the rates are stored as strings, hence the float conversion).
asr = [metrics['asr'] for metrics in success_rate_per_run_over_tasks.values()]
esr = [metrics['esr'] for metrics in success_rate_per_run_over_tasks.values()]
avg_asr = f"{sum(map(float, asr)) / len(asr):.3f}"
avg_esr = f"{sum(map(float, esr)) / len(esr):.3f}"
print(f"agent: {avg_asr} | eval: {avg_esr} | {len(only_uids)}")

agent: 0.862 | eval: 0.790 | 8
