In [None]:
import os

# Locate the project root by walking up from the current working directory
# until a directory containing a `.git` entry is found. A notebook has no
# script path of its own, so the search starts from os.getcwd().
start_dir = os.getcwd()
current_path = start_dir

# Stop either at the first directory holding `.git` or at the filesystem root
# (the only directory that is its own parent).
while (
    not os.path.exists(os.path.join(current_path, '.git'))
    and os.path.dirname(current_path) != current_path
):
    current_path = os.path.dirname(current_path)  # Move one level up

if os.path.exists(os.path.join(current_path, '.git')):
    # Change the working directory to the detected root
    os.chdir(current_path)
    print(f'Changed working directory to: {os.getcwd()}')
else:
    # The loop ran out at the filesystem root without finding a marker.
    # Changing into the filesystem root would make every relative path used
    # later resolve against it, so keep the original directory and say so.
    current_path = start_dir
    print(
        f'No .git directory found above {start_dir}; '
        'working directory left unchanged.'
    )

In [None]:
import json
import os
import re

import pandas as pd

# Directory that is scanned for metrics files. The path is relative, so it is
# resolved against the current working directory.
base_dir = 'logs/step_tasks'

In [None]:
# Regular expressions for filtering subfolders and JSON files.
# subfolder_pattern: `step_<digits>_openhands_` followed by digit groups laid
#   out as 4-2-2_2-2 (looks like a date plus hour-minute stamp -- TODO confirm).
#   It is not anchored, so it can match anywhere in a name when used with search().
# json_file_pattern: the same 4-2-2_2-2 digit layout followed by `_metrics.json`;
#   `$` anchors the end of the name.
subfolder_pattern = re.compile(r'step_\d+_openhands_\d{4}-\d{2}-\d{2}_\d{2}-\d{2}')
json_file_pattern = re.compile(r'\d{4}-\d{2}-\d{2}_\d{2}-\d{2}_metrics\.json$')

In [None]:
# Collect the parsed content of every metrics JSON file found below `base_dir`.
data_list = []

# Top-down walk: every directory is visited, but only directories whose own
# name matches `subfolder_pattern` contribute files.
for root, dirs, files in os.walk(base_dir):
    # Sorting `dirs` in place makes os.walk descend in a stable order, so the
    # row order of the collected data does not depend on the filesystem.
    dirs.sort()

    # Check if the current directory matches the required subfolder pattern
    if not subfolder_pattern.search(os.path.basename(root)):
        continue  # Skip directories that don't match

    for file in sorted(files):
        if not json_file_pattern.match(file):
            continue  # Not a metrics JSON file

        file_path = os.path.join(root, file)

        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                json_data = json.load(f)
        except (json.JSONDecodeError, UnicodeDecodeError, OSError) as e:
            # UnicodeDecodeError is a ValueError but not a JSONDecodeError, so
            # it has to be listed explicitly; otherwise a single badly encoded
            # file would abort the whole scan.
            print(f'Error reading {file_path}: {e}')
            continue

        # Optional provenance fields, currently disabled:
        # json_data["source_file"] = file
        # json_data["source_folder"] = os.path.basename(root)

        data_list.append(json_data)

if not data_list:
    # os.walk yields nothing for a missing directory, so make the empty result
    # visible here instead of letting a later column selection fail.
    print(f'No metrics files were loaded from {base_dir!r}.')

In [None]:
# Create DataFrame from the collected records: one row per entry of `data_list`
# (columns are derived from the record keys, assuming each record is a JSON object).
df = pd.DataFrame(data_list)

In [None]:
# Preview the first rows of the unfiltered data.
df.head()

In [None]:
# Preview the last rows of the unfiltered data.
df.tail()

In [None]:
# Print each column name as a quoted, comma-terminated line, ready to paste
# into a column selection list.
cols = df.columns.tolist()
for col in cols:
    print("'{}',".format(col))

In [None]:
# Columns kept for the step-task report, in output order.
selected_columns = [
    'agent_name',
    'difficulty',
    'task_id',
    'query',
    'final_answer',
    'success',
    'checkpoint_provided_ratio',
    'checkpoint_expected_ratio',
    'checkpoints',
    'model_name',
    'screenshots',
    'model_calls',
    'input_tokens',
    'output_tokens',
    'total_tokens',
    'full_runtime',
    'text_model',
    'vision_model',
    'multi_agent',
    'timestamp',
]

# Keep only the report columns, then order the rows by agent name,
# difficulty and task id (in that priority).
df_filtered = df[selected_columns].sort_values(
    by=['agent_name', 'difficulty', 'task_id']
)
df_filtered.head()

In [None]:
# Show the first 30 rows of the sorted, filtered table.
df_filtered.head(30)

In [None]:
# Show the last rows of the sorted, filtered table.
df_filtered.tail()

In [None]:
# Distinct agent names present in the filtered data.
df_filtered['agent_name'].unique()

In [None]:
# Export the filtered table as a semicolon-separated CSV file, leaving out the
# row index.
df_filtered.to_csv(
    'logs/step_tasks/step_tasks.csv',
    sep=';',
    index=False,
)

In [None]:
# List the columns available for the summary below.
df_filtered.columns

### Summary of stepwise tasks

In [None]:
# Summarise the runs per (model_name, agent_name, difficulty) group.
# Every `*_mean` column is the group mean of the corresponding source column
# (`success_rate_mean` is the mean of `success`); `number_of_tasks` is the
# number of rows that went into each group. Results are rounded to two decimals.
df_summary = (
    df_filtered.groupby(['model_name', 'agent_name', 'difficulty'])
    .agg(
        success_rate_mean=('success', 'mean'),
        checkpoint_provided_ratio_mean=('checkpoint_provided_ratio', 'mean'),
        checkpoint_expected_ratio_mean=('checkpoint_expected_ratio', 'mean'),
        model_calls_mean=('model_calls', 'mean'),
        total_tokens_mean=('total_tokens', 'mean'),
        full_runtime_mean=('full_runtime', 'mean'),
        number_of_tasks=('task_id', 'size'),
    )
    .round(2)
)
df_summary.head(10)

In [None]:
# Export the summary as a semicolon-separated CSV file. The index (the
# group-by keys) is written as well because it is not disabled here.
df_summary.to_csv(
    'logs/step_tasks/step_tasks_summary.csv',
    sep=';',
)

### Steps

In [None]:
import json

import pandas as pd

In [None]:
def load_checkpoints(checkpoints_path):
    """Read a JSON file and return its parsed content.

    The original cell used a bare ``return`` at top level, which is a
    SyntaxError, and read ``checkpoints_path`` before any cell defined it.
    Wrapping the logic in a function makes the cell valid and removes the
    dependency on execution order.

    Args:
        checkpoints_path: Path of the JSON file to read (opened as UTF-8).

    Returns:
        The deserialised JSON document.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    with open(checkpoints_path, 'r', encoding='utf-8') as file:
        return json.load(file)

In [None]:
# Location of the step definitions; update with the actual checkpoint file path.
checkpoints_path = 'step_wise_evaluation/steps_definition.json'

# Read the whole file as UTF-8 text and parse it as JSON.
with open(checkpoints_path, encoding='utf-8') as file:
    checkpoints_data = json.loads(file.read())

In [None]:
# Display the complete parsed checkpoint definitions.
checkpoints_data

In [None]:
# Print every top-level key of checkpoints_data, one per line.
# Iterating a dict yields its keys directly, and print() is called as a plain
# statement instead of being wrapped in a throwaway one-element tuple.
for key in checkpoints_data:
    print(key)

In [None]:
# For each top-level key, print the key followed by the "query" of every item
# in its "entries" list.
for key, definition in checkpoints_data.items():
    print(key)
    for entry in definition['entries']:
        print(entry['query'])

In [None]:
# For each top-level key, print the key and then the value stored under "goal".
for key, definition in checkpoints_data.items():
    print(key)
    print(definition['goal'])