In [1]:
%load_ext autoreload
%autoreload 2

In [24]:
import json
import re

from collections import defaultdict, Counter
from pathlib import Path

import numpy as np
import pandas as pd

In [15]:
from constants import (
    PROJECT_ROOT,
    LOGS_DIR,
    GPT_4O_MINI,
    GPT_5_MINI,
)

In [4]:
# List the log files under the "actions/full" sub-directory of LOGS_DIR to see
# how the files are named.
!ls {LOGS_DIR / "actions/full"}

full_actions_gpt-5-mini_2025-08-07.jsonl


In [5]:
def load_jsonl(file_path: Path) -> list:
    """Read a JSON Lines file and return its records as a list.

    Each non-blank line is decoded with ``json.loads``. Blank or
    whitespace-only lines (for example a trailing empty line) are skipped
    instead of raising ``json.JSONDecodeError``.

    Args:
        file_path: Path to the ``.jsonl`` file.

    Returns:
        The decoded records, in file order.

    Raises:
        json.JSONDecodeError: If a non-blank line is not valid JSON.
        OSError: If the file cannot be opened.
    """
    records = []
    # Read as UTF-8 explicitly so decoding does not depend on the platform's
    # default locale encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        for line in f:
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

In [6]:
# Log file name parsing.
# File names follow {setting}_{task}_{model}.jsonl, e.g.
#   full_actions_gpt-5-mini_2025-08-07.jsonl
# Only the model part may contain underscores, so setting and task are matched
# with [^_]+ and the model group takes the rest of the name.
# Earlier pattern, which used .+ for all three groups:
# log_file_pattern = re.compile(r'^(?P<setting>.+)_(?P<task>.+)_(?P<model>.+)\.jsonl$')
log_file_pattern = re.compile(
    r'^(?P<setting>[^_]+)_(?P<task>[^_]+)_(?P<model>.+)\.jsonl$'
)

def parse_log_directory(
    log_dir: Path, testing: bool = False
) -> "dict[str, dict[str, dict[str, list]]] | None":
    """Load every recognized ``.jsonl`` log under ``log_dir`` into a nested dict.

    The tree is walked as ``log_dir/<task>/<setting>/<file>.jsonl``. The task
    and setting keys come from the directory names; the model key comes from
    the ``model`` group of ``log_file_pattern`` applied to the file name
    (``{setting}_{task}_{model}.jsonl``). Non-directories at the task and
    setting levels and files without a ``.jsonl`` suffix are ignored;
    ``.jsonl`` files whose name does not match the pattern are reported with a
    printed message and skipped.

    Args:
        log_dir: Root directory containing one sub-directory per task.
        testing: If True, print the first matching file name together with
            its parsed regex groups and stop immediately without loading any
            file.

    Returns:
        ``{task: {setting: {model: records}}}`` where ``records`` is the list
        returned by ``load_jsonl`` for that file. Returns ``None`` when
        ``testing`` is True and a matching file was found.
    """
    all_outputs: dict[str, dict[str, dict[str, list]]] = {}
    for task_dir in log_dir.iterdir():
        if not task_dir.is_dir():
            continue
        # A task directory is registered even if it ends up with no settings.
        task_outputs = all_outputs.setdefault(task_dir.name, {})
        for setting_dir in task_dir.iterdir():
            if not setting_dir.is_dir():
                continue
            setting_outputs = task_outputs.setdefault(setting_dir.name, {})
            for file_path in setting_dir.iterdir():
                if file_path.suffix != '.jsonl':
                    continue
                match = log_file_pattern.match(file_path.name)
                if not match:
                    print(f"Skipping unrecognized file: {file_path}")
                    continue
                if testing:
                    # Dry run: show how the first file name parses, then stop.
                    print(file_path.name)
                    print(f"groups: {match.groupdict()}")
                    return None
                setting_outputs[match.group('model')] = load_jsonl(file_path)
    return all_outputs

In [9]:
# Dry run that only prints the first matched file name and its regex groups:
# parsed_logs = parse_log_directory(LOGS_DIR, testing=True)
# Full load; the result is indexed as parsed_logs[task][setting][model].
parsed_logs = parse_log_directory(LOGS_DIR)

In [11]:
# Top-level keys are the task directory names.
parsed_logs.keys()

dict_keys(['summary', 'math', 'data2text', 'code', 'actions'])

In [12]:
# Second-level keys are the setting directory names for that task.
parsed_logs["math"].keys()

dict_keys(['sharded', 'full'])

In [14]:
# Third-level keys are the model names parsed from the log file names.
parsed_logs["math"]["full"].keys()

dict_keys(['gpt-5-mini_2025-08-07'])

In [20]:
# Pull one run (math task, full setting, GPT_5_MINI) and inspect its type,
# length, and the keys of its first record.
dummy = parsed_logs["math"]["full"][GPT_5_MINI]
print(f"type: {type(dummy)},\nlength: {len(dummy)},\nfirst entry:\n{dummy[0].keys()}")

type: <class 'list'>,
length: 103,
first entry:
dict_keys(['conv_id', 'conv_type', 'task', 'task_id', 'dataset_fn', 'assistant_model', 'system_model', 'user_model', 'git_version', 'trace', 'is_correct', 'score'])


## Scoring Script

In [22]:
# Score the single `dummy` run first, as a sanity check of the averaging logic.
total_entries = len(dummy)
total_score = 0
for entry in dummy:
    score = entry.get("score", 0)
    # A missing or None score contributes 0 instead of raising on `+=`.
    if score is not None:
        total_score += score
# Guard against an empty run so the cell never divides by zero.
average_score = total_score / total_entries if total_entries else 0.0
print(f"Average score for {GPT_5_MINI} on math/full: {average_score:.2f} over {total_entries} entries")

Average score for gpt-5-mini_2025-08-07 on math/full: 0.88 over 103 entries


In [26]:
# scoring function for list[traces]
def score_task_entries(
    entries: list[dict],
    score_key: str = "score"
) -> tuple[float, int, int]:
    """Average the scores of a list of log entries.

    Entries whose score is ``None`` are counted in ``none_count`` and add
    nothing to the sum, but they still count towards the denominator. Entries
    that lack ``score_key`` entirely are treated as a score of 0.

    Args:
        entries: Log records, each a dict that may contain ``score_key``.
        score_key: Name of the field holding the numeric score.

    Returns:
        ``(average_score, none_count, total_entries)``. For an empty list this
        is ``(0.0, 0, 0)``, so callers can always unpack three values.
    """
    total_entries = len(entries)
    if total_entries == 0:
        # Keep the return shape consistent with the non-empty case; returning
        # a bare float here breaks callers that unpack three values.
        return 0.0, 0, 0
    total_score = 0
    none_count = 0
    for entry in entries:
        score = entry.get(score_key, 0)
        if score is None:
            none_count += 1
        else:
            total_score += score
    average_score = total_score / total_entries
    return average_score, none_count, total_entries

In [None]:
# Build a score table for a single model (GPT_5_MINI for now):
# one row per setting, one column per task, each cell the average score.
# Scores are collected into a nested dict first so the DataFrame is built once
# instead of being enlarged cell by cell.
scores_by_task = {}
for task, settings in parsed_logs.items():
    for setting, models in settings.items():
        entries = models.get(GPT_5_MINI, [])
        if entries:
            avg_score, none_counts, total_entries = score_task_entries(entries)
        else:
            # No log for this model in this task/setting: record NaN rather
            # than crash or report a misleading 0.
            avg_score = float("nan")
        scores_by_task.setdefault(task, {})[setting] = avg_score

df = pd.DataFrame(scores_by_task)
df.index.name = "Setting"
df.columns.name = "Task"
df = df.astype(float)
df

Task,summary,math,data2text,code,actions
Setting,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
sharded,0.080628,0.627451,0.274894,0.367347,0.52381
full,0.109325,0.883495,0.37626,0.49,0.885714


In [29]:
# Print the score table as markdown text (DataFrame.to_markdown relies on the
# optional `tabulate` package).
print(df.to_markdown())

| Setting   |   summary |     math |   data2text |     code |   actions |
|:----------|----------:|---------:|------------:|---------:|----------:|
| sharded   | 0.0806277 | 0.627451 |    0.274894 | 0.367347 |  0.52381  |
| full      | 0.109325  | 0.883495 |    0.37626  | 0.49     |  0.885714 |
