In [None]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%reload_ext autoreload
%autoreload 2

In [None]:
import json
import numpy as np
from importlib import import_module
from pathlib import Path
from utils_shared.tracing_storing import run_tool_eval
from utils_shared.tracing_storing import EvalParams

In [None]:
# Project identifier: it is interpolated into the `build_hour.<project>` module
# paths and the `build_hour/<project>/...` data and result paths used below.
project = "finqa_model_text"

In [None]:
# Import the project-specific client module by its dotted path and keep a
# direct handle on the `client` object it exposes.
client_module = import_module(
    f"build_hour.{project}.utils_tools.openai_client"
)
client = client_module.client

In [None]:
# Load the project's train and validation splits (JSON Lines: one JSON object per line).
# Note that `test_file` / `test_items` refer to the `_val` split.
train_file = f"build_hour/{project}/data/{project}_train.jsonl"
test_file = f"build_hour/{project}/data/{project}_val.jsonl"


def _read_jsonl(path):
    """Parse a JSON Lines file into a list with one entry per non-blank line.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default encoding. Blank lines (for example a trailing
    newline at the end of the file) are skipped instead of being handed to
    `json.loads`, which would raise on them.
    """
    with open(path, "r", encoding="utf-8") as f:
        return [json.loads(line) for line in f if line.strip()]


train_items = _read_jsonl(train_file)
test_items = _read_jsonl(test_file)

In [None]:
# Import the project's text-format module and expose its response format,
# which is later passed to EvalParams as the "format" of the `text` argument.
text_format_module = import_module(
    f"build_hour.{project}.utils_tools.text_format"
)
RESPONSE_FORMAT_RESPONSES = text_format_module.RESPONSE_FORMAT_RESPONSES

In [None]:
# Import the project's graders module. GRADERS is used below as a mapping whose
# keys are the grader names looked up in each result's "metrics".
graders_module = import_module(
    f"build_hour.{project}.utils_tools.graders"
)
GRADERS = graders_module.GRADERS

In [None]:
# Import the project's tools module and pull out the two objects that are
# handed to EvalParams: the tool definitions and the name -> function lookup.
tools_module = import_module(
    f"build_hour.{project}.utils_tools.tools"
)
TOOLS_RESPONSES = tools_module.TOOLS_RESPONSES
TOOL_NAME_TO_FUNC = tools_module.TOOL_NAME_TO_FUNC

In [None]:
# Display the loaded tool definitions as the cell output for a quick visual check.
TOOLS_RESPONSES

In [None]:
# Model under evaluation and its reasoning effort; together they form the run
# name, which is also part of the directory the results are read back from.
model = "gpt-5"
reasoning_effort = "medium"
run_name = f"{model}-{reasoning_effort}"

# Bundle the run identity, the model settings and the project-specific
# pieces loaded above into a single EvalParams object.
eval_params = EvalParams(
    # run identity
    project=project,
    run_name=run_name,
    # model settings
    model=model,
    reasoning_effort=reasoning_effort,
    # project-specific output format, tools and graders
    text={"format": RESPONSE_FORMAT_RESPONSES},
    tools=TOOLS_RESPONSES,
    tool_name_to_func=TOOL_NAME_TO_FUNC,
    graders=GRADERS,
)


In [None]:
# Display (number of train items, number of validation items) as a sanity check.
len(train_items), len(test_items)

In [None]:
# How many times the evaluation is repeated. The aggregation further down
# reports mean ± std over runs, so raise this to get a meaningful spread.
N_EVAL_RUNS = 1
# How many items from the start of the validation split each run evaluates.
N_EVAL_ITEMS = 5
# Worker count forwarded to run_tool_eval.
MAX_WORKERS = 16

# `results` and `output_jsonl` are overwritten on every iteration, so after the
# loop they hold the return values of the last run only.
for _ in range(N_EVAL_RUNS):
    results, output_jsonl = run_tool_eval(
        items=test_items[:N_EVAL_ITEMS],
        eval_params=eval_params,
        client=client,
        verbose=False,
        max_workers=MAX_WORKERS,
    )

In [None]:
def _extract_score(value):
    """Return the numeric score held by one metric entry, or None.

    A metric entry is either a bare number or a dict carrying the number under
    the "reward" key. Integers are accepted as well as floats, because JSON
    scores written as `0` or `1` are parsed as `int` and would otherwise be
    dropped silently. Booleans and any other type yield None.
    """
    if isinstance(value, dict):
        value = value.get("reward")
    # bool is a subclass of int, so it has to be excluded explicitly.
    if isinstance(value, bool) or not isinstance(value, (int, float)):
        return None
    return float(value)


# Gather every results.jsonl stored below this run's tool_evals directory.
# Append the evalrun_id to the path to restrict this to one specific run.
# Sorting keeps the processing order stable between executions.
run_dir = Path(f"build_hour/{eval_params.project}/tool_evals/{eval_params.run_name}/")
jsonl_files = sorted(run_dir.rglob("results.jsonl"))
print(jsonl_files)

# Each results file is treated as one run. Per grader we keep the list of
# per-run means and the scores pooled over all runs; same for durations.
grader_run_means = {grader_name: [] for grader_name in GRADERS.keys()}
pooled_scores = {grader_name: [] for grader_name in GRADERS.keys()}
run_durations = []
pooled_durations = []

for i, jsonl_path in enumerate(jsonl_files):
    print(f"Processing {i+1}/{len(jsonl_files)}")
    run_scores = {grader_name: [] for grader_name in GRADERS.keys()}
    run_durs = []
    with open(jsonl_path, "r", encoding="utf-8") as f:
        for line in f:
            if not line.strip():
                # Tolerate blank lines such as a trailing newline.
                continue
            obj = json.loads(line)
            # `or {}` also covers records where "metrics" is present but null.
            metrics = obj.get("metrics") or {}
            for grader_name, scores_this_run in run_scores.items():
                score = _extract_score(metrics.get(grader_name))
                if score is not None:
                    scores_this_run.append(score)
                    pooled_scores[grader_name].append(score)
            duration = obj.get("duration_seconds")
            if duration is not None:
                run_durs.append(duration)
                pooled_durations.append(duration)
    # A run contributes a mean only for graders it actually has scores for.
    for grader_name, scores_this_run in run_scores.items():
        if scores_this_run:
            grader_run_means[grader_name].append(np.mean(scores_this_run))
    if run_durs:
        run_durations.append(np.mean(run_durs))

# Mean and std of the per-run means for each grader. Empty inputs are reported
# explicitly instead of printing nan together with NumPy runtime warnings.
for grader, means in grader_run_means.items():
    if not means:
        print(f"{grader} (mean of run means): no scores found")
        continue
    arr = np.array(means)
    print(f"{grader} (mean of run means): {np.mean(arr):.3f} ± {np.std(arr):.3f}")

arr_durations = np.array(run_durations)
if arr_durations.size:
    print(f"Duration (seconds, mean of run means): {np.mean(arr_durations):.2f} ± {np.std(arr_durations):.2f}")
else:
    print("Duration (seconds, mean of run means): no durations found")

# Array of all durations pooled across runs.
all_durations = np.array(pooled_durations)
print(f"All durations array shape: {all_durations.shape}")

# Arrays of all scores pooled across runs, one per grader.
grader_all_scores = {}
for grader, scores in pooled_scores.items():
    grader_all_scores[grader] = np.array(scores)
    print(f"All scores for {grader}: shape {grader_all_scores[grader].shape}")


In [None]:
import plotly.express as px
import numpy as np

# Define 30 bins between 0 and 1 (31 edges)
bin_edges = np.linspace(0, 1, 31)

# Draw one histogram per grader. Iterating over the dict explicitly avoids
# depending on a loop variable left behind by another cell, which would plot
# only a single grader and fail on a fresh namespace.
for grader_name, grader_scores in grader_all_scores.items():
    if len(grader_scores) == 0:
        print(f"No scores to plot for {grader_name}")
        continue

    fig = px.histogram(
        x=grader_scores,
        title=f"Distribution of scores for {grader_name}",
        labels={'x': 'Score', 'y': 'Frequency'},
    )
    # NOTE(review): if plotly treats bins as left-closed, a score of exactly 1.0
    # may fall outside the last bin / the visible [0, 1] range — confirm against
    # the xbins documentation and widen `end` if needed.
    fig.update_traces(xbins=dict(
        start=bin_edges[0],
        end=bin_edges[-1],
        size=(bin_edges[1] - bin_edges[0])
    ))
    fig.update_layout(
        bargap=0.1,
        template="plotly_white",
        font=dict(family="Helvetica Neue, Helvetica, Arial, sans-serif", size=16),
        showlegend=False,
        # Axis titles are set explicitly so they do not depend on how
        # plotly express maps the `labels` dict onto the count axis.
        xaxis=dict(range=[0, 1], title="Score"),
        yaxis=dict(title="Frequency"),
    )
    fig.show()

In [None]:
import plotly.express as px

# Histogram of the per-item durations pooled over all runs.
fig = px.histogram(
    all_durations,
    nbins=30,
    title="Distribution of durations",
    labels={'value': 'Duration (s)', 'count': 'Frequency'}
)
fig.update_layout(
    bargap=0.1,
    template="plotly_white",
    font=dict(family="Helvetica Neue, Helvetica, Arial, sans-serif", size=16),
    showlegend=False,
    # Set the axis titles explicitly: the count-axis title of a histogram does
    # not appear to be driven by the `labels` mapping (TODO confirm), so the
    # intended "Frequency" title is applied here instead.
    xaxis_title="Duration (s)",
    yaxis_title="Frequency",
)
fig.show()