In [54]:
import ast
import json
from pathlib import Path

import pandas as pd

In [55]:
# JSON Lines run log, located one directory above the working directory.
LOG_PATH = Path("..") / "log.jsonl"

In [56]:
def get_all_files_matching(**kwargs):
    """
    Load the run log and keep only the rows that satisfy every keyword filter.

    Each keyword names a column of the JSON Lines log at LOG_PATH; a row is
    kept when that column compares equal (`==`) to the keyword's value. The
    filters are applied one after another, so a row must pass all of them.
    With no keywords the whole log is returned.

    Returns a DataFrame holding the matching rows with their original row
    labels. Raises KeyError if a keyword is not a column of the log.
    """
    matches = pd.read_json(LOG_PATH, lines=True)
    for column, wanted in kwargs.items():
        keep = matches[column] == wanted
        matches = matches[keep]
    return matches

In [57]:
# Select the log entries recorded for this model on the "answerable" split.
files = get_all_files_matching(
    model_name="/public/hf/models/meta-llama/Meta-Llama-3.1-8B-Instruct/",
    split="answerable",
)

In [58]:
# `files` keeps the row labels of the unfiltered log, so label 0 is only
# present when the very first log entry matched; pick the first match by position.
Path(files["filename"].iloc[0]).name

'2025-05-08T19:25:53.jsonl'

In [59]:
# Load the records of the first matching run from ../runs. The lookup is
# positional (`.iloc[0]`) because the filtered `files` frame keeps the row
# labels of the full log, so label 0 is not guaranteed to exist.
p = Path("..", "runs", Path(files["filename"].iloc[0]).name)
with p.open("r") as f:
    df = pd.read_json(f, lines=True)

In [None]:
def get_model_final_answer(row):
    """
    Return the model's final answer for a run, as a string.

    `row` is a DataFrame whose first row carries a "messages" list of message
    dicts. The answer is the "content" of the first message with role "tool"
    and name "final_answer", converted with `str`. Returns None when the
    conversation contains no such message.
    """
    first_label = row.index[0]
    conversation = row["messages"].to_dict()[first_label]

    # Collect every tool message emitted by the `final_answer` tool.
    answers = []
    for message in conversation:
        if message["role"] != "tool":
            continue
        if message["name"] == "final_answer":
            answers.append(message)

    # Guard clause: the model never produced a final answer.
    if not answers:
        return None

    return str(answers[0]["content"])


def get_gold_final_answer(row):
    """
    Return the gold final answer recorded in a run's action sequence.

    `row` is a DataFrame whose first row carries an "actions" list of action
    dicts. The answer is the "result" of the first action named
    "final_answer", returned as stored (no string conversion). Returns None
    when the sequence has no such action.

    Raises KeyError if the final_answer action has no "result" key.
    """
    actions = row["actions"].iloc[0]
    final_answers = [action for action in actions if action["name"] == "final_answer"]
    if not final_answers:
        return None
    # The comprehension yields a list, so index the first match before reading its result.
    return final_answers[0]["result"]


def _parse_tool_arguments(raw):
    """Decode a tool call's argument string (JSON or Python-literal dict) without executing code."""
    if not isinstance(raw, str):
        # Already decoded (e.g. by an earlier pass over the same data).
        return raw
    try:
        return json.loads(raw)
    except json.JSONDecodeError:
        # Fall back to Python literal syntax (single-quoted dict reprs).
        return ast.literal_eval(raw)


def get_model_tool_calls(
    row,
    clean=False,
):
    """
    Return the tool calls issued by the model in a run.

    `row` is a DataFrame whose first row carries a "messages" list of message
    dicts. For every message with role "assistant" that has a non-empty
    "tool_calls" list, the "function" entry of its FIRST tool call is taken
    (additional calls in the same message are ignored) and its "arguments"
    string is decoded into a Python object.

    The returned dicts are new objects: the messages stored in `row` are left
    untouched, so the function can be called repeatedly on the same row.

    Parameters:
        row: one-row DataFrame with a "messages" column.
        clean: when True, drop calls to the "think" tool.

    Returns:
        A list of dicts with the function's keys (e.g. "name", "arguments"),
        in conversation order.

    Raises:
        ValueError / SyntaxError if an argument string is neither valid JSON
        nor a Python literal.
    """
    messages = row["messages"].iloc[0]

    tool_calls = []
    for msg in messages:
        # Only assistant messages that actually carry tool calls are relevant;
        # `.get` also skips a missing, None or empty "tool_calls" value.
        if msg["role"] != "assistant" or not msg.get("tool_calls"):
            continue
        function = msg["tool_calls"][0]["function"]
        # Build a fresh dict instead of overwriting "arguments" in place.
        tool_calls.append(
            {**function, "arguments": _parse_tool_arguments(function["arguments"])}
        )

    # If clean, remove 'think' tool calls
    if clean:
        tool_calls = [call for call in tool_calls if call["name"] != "think"]

    return tool_calls


def get_gold_tool_calls(
    row,
    clean=False,
):
    """
    Return the gold tool calls of a run, without their recorded results.

    `row` is a DataFrame whose first row carries an "actions" list of action
    dicts. Each returned dict is a shallow copy of an action with its
    "result" key omitted; the actions stored in `row` are left untouched, so
    the results remain available to other code reading the same row.

    Parameters:
        row: one-row DataFrame with an "actions" column.
        clean: when True, drop calls to the "think" tool.

    Returns:
        A list of action dicts (minus "result"), in sequence order. Nested
        values such as "arguments" are shared with the originals, not copied.
    """
    actions = row["actions"].iloc[0]

    # Copy each action without 'result' rather than deleting the key in place.
    tool_calls = [
        {key: value for key, value in action.items() if key != "result"}
        for action in actions
    ]

    # If clean, remove 'think' tool calls
    if clean:
        tool_calls = [call for call in tool_calls if call.get("name") != "think"]

    return tool_calls

In [65]:
# Draw one example at random (no random_state is set, so a different row is
# picked on every run) and extract its gold and model tool-call sequences.
row = df.sample(1)
gold = get_gold_tool_calls(row)
model = get_model_tool_calls(row)

In [66]:
gold

[{'name': 'get_country_codes_in_region',
  'arguments': {'region_name': 'Eastern Europe'}},
 {'name': 'get_indicator_code_from_name',
  'arguments': {'indicator_name': 'Ores and metals exports (% of merchandise exports)'}},
 {'name': 'retrieve_value',
  'arguments': {'country_code': 'BLR',
   'indicator_code': 'TX.VAL.MMTL.ZS.UN',
   'year': '2020'}},
 {'name': 'retrieve_value',
  'arguments': {'country_code': 'BGR',
   'indicator_code': 'TX.VAL.MMTL.ZS.UN',
   'year': '2020'}},
 {'name': 'retrieve_value',
  'arguments': {'country_code': 'CZE',
   'indicator_code': 'TX.VAL.MMTL.ZS.UN',
   'year': '2020'}},
 {'name': 'retrieve_value',
  'arguments': {'country_code': 'HUN',
   'indicator_code': 'TX.VAL.MMTL.ZS.UN',
   'year': '2020'}},
 {'name': 'retrieve_value',
  'arguments': {'country_code': 'MDA',
   'indicator_code': 'TX.VAL.MMTL.ZS.UN',
   'year': '2020'}},
 {'name': 'retrieve_value',
  'arguments': {'country_code': 'POL',
   'indicator_code': 'TX.VAL.MMTL.ZS.UN',
   'year': '2020'

In [67]:
model

[{'name': 'think',
  'arguments': {'thought': 'To solve this problem, I need to first get the list of country codes in the Eastern Europe region. Then, I need to get the indicator code for Ores and metals exports (% of merchandise exports). After that, I need to retrieve the values of this indicator for each country in the region for the year 2020. Finally, I need to calculate the mean of these values.'}},
 {'name': 'get_country_codes_in_region',
  'arguments': {'region_name': 'Eastern Europe'}},
 {'name': 'get_indicator_code_from_name',
  'arguments': {'indicator_name': 'Ores and metals exports (% of merchandise exports)'}},
 {'name': 'retrieve_value',
  'arguments': {'country_code': 'BLR',
   'indicator_code': 'TX.VAL.MMTL.ZS.UN',
   'year': '2020'}},
 {'name': 'retrieve_value',
  'arguments': {'country_code': 'BGR',
   'indicator_code': 'TX.VAL.MMTL.ZS.UN',
   'year': '2020'}},
 {'name': 'retrieve_value',
  'arguments': {'country_code': 'CZE',
   'indicator_code': 'TX.VAL.MMTL.ZS.UN