In [19]:
import inspect
import os
import sys
from pathlib import Path

import analysis
import pandas as pd

# In Jupyter, __file__ is not defined, so use the current working directory.
# The parent of the working directory is prepended to sys.path so that modules located
# one level up can be imported by the statements below.
# NOTE(review): this relies on the kernel's working directory being the notebook's own
# directory -- confirm before running from elsewhere.
sys.path.insert(0, os.path.abspath(os.path.join(os.getcwd(), '..')))

import matcher

from frankenstein.tools import arithmetic, data_retrieval

# Names of every Python function found as a member of each tool module.
ARITHMETIC_TOOL_NAMES = [member[0] for member in inspect.getmembers(arithmetic, inspect.isfunction)]
DATA_TOOL_NAMES = [member[0] for member in inspect.getmembers(data_retrieval, inspect.isfunction)]

# Load every run file under `runs/` into a DataFrame keyed by the file stem.
# Path.iterdir() yields entries in arbitrary order and also returns sub-directories and
# hidden files (e.g. `.ipynb_checkpoints`, `.DS_Store`) that would break read_json, so
# keep only regular, non-hidden files and sort them for a reproducible listing.
run_dir = Path('runs')
run_files = sorted(f for f in run_dir.iterdir() if f.is_file() and not f.name.startswith('.'))
dfs = {f.stem: pd.read_json(f, orient='records', lines=True, precise_float=True) for f in run_files}
print(f'Found {len(dfs)} runs in {run_dir}:')
for name in dfs:
    print(f'  {name}')

# Instantiate the project's Matcher with its default arguments.
# NOTE(review): `m` is not referenced in the cells shown here -- confirm it is still needed.
m = matcher.Matcher()

Found 54 runs in runs:
  gpt-4o-mini_answerable-full_all-tools_3-shot
  Qwen3-14B_answerable-full_data-tools_0-shot
  Qwen3-32B_answerable-full_all-tools_1-shot
  Llama-3.2-3B-Instruct_answerable-full_data-tools_0-shot
  Mistral-Small-3.1-24B_answerable-full_data-tools_0-shot
  Qwen3-32B_answerable-partial_all-tools_0-shot
  Qwen3-14B_answerable-full_all-tools_0-shot
  gpt-4.1-mini_answerable-full_all-tools_1-shot
  Llama-3.1-70B-Instruct_answerable-full_all-tools_1-shot
  Qwen3-4B_answerable-partial_all-tools_0-shot
  Qwen3-32B_answerable-full_data-tools_0-shot
  gpt-4o-mini_answerable-full_data-tools_0-shot
  Mistral-Small-3.1-24B_answerable-partial_all-tools_0-shot
  Qwen3-4B_answerable-full_all-tools_3-shot
  Llama-3.1-70B-Instruct_answerable-partial_all-tools_0-shot
  Qwen3-14B_answerable-full_all-tools_1-shot
  Llama-3.3-70B-Instruct_answerable-full_all-tools_1-shot
  Qwen3-32B_answerable-full_all-tools_0-shot
  Llama-3.1-8B-Instruct_answerable-full_data-tools_0-shot
  Llama-3.1-

In [20]:
# Select the single run that the following cells analyse.
selected_run = 'Qwen3-14B_answerable-full_all-tools_0-shot'
df = dfs[selected_run]

In [21]:
# Show the first record of the selected run.
df.iloc[:1]

Unnamed: 0,id,question_template,question,actions,answer,slot_values,answerable,data_availability,answer_format,messages,tokens,pred,correct,error
0,859b3565-fdd3-4c76-9c40-a2fa528ec406,AverageChange,What was the average yearly change in the perc...,"[{'name': 'get_country_code_from_name', 'argum...",-0.48044,"{'property': 'SE.PRE.ENRR', 'subject': 'BEL', ...",True,full,float,"[{'role': 'system', 'content': 'You are a help...",4236,-0.48044,True,0.0


In [22]:
# Add the per-row tool-call columns using the helpers from `analysis`.
# The columns are filled in the listed order (dicts preserve insertion order).
# NOTE(review): later helpers presumably read the columns added by earlier ones -- confirm
# before reordering.
tool_call_columns = {
    # A lambda is kept here so every row receives its own fresh empty list.
    'gold_tool_calls': lambda row: analysis.get_gold_tool_calls(row, []),
    'pred_tool_calls': analysis.get_pred_tool_calls,
    'true_positives': analysis.get_true_positives,
    'false_positives': analysis.get_false_positives,
}
for column, extractor in tool_call_columns.items():
    df[column] = df.apply(extractor, axis=1)


In [23]:
# Inspect the recorded actions of the first row.
df['actions'].iat[0]

[{'name': 'get_country_code_from_name',
  'arguments': {'country_name': 'Belgium'},
  'result': 'BEL',
  'id': None},
 {'name': 'search_for_indicator_names',
  'arguments': {'keywords': 'School enrollment, preprimary (% gross)'},
  'result': [{'indicator_name': 'School enrollment, preprimary (% gross)',
    'indicator_description': 'Gross enrollment ratio is the ratio of total enrollment, regardless of age, to the population of the age group that officially corresponds to the level of education shown. Preprimary education refers to programs at the initial stage of organized instruction, designed primarily to introduce very young children to a school-type environment and to provide a bridge between home and school.'}],
  'id': None},
 {'name': 'get_indicator_code_from_name',
  'arguments': {'indicator_name': 'School enrollment, preprimary (% gross)'},
  'result': 'SE.PRE.ENRR',
  'id': None},
 {'name': 'retrieve_value',
  'arguments': {'country_code': 'BEL',
   'indicator_code': 'SE.PRE

In [24]:
# Gold tool calls of the first row, together with how many there are.
first_gold_calls = df['gold_tool_calls'].iloc[0]
first_gold_calls, len(first_gold_calls)

([{'name': 'get_country_code_from_name',
   'arguments': {'country_name': 'Belgium'}},
  {'name': 'search_for_indicator_names',
   'arguments': {'keywords': 'School enrollment, preprimary (% gross)'}},
  {'name': 'get_indicator_code_from_name',
   'arguments': {'indicator_name': 'School enrollment, preprimary (% gross)'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'BEL',
    'indicator_code': 'SE.PRE.ENRR',
    'year': '2009'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'BEL',
    'indicator_code': 'SE.PRE.ENRR',
    'year': '2010'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'BEL',
    'indicator_code': 'SE.PRE.ENRR',
    'year': '2011'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'BEL',
    'indicator_code': 'SE.PRE.ENRR',
    'year': '2012'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'BEL',
    'indicator_code': 'SE.PRE.ENRR',
    'year': '2013'}},
  {'name': 'retrieve_value',
   'ar

In [25]:
# True-positive tool calls of the second row, together with how many there are.
second_true_positives = df['true_positives'].iloc[1]
second_true_positives, len(second_true_positives)

([{'name': 'get_country_code_from_name',
   'arguments': {'country_name': 'Saint Vincent and the Grenadines'}},
  {'name': 'search_for_indicator_names',
   'arguments': {'keywords': 'Lending interest rate (%)'}},
  {'name': 'get_indicator_code_from_name',
   'arguments': {'indicator_name': 'Lending interest rate (%)'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'VCT',
    'indicator_code': 'FR.INR.LEND',
    'year': '2003'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'VCT',
    'indicator_code': 'FR.INR.LEND',
    'year': '2004'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'VCT',
    'indicator_code': 'FR.INR.LEND',
    'year': '2005'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'VCT',
    'indicator_code': 'FR.INR.LEND',
    'year': '2006'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'VCT',
    'indicator_code': 'FR.INR.LEND',
    'year': '2007'}},
  {'name': 'retrieve_value',
   'argum

In [26]:
# False-positive tool calls of the second row.
df['false_positives'].iat[1]

[{'name': 'add',
  'arguments': {'values': [-2.13975,
    -0.33786,
    -0.1191,
    -0.1085,
    -0.08816,
    0.14755]}},
 {'name': 'divide', 'arguments': {'value_a': -2.64582, 'value_b': 6}}]

In [27]:
# Row-wise coverage and recall, computed by the corresponding `analysis` helpers.
for metric, scorer in (('coverage', analysis.get_coverage), ('recall', analysis.get_recall)):
    df[metric] = df.apply(scorer, axis=1)

In [28]:
# Recall value of the first row.
df['recall'].iat[0]

np.float64(0.5217391304347826)

In [29]:
# Row-wise precision and error flag, computed by the corresponding `analysis` helpers.
# `correct_indicator_data_process` is intentionally not computed in this cell.
for metric, scorer in (('precision', analysis.get_precision), ('error_made', analysis.get_error_made)):
    df[metric] = df.apply(scorer, axis=1)

In [30]:
# Row-wise flag produced by analysis.get_correct_indicator_data_process.
indicator_process_ok = df.apply(analysis.get_correct_indicator_data_process, axis=1)
df['correct_indicator_data_process'] = indicator_process_ok

In [31]:
# How many rows carry each value of the flag.
indicator_process_flags = df['correct_indicator_data_process']
indicator_process_flags.value_counts()

correct_indicator_data_process
True     262
False    138
Name: count, dtype: int64

In [32]:
# Row-wise lists of missing and additional tool calls from the `analysis` helpers.
call_diff_columns = (
    ('missing_tool_calls', analysis.get_missing_tool_calls),
    ('additional', analysis.get_additional_tool_calls),
)
for column, extractor in call_diff_columns:
    df[column] = df.apply(extractor, axis=1)

In [37]:
# Count the tool names found in the `additional` column for rows answered correctly.
# `.eq(True)` is the same element-wise comparison as `== True`, written without the
# literal-boolean comparison that linters flag.
correct = df[df['correct'].eq(True)]
# explode() gives each list element its own row; rows whose list was empty hold NaN.
df_exploded = correct.explode('additional')
# Non-dict entries (such as that NaN) and dicts lacking a 'name' key are both labelled
# 'unknown'. Previously a dict without 'name' became None and was silently dropped by
# value_counts(); the default now matches the false-positive overview cell.
df_exploded['additional'] = df_exploded['additional'].apply(
    lambda x: x.get('name', 'unknown') if isinstance(x, dict) else 'unknown'
)
df_exploded['additional'].value_counts()

additional
retrieve_value                  155
unknown                         136
get_country_code_from_name       23
get_indicator_code_from_name     22
divide                           19
add                              15
search_for_indicator_names       13
greater_than                     13
count                             9
rank                              4
subtract                          1
sort                              1
Name: count, dtype: int64

In [34]:
# Overview of which functions make up the false positives.
# explode() gives each list element its own row so the names can be counted.
df_exploded = df.explode('false_positives')
# Entries that are not dicts, or dicts without a 'name' key, are counted as 'unknown'.
false_positive_names = df_exploded['false_positives'].map(
    lambda call: call.get('name', 'unknown') if isinstance(call, dict) else 'unknown'
)
df_exploded['false_positives'] = false_positive_names
df_exploded['false_positives'].value_counts()

false_positives
retrieve_value                  569
unknown                         201
get_indicator_code_from_name     83
subtract                         42
divide                           42
get_country_code_from_name       39
add                              27
search_for_indicator_names       24
greater_than                     22
get_country_name_from_code       11
count                            10
mean                              8
rank                              7
minimum                           7
index                             6
sort                              4
maximum                           4
multiply                          3
get_country_codes_in_region       2
Name: count, dtype: int64

In [42]:
# Contingency table: answer correctness (rows) against the error flag (columns).
answer_vs_error = pd.crosstab(
    index=df['correct'],
    columns=df['error_made'],
    rownames=['Correct Answer Given'],
    colnames=['Error Made'],
)
answer_vs_error.round(2)

Error Made,False,True
Correct Answer Given,Unnamed: 1_level_1,Unnamed: 2_level_1
False,121,48
True,203,28


In [36]:
# Contingency table: answer correctness (rows) against the indicator-data-process flag (columns).
answer_vs_process = pd.crosstab(
    index=df['correct'],
    columns=df['correct_indicator_data_process'],
    rownames=['Correct Answer Given'],
    colnames=['Correct Indicator Data Process'],
)
answer_vs_process

Correct Indicator Data Process,False,True
Correct Answer Given,Unnamed: 1_level_1,Unnamed: 2_level_1
False,90,79
True,48,183
