In [1]:
import inspect
import os
import sys
from pathlib import Path

import analysis
import pandas as pd

# A notebook kernel defines no __file__, so the parent directory is derived
# from the current working directory and placed first on the import path.
sys.path.insert(0, str(Path.cwd().parent))

import matcher

from frankenstein.tools import arithmetic, data_retrieval

# Names of every function-valued attribute of each tool module, in the
# name-sorted order that inspect.getmembers returns them.
ARITHMETIC_TOOL_NAMES = [member[0] for member in inspect.getmembers(arithmetic, inspect.isfunction)]
DATA_TOOL_NAMES = [member[0] for member in inspect.getmembers(data_retrieval, inspect.isfunction)]

# Load every run file under `runs/` into a DataFrame keyed by the file stem.
run_dir = Path('runs')
# iterdir() returns entries in arbitrary order and also yields sub-directories
# and hidden files (e.g. .DS_Store, .ipynb_checkpoints), which read_json would
# choke on. Keep only regular, non-hidden files and sort them so the listing
# and the dict order are stable between runs and machines.
run_files = sorted(
    f for f in run_dir.iterdir()
    if f.is_file() and not f.name.startswith('.')
)
dfs = {f.stem: pd.read_json(f, orient='records', lines=True, precise_float=True) for f in run_files}
print(f'Found {len(dfs)} runs in {run_dir}:')
for name in dfs:
    print(f'  {name}')

# Instantiate the project's Matcher with default arguments. `m` is not
# referenced by any cell visible in this section — presumably later cells use
# it; TODO confirm or remove.
m = matcher.Matcher()

Found 50 runs in runs:
  Mistral-Small-3.1-24B_answerable-full_all-tools_0-shot
  Llama-3.1-8B-Instruct_answerable-full_all-tools_1-shot
  Qwen3-14B_answerable-full_data-tools_0-shot
  Qwen3-14B_answerable-full_all-tools_0-shot
  Llama-3.2-3B-Instruct_answerable-full_all-tools_3-shot
  Qwen3-4B_answerable-full_data-tools_0-shot
  Qwen3-32B_answerable-full_all-tools_1-shot
  Mistral-Small-3.1-24B_answerable-partial_all-tools_0-shot
  gpt-4.1-mini_answerable-full_all-tools_1-shot
  Llama-3.1-70B-Instruct_answerable-full_all-tools_3-shot
  gpt-4o-mini_answerable-full_all-tools_1-shot
  Llama-3.3-70B-Instruct_answerable-partial_all-tools_0-shot
  Llama-3.1-8B-Instruct_answerable-full_data-tools_0-shot
  Llama-3.2-3B-Instruct_answerable-full_data-tools_0-shot
  Llama-3.1-8B-Instruct_answerable-full_all-tools_3-shot
  Qwen3-32B_answerable-full_data-tools_0-shot
  Qwen3-4B_answerable-full_all-tools_0-shot
  Llama-3.3-70B-Instruct_answerable-full_all-tools_0-shot
  Qwen3-14B_answerable-partial

In [2]:
# Work on a copy of the selected run: later cells add columns to `df`, and
# without the copy those assignments would silently mutate the frame cached in
# `dfs`. With the copy, re-running this cell restores a clean frame.
df = dfs['Llama-3.1-70B-Instruct_answerable-full_all-tools_0-shot'].copy()

In [3]:
# Display only the first record to see which columns a run file provides.
df.head(n=1)

Unnamed: 0,id,question_template,question,actions,answer,slot_values,answerable,data_availability,answer_format,messages,tokens,pred,correct,error
0,859b3565-fdd3-4c76-9c40-a2fa528ec406,AverageChange,What was the average yearly change in the perc...,"[{'name': 'get_country_code_from_name', 'argum...",-0.48044,"{'property': 'SE.PRE.ENRR', 'subject': 'BEL', ...",True,full,float,"[{'role': 'system', 'content': 'You are a help...",2996,No data is available for country code,False,100.0


In [4]:
# Derive the per-row tool-call columns. The sequence is kept exactly as
# written: the true/false-positive helpers presumably read the gold/pred
# columns created just before them — confirm in the `analysis` module.
for column, row_fn in (
    ('gold_tool_calls', analysis.get_gold_tool_calls),
    ('pred_tool_calls', analysis.get_pred_tool_calls),
    ('true_positives', analysis.get_true_positives),
    ('false_positives', analysis.get_false_positives),
):
    df[column] = df.apply(row_fn, axis=1)


In [5]:
# Inspect the `actions` entry of the first row.
first_row_actions = df['actions'].iloc[0]
first_row_actions

[{'name': 'get_country_code_from_name',
  'arguments': {'country_name': 'Belgium'},
  'result': 'BEL',
  'id': None},
 {'name': 'search_for_indicator_names',
  'arguments': {'keywords': 'School enrollment, preprimary (% gross)'},
  'result': [{'indicator_name': 'School enrollment, preprimary (% gross)',
    'indicator_description': 'Gross enrollment ratio is the ratio of total enrollment, regardless of age, to the population of the age group that officially corresponds to the level of education shown. Preprimary education refers to programs at the initial stage of organized instruction, designed primarily to introduce very young children to a school-type environment and to provide a bridge between home and school.'}],
  'id': None},
 {'name': 'get_indicator_code_from_name',
  'arguments': {'indicator_name': 'School enrollment, preprimary (% gross)'},
  'result': 'SE.PRE.ENRR',
  'id': None},
 {'name': 'retrieve_value',
  'arguments': {'country_code': 'BEL',
   'indicator_code': 'SE.PRE

In [6]:
# Show the first row's gold tool calls together with how many there are.
first_gold_calls = df['gold_tool_calls'].iloc[0]
first_gold_calls, len(first_gold_calls)

([{'name': 'get_country_code_from_name',
   'arguments': {'country_name': 'Belgium'}},
  {'name': 'search_for_indicator_names',
   'arguments': {'keywords': 'School enrollment, preprimary (% gross)'}},
  {'name': 'get_indicator_code_from_name',
   'arguments': {'indicator_name': 'School enrollment, preprimary (% gross)'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'BEL',
    'indicator_code': 'SE.PRE.ENRR',
    'year': '2009'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'BEL',
    'indicator_code': 'SE.PRE.ENRR',
    'year': '2010'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'BEL',
    'indicator_code': 'SE.PRE.ENRR',
    'year': '2011'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'BEL',
    'indicator_code': 'SE.PRE.ENRR',
    'year': '2012'}},
  {'name': 'retrieve_value',
   'arguments': {'country_code': 'BEL',
    'indicator_code': 'SE.PRE.ENRR',
    'year': '2013'}},
  {'name': 'retrieve_value',
   'ar

In [7]:
# Show the first row's true-positive tool calls together with their count.
first_true_positives = df['true_positives'].iloc[0]
first_true_positives, len(first_true_positives)

([{'name': 'get_country_code_from_name',
   'arguments': {'country_name': 'Belgium'}},
  {'name': 'search_for_indicator_names',
   'arguments': {'keywords': 'School enrollment, preprimary (% gross)'}},
  {'name': 'get_indicator_code_from_name',
   'arguments': {'indicator_name': 'School enrollment, preprimary (% gross)'}}],
 3)

In [8]:
# Inspect the tool calls flagged as false positives for the first row.
first_false_positives = df['false_positives'].iloc[0]
first_false_positives

[{'name': 'think',
  'arguments': {'thought': 'To calculate the average yearly change in the percentage of children enrolled in preprimary education in Belgium for each year between 2009 and 2018, we need to first retrieve the values of the indicator for each year. Then, we can calculate the differences between consecutive years and find the average of these differences.'}},
 {'name': 'get_indicator_code_from_name',
  'arguments': {'indicator_name': 'Percentage of children enrolled in preprimary education'}},
 {'name': 'retrieve_value',
  'arguments': {'country_code': 'BEL',
   'indicator_code': 'SE.PRE.ENRR',
   'year': [2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016, 2017, 2018]}}]

In [9]:
# Add the per-row coverage and recall columns, in the original order.
metric_fns = {
    'coverage': analysis.get_coverage,
    'recall': analysis.get_recall,
}
for metric, row_fn in metric_fns.items():
    df[metric] = df.apply(row_fn, axis=1)

In [10]:
# Recall value computed for the first row.
first_row_recall = df['recall'].iloc[0]
first_row_recall

np.float64(0.13043478260869565)

In [11]:
# Add per-row precision plus the two flag columns cross-tabulated against
# answer correctness in the following cells.
for column, row_fn in (
    ('precision', analysis.get_precision),
    ('error_made', analysis.get_error_made),
    ('no_search_for_indicator_names', analysis.get_no_search_for_indicator_names),
):
    df[column] = df.apply(row_fn, axis=1)

In [12]:
# Contingency table: answer correctness (rows) against the error flag (columns).
correct_vs_error = pd.crosstab(
    index=df['correct'],
    columns=df['error_made'],
    rownames=['Correct Answer Given'],
    colnames=['Error Made'],
)
correct_vs_error

Error Made,False,True
Correct Answer Given,Unnamed: 1_level_1,Unnamed: 2_level_1
False,171,123
True,75,31


In [13]:
# Contingency table: answer correctness (rows) against the
# no-search-for-indicator-names flag (columns).
correct_vs_no_search = pd.crosstab(
    index=df['correct'],
    columns=df['no_search_for_indicator_names'],
    rownames=['Correct Answer Given'],
    colnames=['No Search for Indicator Names'],
)
correct_vs_no_search

No Search for Indicator Names,False,True
Correct Answer Given,Unnamed: 1_level_1,Unnamed: 2_level_1
False,101,193
True,30,76
