In [11]:
import pandas as pd
from any_guardrail import AnyGuardrail, GuardrailName
import huggingface_hub
import json
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from collections import defaultdict

In [13]:
name = "Function Calling Judge"
criteria = """
Based on the user query and the response provided by the LLM, does the function call meet the following properties:

1. It provides enough parameters to execute the function call.
2. The chosen function call is relevant to the user task.
3. The chosen function is an available function for this task.

The function call must meet all three properties in order to be successful.
"""

rubric = {
    0: "The function call does not have enough parameters or it it is not relevant or it is does not exist in the tools available to the agent",
    1: "The function call meets one of the criteria: has enough parameters or has a call that is relevant to the task or the chosen function is in the tool list.",
    2: "The function all meets two of the criteria: has enough parameters or has a call that is relevant to the task or the chosen function is in the tool list.",
    3: "The function call meets all the criteria: : has enough parameters or has a call that is relevant to the task or the chosen function is in the tool list."
}

required_inputs = ['conversation']
required_output = "response"

flowjudge = AnyGuardrail.create(GuardrailName.FLOWJUDGE, name=name, criteria=criteria, rubric=rubric, required_inputs=required_inputs, required_output=required_output)

INFO:flow_judge.models.huggingface:Downloading the model from Hugging Face Hub using hf-transferfor faster downloads...


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [14]:
pass_criteria = """
Based on the INPUT and the OUTPUT provided by the LLM, does the function call meet the following properties:

1. It provides enough parameters to execute the function call.
2. The chosen function call is relevant to the user task.
3. The chosen function is an available function for this task.

The function call must meet all three properties in order to be successful.
"""

rubric = """
0: The function call does not have enough parameters or it it is not relevant or it is does not exist in the tools available to the OUTPUT
1: The function call meets one of the criteria: has enough parameters or has a call that is relevant to the task or the chosen function is in the OUTPUT.
2: The function all meets two of the criteria: has enough parameters or has a call that is relevant to the task or the chosen function is in the OUTPUT.
3: The function call meets all the criteria: : has enough parameters or has a call that is relevant to the task or the chosen function is in the OUTPUT.
"""

glider = AnyGuardrail.create(GuardrailName.GLIDER, pass_criteria=pass_criteria, rubric=rubric)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [6]:
with open("/home/dni138/data/single-turn.json", "r") as f:
    hammerbench_data = json.load(f)

In [19]:
len(hammerbench_data)

13054

In [23]:
def function_calling_experiment(model_dict, benchmark, keys=["messages", "tools"]):
    results = defaultdict(list)
    for model_name, model in model_dict.items():
        for row in benchmark:
            conversation = str(row["messages"])
            tools = str(row["tools"])
            if model_name == "FlowJudge":
                inputs = [{required_inputs[0]: conversation}]
                output = {required_output: tools}
                results[model_name].append(model.validate(inputs, output))
            elif model_name == "Glider":
                results[model_name].append(model.validate(conversation, tools))
            else:
                return None
            results["benchmark_id"].append(row["id"])

    return results

In [24]:
model_dict = {
    "FlowJudge": flowjudge,
    "Glider": glider
}

results = function_calling_experiment(model_dict, hammerbench_data)

The function call provided in the output does not meet the criteria specified in the evaluation rubric. 

1. It does not provide enough parameters to execute the function call. The function call for "ConvenientLiving.AutoLife.searchUsedCars" requires specific parameters such as 'brand', 'keywords', 'year', 'mileage', 'location', 'condition', 'price', 'transmission', 'fuel', 'options', 'extras', 'title', 'vehicle_id', 'vin', 'insurance', 'financing', 'dealer', 'dealer_location', 'dealer_name', 'dealer_phone', 'dealer_email', 'dealer_website', 'dealer_rating', 'dealer_reviews', 'dealer_location_description', 'dealer_address', 'dealer_city', 'dealer_state', 'dealer_zip', 'dealer_country', 'dealer_phone_number', 'dealer_fax', 'dealer_email', 'dealer_website', 'dealer_rating', 'dealer_reviews', 'dealer_location_description', 'dealer_address', 'dealer_city', 'dealer_state', 'dealer_zip', 'dealer_country', 'dealer_phone_number', 'dealer_fax', 'dealer_email', 'dealer_website', 'dealer_rating',

In [26]:
results["benchmark_id"][0]

'ST-External'

In [27]:
unwrapped_flowjudge = defaultdict(list)
unwrapped_glider = defaultdict(list)

for glider, flowjudge, b_id in zip(results["Glider"], results["FlowJudge"], results["benchmark_id"]):
    unwrapped_glider["explanation"].append(glider.explanation)
    unwrapped_glider["score"].append(glider.score)
    unwrapped_glider["b_id"].append(b_id)

    unwrapped_flowjudge["explanation"].append(flowjudge.explanation)
    unwrapped_flowjudge["score"].append(flowjudge.score)
    unwrapped_flowjudge["b_id"].append(b_id)

In [28]:
flowjudge_df = pd.DataFrame(unwrapped_flowjudge)

In [29]:
flowjudge_df

Unnamed: 0,explanation,score,b_id
0,The function call provided in the output does ...,2,ST-External
1,The function call provided in the output does ...,1,ST-External
2,The function call provided in the output does ...,3,ST-External
3,The function call provided in the output does ...,1,ST-External
4,The function call provided in the output does ...,0,ST-External
...,...,...,...
13049,The function call provided in the output does ...,0,ir-ST-Perfect
13050,The function call provided in the output does ...,1,ir-ST-Perfect
13051,The function call provided in the output does ...,0,ir-ST-Perfect
13052,The function call provided in the output does ...,0,ir-ST-Perfect


# Analyzing the data

In [1]:
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [2]:
flowjudge_data = pd.read_csv("flowjudge_fc_results.csv")

In [3]:
flowjudge_data.b_id.unique()

array(['ST-External', 'ST-Imperfect', 'ST-Perfect', 'ir-ST-External',
       'ir-ST-Imperfect', 'ir-ST-Perfect'], dtype=object)

In [4]:
flowjudge_data[flowjudge_data.b_id == "ST-External"].score.value_counts()

score
1    532
0    343
2    235
3     65
Name: count, dtype: int64

In [9]:
flowjudge_data[flowjudge_data.b_id=="ST-Perfect"].loc[[6528]].explanation[6528]

"The function call provided in the output does not meet the criteria specified in the evaluation rubric. \n\n1. **Parameters**: The function call `SystemTools.SystemSettings.OpenApp` has the necessary parameters (`app_name`) to execute the function call.\n2. **Relevance**: The chosen function call is relevant to the user task of opening the Taobao App.\n3. **Availability**: The function `SystemTools.SystemSettings.OpenApp` is available in the tool list provided in the output.\n\nHowever, the output includes multiple other function calls that are not relevant to the user's task of opening the Taobao App. The presence of these extraneous function calls does not affect the evaluation of the specific function call in question, but it does indicate that the output is not entirely focused on the user's request.\n\nGiven that the function call meets all three criteria, it should be scored accordingly."

In [5]:
glider_data = pd.read_csv("glider_fc_results.csv")

In [6]:
glider_data[glider_data.b_id == "ST-External"].score.value_counts()

score
3.0    492
0.0    255
1.0    234
2.0    174
Name: count, dtype: int64

In [33]:
mapping = {
    "ST-Perfect": True,
    "ir-ST-Perfect": False,
    "ST-Imperfect": False,
    "ir-ST-Imperfect": False,
    "ST-External": False,
    "ir-ST-External": False
}

In [39]:
print(f1_score(flowjudge_data.score.map(lambda x : x>2), flowjudge_data.b_id.map(mapping)))
print(precision_score(flowjudge_data.score.map(lambda x : x>2), flowjudge_data.b_id.map(mapping)))
print(recall_score(flowjudge_data.score.map(lambda x : x>2), flowjudge_data.b_id.map(mapping)))
print(confusion_matrix(flowjudge_data.score.map(lambda x : x>2), flowjudge_data.b_id.map(mapping)))

0.0885483184333759
0.04914933837429111
0.44635193133047213
[[10809  2012]
 [  129   104]]


In [37]:
print(f1_score(glider_data.score.map(lambda x : x>2), glider_data.b_id.map(mapping)))
print(precision_score(glider_data.score.map(lambda x : x>2), glider_data.b_id.map(mapping)))
print(recall_score(glider_data.score.map(lambda x : x>2), flowjudge_data.b_id.map(mapping)))
print(confusion_matrix(flowjudge_data.score.map(lambda x : x>2), flowjudge_data.b_id.map(mapping)))

0.3764379214597382