# Setup

Importing libraries, setting up the data to use for the experiment, and crafting the prompts that will be used in the experiment.

In [8]:
import pandas as pd
from any_guardrail import AnyGuardrail, GuardrailName
import huggingface_hub
import json
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix
from collections import defaultdict
from utils import execute_dual_input_experiment

In [2]:
hammerbench = pd.read_csv("data/hammerbench_singleturn.csv")

In [3]:
hammerbench.head()

Unnamed: 0,label,messages,tools
0,ST-External,"[{""role"": ""user"", ""content"": ""EK:Dear Driver, ...","[{""name"": ""OfficeSoftware.Documents.getDocumen..."
1,ST-External,"[{""role"": ""user"", ""content"": ""EK:Subject: Noti...","[{""name"": ""Finance.Insurance.getInsuranceClaim..."
2,ST-External,"[{""role"": ""user"", ""content"": ""EK:Hello, your v...","[{""name"": ""OnlineShopping.ExpressDelivery.sear..."
3,ST-External,"[{""role"": ""user"", ""content"": ""EK:The day befor...","[{""name"": ""Navigation.TrafficViolations.viewVi..."
4,ST-External,"[{""role"": ""user"", ""content"": ""EK:I saw a Yue B...","[{""name"": ""Travel.ItineraryServices.shareTrave..."


In [5]:
name = "Function Calling Judge"
criteria = """
Based on the user query and the response provided by the LLM, does the function call json meet the following properties:

1. It provides enough parameters to execute the function call based on the tool list
2. The chosen function call json is relevant to the user's ask.
3. The chosen function exists in the tool list.

The function call must meet all three properties in order to be successful. Here are examples to help guide your judgements:

<EXAMPLE1>

user conversation with function call json = '[{"role": "user", "content": "In the SF Express app, list all the saved addresses in my address book."}, {"role": "function call", "content": {"name": "OnlineShopping.ExpressDelivery.getAddressList", "arguments": {}}}]'

tool list = '[{"name": "OfficeSoftware.Email.getCollectedMails", "description": "View all saved messages", "parameters": {"type": "object", "properties": {}, "required": []}}, {"name": "Navigation.MapNavigation.viewCommonAddresses", "description": "View all currently logged Primary Addresses", "parameters": {"type": "object", "properties": {}, "required": []}}, {"name": "OnlineShopping.ExpressDelivery.getAddressList", "description": "Get all saved addresses in my address book", "parameters": {"type": "object", "properties": {}, "required": []}}, {"name": "Navigation.MapNavigation.viewCollectLocation", "description": "View a list of favorite places", "parameters": {"type": "object", "properties": {}, "required": []}}, {"name": "OnlineShopping.Mall.viewDeliveryAddresses", "description": "View User Saved Shipping Addresses", "parameters": {"type": "object", "properties": {}, "required": []}}]'

This is an example of a perfect function call. This would receive a 2 in the rubric.

<EXAMPLE1/>

<EXAMPLE2>

user conversation with function call json = '[{"role": "user", "content": "Take a look at your diet"}, {"role": "function call", "content": {"name": "Health.HealthManagement.getDietHistory", "arguments": {}}}]'

tool list = '[{"name": "Education.ChildrenEducation.getChildLearningProgress", "description": "Get your child\'s progress", "parameters": {"type": "object", "properties": {}, "required": []}}, {"name": "UtilityTools.Memo.viewUpcoming", "description": "View current to-do", "parameters": {"type": "object", "properties": {}, "required": []}}, {"name": "Health.HealthManagement.getDietHistory", "description": "Get dietary history based on timeframe, etc.", "parameters": {"type": "object", "properties": {"time": {"type": "string", "description": "Time range"}}, "required": ["time"]}}, {"name": "Photography.PhotoEditing.viewPicture", "description": "View photo", "parameters": {"type": "object", "properties": {"name_or_path": {"type": "string", "description": "Photo name or path"}}, "required": ["name_or_path"]}}, {"name": "Finance.Stocks.viewStockstrategy", "description": "View investment playbook", "parameters": {"type": "object", "properties": {}, "required": []}}]'

This is an example of an imperfect function call, where the required parameter 'time' was not provided. This is an example of a 1 in the rubric.

<EXAMPLE2/>

<EXAMPLE3>

user conversation with function call json = '[{"role": "user", "content": "Help me find out if there is a song list called \\"Weekend Relaxation\\", I want to pay attention to it."}, {"role": "function call", "content": {"name": "MusicRadio.OnlineMusic.followPlaylist", "arguments": {"name": "Weekend Relaxation"}}}]'

tool list = '[{"name": "Navigation.Bicycles.searchBikesOrders", "description": "See a list of bike rides", "parameters": {"type": "object", "properties": {}, "required": []}}]'

This is an example where the function call does not exist in the tool list. This would get a 0 in the rubric.

<EXAMPLE3/>

"""

rubric = {
    0: "The function call does not have enough parameters or it it is not relevant or it is does not exist in the tools available to the agent",
    1: "The function call meets some of the criteria: has enough parameters or has a call that is relevant to the task or the chosen function is in the tool list.",
    2: "The function call meets all the criteria: : has enough parameters and has a call that is relevant to the task and the chosen function is in the tool list.",
}

required_inputs = ['conversation']
required_output = "tool list"

flowjudge = AnyGuardrail.create(GuardrailName.FLOWJUDGE, name=name, criteria=criteria, rubric=rubric, required_inputs=required_inputs, required_output=required_output)

INFO:flow_judge.models.huggingface:Downloading the model from Hugging Face Hub using hf-transferfor faster downloads...


Fetching 12 files:   0%|          | 0/12 [00:00<?, ?it/s]

INFO:accelerate.utils.modeling:We will use 90% of the memory on device 0 for storing the model, and 10% for the buffer to avoid OOM. You can set `max_memory` in to a higher value to use more memory (at your own risk).


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [6]:
# GLIDER does not seem to play well with few shot prompting, so it was removed for these experiments.

pass_criteria = """
Based on the INPUT and the OUTPUT provided by the LLM, does the function call meet the following properties:

1. It provides enough parameters to execute the function call.
2. The chosen function call is relevant to the user task.
3. The chosen function is an available function for this task.

The function call must meet all three properties in order to be successful.
"""

rubric = """
0: The function call does not have enough parameters or it it is not relevant or it is does not exist in the tools available to the OUTPUT
1: The function call meets one of the criteria: has enough parameters or has a call that is relevant to the task or the chosen function is in the OUTPUT.
2: The function call meets all of the criteria: has enough parameters and has a call that is relevant to the task and the chosen function is in the OUTPUT.
"""

glider = AnyGuardrail.create(GuardrailName.GLIDER, pass_criteria=pass_criteria, rubric=rubric)

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Device set to use cuda:0


In [7]:
# GLIDER results were created in a prior experiment run

results = {}
for i in range(3):
    results["run_{}".format(i)] = execute_dual_input_experiment(flowjudge, "flowjudge_run_{}".format(i), hammerbench, "hammerbench", [["messages", "tools"]], "label", "conversation", "tool list")

The function call provided in the output does not meet the criteria specified in the evaluation rubric. Let's break it down:

1. **Sufficient Parameters**: The function call "Navigation.Bicycles.submitBikeFault" requires a parameter "number" which is provided in the query ("GX-2023-0325"). This criterion is met.

2. **Relevance**: The user query is about a car with a specific license plate number and mentions a leaking tire. The function call is for a bicycle malfunction, which is not relevant to the user's query about a car. This criterion is not met.

3. **Existence in Tool List**: The function "Navigation.Bicycles.submitBikeFault" does exist in the tool list. This criterion is met.

Since the function call is not relevant to the user's query, it does not meet all the criteria. Therefore, the output should be scored as 1, indicating that it meets some of the criteria but not all.

<score>
1
</score>. Error: Failed to parse evaluation response.
The function call provided in the output

53850.182049


The function call JSON provided does not meet the criteria specified in the evaluation rubric. 

1. **Sufficient Parameters**: The function call JSON does not provide enough parameters to execute the function. The required parameters for the "Navigation.TaxiCarRental.createCarRentalOrder" function are "location", "start_time", and "end_time". However, the function call JSON only includes "location" and "start_time", but not "end_time".

2. **Relevance**: The function call is relevant to the user's ask. The user wants to book a car from May 20th to May 22nd at the Shanghai Hongqiao Railway Station, which is the information provided in the function call.

3. **Existence in Tool List**: The chosen function exists in the tool list. The "Navigation.TaxiCarRental.createCarRentalOrder" function is listed in the tool list.

Since the function call does not meet the first criterion (sufficient parameters), it does not meet all the criteria. Therefore, it cannot receive a score of 2. The closest

54330.477949


The function call provided in the output does not meet the criteria specified in the evaluation rubric. 

1. **Sufficient Parameters**: The chosen function call, "ConvenientLiving.GovernmentServices.queryBusinessLicenseInfo", does have the necessary parameters to execute the function call. The parameters required are "enterprise_name" and "enterprise_name" is indeed provided.

2. **Relevance**: The function call is relevant to the user's query. The user asked for information about the business license of the company mentioned in the email, which is the Blue Ocean Group. The function call is designed to get information about a business license, which is directly relevant to the user's query.

3. **Existence in Tool List**: The chosen function call exists in the tool list. It is listed among the available tools and has the correct name and description.

However, the function call meets all the criteria, as it has enough parameters, is relevant, and exists in the tool list. Therefore, it 

54578.144142


# Analyzing the Zero Shot Data

In [1]:
import pandas as pd
from sklearn.metrics import f1_score, precision_score, recall_score, confusion_matrix

In [2]:
flowjudge_data = pd.read_csv("data/flowjudge_fc_results.csv")

In [3]:
glider_data = pd.read_csv("data/glider_fc_results.csv")

In [4]:
mapping = {
    "ST-Perfect": True,
    "ir-ST-Perfect": False,
    "ST-Imperfect": False,
    "ir-ST-Imperfect": False,
    "ST-External": False,
    "ir-ST-External": False
}

In [5]:
print(f1_score(flowjudge_data.score.map(lambda x : x>2), flowjudge_data.b_id.map(mapping)))
print(precision_score(flowjudge_data.score.map(lambda x : x>2), flowjudge_data.b_id.map(mapping)))
print(recall_score(flowjudge_data.score.map(lambda x : x>2), flowjudge_data.b_id.map(mapping)))
print(confusion_matrix(flowjudge_data.score.map(lambda x : x>2), flowjudge_data.b_id.map(mapping)))

0.0885483184333759
0.04914933837429111
0.44635193133047213
[[10809  2012]
 [  129   104]]


In [6]:
print(f1_score(glider_data.score.map(lambda x : x>2), glider_data.b_id.map(mapping)))
print(precision_score(glider_data.score.map(lambda x : x>2), glider_data.b_id.map(mapping)))
print(recall_score(glider_data.score.map(lambda x : x>2), glider_data.b_id.map(mapping)))
print(confusion_matrix(glider_data.score.map(lambda x : x>2), glider_data.b_id.map(mapping)))

0.3764379214597382
0.44848771266540643
0.3243335611756664
[[8961 1167]
 [1977  949]]
