# Parallel Function Calling Evaluation on Gorilla BFCL Dataset

**🎯 Goal**:
- Run a function calling evaluation in Okareo using the BFCL v3 data.
- Provide a simple introduction to Okareo evaluations.

**📋 Steps**:
1. Upload a function calling scenario using the BFCL data.
2. Define a custom model to generate the function calls.
3. Run the evaluation using the scenario (from #1) + model (from #2) along with Okareo predefined checks to measure function call accuracy, including:
    - function_call_ast_validator

**Notes**
- We use two datasets from BFCL:
    - ['simple' data](https://github.com/ShishirPatil/gorilla/blob/main/berkeley-function-call-leaderboard/data/BFCL_v3_live_simple.json) (and [ground truth](https://github.com/ShishirPatil/gorilla/blob/main/berkeley-function-call-leaderboard/data/possible_answer/BFCL_v3_live_simple.json)) - a single expected function call.
    - ['parallel' data](https://github.com/ShishirPatil/gorilla/blob/main/berkeley-function-call-leaderboard/data/BFCL_v3_parallel.json) (and [ground truth](https://github.com/ShishirPatil/gorilla/blob/main/berkeley-function-call-leaderboard/data/possible_answer/BFCL_v3_parallel.json)) - more than one expected function call.

In [1]:
import os

# get Okareo client
from okareo import Okareo

# Read the API key from the environment and hand it to the Okareo client.
OKAREO_API_KEY = os.getenv("OKAREO_API_KEY")
okareo = Okareo(OKAREO_API_KEY)

Normally we would use the `datasets` library from Hugging Face to download the data, but there was an issue with the column names.

Instead, we use manually downloaded copies of the files.

In [2]:
import json

# Each entry is laid out as: [scenario key, input file path, ground-truth file path]
simple_data = [
    "simple",
    "./data/BFCL_v3_live_simple.json",
    "./data/possible_answers/BFCL_v3_live_simple.json",
]
parallel_data = [
    "parallel",
    "./data/BFCL_v3_parallel.json",
    "./data/possible_answers/BFCL_v3_parallel.json",
]

# make function names safe to use with the OpenAI API
def make_function_name_safe(text: str) -> str:
    """Return `text` with every "." replaced by "_"."""
    dotless_parts = text.split(".")
    return "_".join(dotless_parts)

seed_data_dict = {}

N = 50 # number of examples per file


def _load_first_records(path, limit):
    """Parse at most `limit` leading lines of the file at `path`, one JSON document per line."""
    records = []
    # explicit encoding so decoding does not depend on the platform default
    with open(path, "r", encoding="utf-8") as jsonl_file:
        # iterate the file lazily; only the first `limit` lines are ever read
        for line_no, line in enumerate(jsonl_file):
            if line_no >= limit:
                break
            records.append(json.loads(line))
    return records


for seed_data_key, input_filename, result_filename in [simple_data, parallel_data]:

    # read the input and result files and store them in dictionaries keyed by example id
    inputs = {}
    for input_ in _load_first_records(input_filename, N):
        # make function names safe to use with OpenAI, which expects only
        # alphanumeric characters, underscores, or hyphens
        function_list = []
        for func_spec in input_["function"]:
            func_spec['name'] = make_function_name_safe(func_spec['name'])
            function_list.append(func_spec)
        inputs[input_["id"]] = {"question": input_["question"], "function": function_list}

    results = {}
    for result in _load_first_records(result_filename, N):
        # each ground-truth entry is a dict keyed by function name; rename the
        # keys the same way as the input function names so the two still match
        function_list = [
            {make_function_name_safe(k): v for k, v in expected_call.items()}
            for expected_call in result["ground_truth"]
        ]
        results[result["id"]] = {"ground_truth": function_list}

    seed_data_dict[seed_data_key] = {
        "inputs": inputs,
        "results": results,
    }

In [3]:
# transform the input/result data into the required scenario format

from okareo_api_client.models.scenario_set_create import ScenarioSetCreate
from okareo_api_client.models.seed_data import SeedData

scenario_dict = {}

for scenario_name, scenario_data in seed_data_dict.items():
    ground_truths = scenario_data["results"]

    # one SeedData row per example id; its result is the list of expected calls
    seed_data = []
    for example_id, example in scenario_data["inputs"].items():
        expected_calls = []
        for expected in ground_truths[example_id]['ground_truth']:
            # the function name is the first key of the ground-truth entry
            func_name = list(expected)[0]
            # drop arguments whose value is empty
            kept_args = {
                arg: value
                for arg, value in expected[func_name].items()
                if len(value) > 0
            }
            expected_calls.append({"name": func_name, "arguments": kept_args})

        row_input = {
            "question": example["question"],
            "function": example["function"],
        }
        seed_data.append(SeedData(input_=row_input, result=expected_calls))

    # upload the rows as a scenario set named after the scenario key
    scenario_request = ScenarioSetCreate(
        name=f"BFCL Gorilla - {scenario_name} Function Call",
        seed_data=seed_data,
    )
    tool_scenario = okareo.create_scenario_set(scenario_request)

    print(tool_scenario.app_link)
    scenario_dict[scenario_name] = tool_scenario

http://localhost:3000/project/39a0bd20-1c72-446e-b414-baef90bffe30/scenario/6befafb5-7fb8-46ab-a580-78364b9d1ae3
http://localhost:3000/project/39a0bd20-1c72-446e-b414-baef90bffe30/scenario/b9f33b87-a96e-45f3-914d-a136cae0f681


In [4]:
# Python-style type names (as used in the function specs) -> JSON Schema type names
py_to_oai_types = {
    "str": "string",
    "int": "integer",
    "float": "number",
    "bool": "boolean",
    "list": "array",
    "dict": "object",
}


def _to_oai_type(type_name):
    """Map a Python-style type name to its JSON Schema name; unknown names pass through unchanged."""
    return py_to_oai_types.get(type_name, type_name)


def parse_input_to_openai_tool(input_function):
    """Convert function specs into the list passed as the OpenAI completion 'tools' arg.

    See: https://platform.openai.com/docs/guides/function-calling

    Args:
        input_function: iterable of dicts, each with "name", "description" and
            "parameters". "parameters" must carry "type" and "properties"; its
            "required" entry is optional and defaults to an empty list.

    Returns:
        A list with one ``{"type": "function", "function": {...}}`` dict per
        input spec, in the same order.

    Raises:
        KeyError: if a spec lacks one of the mandatory keys above, or if a
            parameter that maps to "array" has no ``items["type"]``.
    """
    tools = []
    for tool in input_function:
        params = tool["parameters"]
        parsed_properties = {}
        for p_key, p_dict in params["properties"].items():
            parsed_param = {
                "type": _to_oai_type(p_dict["type"]),
                "description": p_dict.get("description", ""),
            }
            if parsed_param["type"] == "array":
                # only the element type is carried over for array parameters
                parsed_param["items"] = {
                    "type": _to_oai_type(p_dict["items"]["type"]),
                }
            if "default" in p_dict:
                parsed_param["default"] = p_dict["default"]
            if "enum" in p_dict:
                parsed_param["enum"] = p_dict["enum"]
            parsed_properties[p_key] = parsed_param
        tools.append({
            "type": "function",
            "function": {
                "name": tool["name"],
                "description": tool["description"],
                "parameters": {
                    # same pass-through fallback as the per-property types, so a
                    # spec that already says "object" no longer raises KeyError
                    "type": _to_oai_type(params["type"]),
                    "properties": parsed_properties,
                    # a spec without "required" means no parameter is mandatory
                    "required": params.get("required", []),
                }
            }
        })
    return tools

In [5]:
from openai import OpenAI

# Module-level OpenAI client; FunctionCallModel.invoke sends its chat completion requests through it.
# NOTE(review): no API key is passed here — presumably the client picks it up from the environment; confirm.
client = OpenAI()

from okareo.model_under_test import CustomModel, ModelInvocation


class FunctionCallModel(CustomModel):
    """Custom model that asks gpt-4o for tool calls and reports them in a ModelInvocation."""

    def __init__(self, name):
        super().__init__(name)

    def invoke(self, input_value):
        """Send one scenario input through the chat completions API.

        `input_value` is indexed as a mapping: the first element of its
        "question" entry is used as the chat messages, and its "function"
        entry is converted into the `tools` argument.

        Returns a ModelInvocation carrying the messages, the reply's text
        content, and the reply's tool calls with their arguments JSON-decoded.
        """
        chat_messages = input_value['question'][0]
        tool_specs = parse_input_to_openai_tool(input_value["function"])
        response = client.chat.completions.create(
            model="gpt-4o",
            messages=chat_messages,
            tools=tool_specs,
        )
        reply = response.choices[0].message

        # turn each tool call into a dict and decode its JSON-encoded arguments
        parsed_calls = []
        for call in reply.tool_calls or []:
            call_dict = call.to_dict()
            raw_arguments = call_dict['function']['arguments']
            call_dict['function']['arguments'] = json.loads(raw_arguments)
            parsed_calls.append(call_dict)

        return ModelInvocation(
            model_input=chat_messages,
            model_prediction=reply.content,
            tool_calls=parsed_calls,
        )

# Register the model with Okareo so the test run can use it
mut_name = "GPT-4o Function Call Model"
function_call_model = FunctionCallModel(name=FunctionCallModel.__name__)
model_under_test = okareo.register_model(
    name=mut_name,
    model=[function_call_model],
    update=True,
)

In [6]:
# Try the model on a single example before running the full evaluation.
# `seed_data` here is whatever the scenario-creation loop assigned last,
# i.e. the rows of the final scenario it processed.
FunctionCallModel(mut_name).invoke(seed_data[0].input_)

ModelInvocation(model_prediction=None, model_input=[{'role': 'user', 'content': 'Play songs from the artists Taylor Swift and Maroon 5, with a play time of 20 minutes and 15 minutes respectively, on Spotify.'}], model_output_metadata=None, tool_calls=[{'id': 'call_kMLW9zlPwyn1PD8AlJ3HApES', 'function': {'arguments': {'artist': 'Taylor Swift', 'duration': 20}, 'name': 'spotify_play'}, 'type': 'function'}, {'id': 'call_KgtCkx7Bp0AkrTpwtumiHOi4', 'function': {'arguments': {'artist': 'Maroon 5', 'duration': 15}, 'name': 'spotify_play'}, 'type': 'function'}])

In [7]:
# evaluation that uses the scenario, check, and model
from okareo_api_client.models.test_run_type import TestRunType

# Run one evaluation per uploaded scenario and print a link to its results.
for scenario_name, tool_scenario in scenario_dict.items():
    evaluation = model_under_test.run_test(
        name=f"BFCL Gorilla - {scenario_name} Function Call Evaluation",
        scenario=tool_scenario.scenario_id,
        test_run_type=TestRunType.NL_GENERATION,
        checks=["function_call_ast_validator"],
    )
    results_link = evaluation.app_link
    print(f"See results for {scenario_name} scenario in Okareo: {results_link}")

See results for simple scenario in Okareo: http://localhost:3000/project/39a0bd20-1c72-446e-b414-baef90bffe30/eval/391e6f14-507b-443a-a3fb-75bb5152b76f
See results for parallel scenario in Okareo: http://localhost:3000/project/39a0bd20-1c72-446e-b414-baef90bffe30/eval/a18dab18-a432-43f8-9e4c-f2ba2139b88e
