# OpenFunctions Train Dataset Exploration

Quick look at the Gorilla OpenFunctions train set: basic stats, tool distribution, and parameter shapes.

In [1]:
import ast
import sys
from pathlib import Path

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

sys.path.insert(0, str(Path("..").resolve()))

from src.environment import DATASET_PATHS
from src.utils import read_records

sns.set_theme(style='whitegrid')

DATA_PATH = Path(DATASET_PATHS[0])


In [3]:
data = list(read_records(DATA_PATH))


In [4]:
from pprint import pprint

first_funcs = data[0].get('Functions', [])
print('functions_in_first_record:', len(first_funcs))
pprint(first_funcs[0])


functions_in_first_record: 5
("{'name': 'Torch', 'api_name': 'torch.linspace', 'description': 'Create a "
 "one-dimensional tensor with evenly spaced values', 'parameters': {'start': "
 "{'type': 'float', 'description': 'The starting value for the set of "
 "points'}, 'end': {'type': 'float', 'description': 'The ending value for the "
 "set of points'}, 'steps': {'type': 'int', 'description': 'The number of "
 "evenly spaced values to generate'}, 'out': {'type': 'Tensor', 'description': "
 "'Optional output tensor'}, 'dtype': {'type': 'torch.dtype', 'description': "
 "'Optional data type for the computation'}, 'layout': {'type': "
 "'torch.layout', 'description': 'Optional layout of the returned tensor'}, "
 "'device': {'type': 'torch.device', 'description': 'Optional device for the "
 "returned tensor'}, 'requires_grad': {'type': 'bool', 'description': "
 "'Optional flag to enable gradient tracking'}}}\n")


In [5]:
def parse_function(fn_str):
    try:
        fn = ast.literal_eval(fn_str)
    except Exception:
        return None
    name = fn.get('name', '')
    api = fn.get('api_name') or fn.get('api_call') or ''
    params = fn.get('parameters') or {}
    req = params.get('required') or []
    opt = params.get('optional') or []
    if isinstance(params, dict) and not req and not opt:
        req = []
        for k, v in params.items():
            entry = {'name': k}
            if isinstance(v, dict):
                entry.update({kk: vv for kk, vv in v.items()})
            req.append(entry)
    param_total = len(req) + len(opt)
    return {
        'name': name,
        'api': api,
        'description': fn.get('description', ''),
        'param_required': len(req),
        'param_optional': len(opt),
        'param_total': param_total,
    }

records = []
for row in data:
    for fn_str in row.get('Functions', []):
        parsed = parse_function(fn_str)
        if parsed:
            records.append(parsed)
functions_df = pd.DataFrame(records)
functions_df.head()


AttributeError: 'list' object has no attribute 'get'

In [None]:
print('functions parsed:', len(functions_df))
print('unique tools (name, api):', functions_df[['name','api']].drop_duplicates().shape[0])
functions_df[['param_total','param_required','param_optional']].describe()


In [None]:
top_tools = (
    functions_df.groupby(['name','api']).size().sort_values(ascending=False).head(15).reset_index(name='count')
)
plt.figure(figsize=(8,6))
sns.barplot(data=top_tools, y='name', x='count')
plt.title('Top tools by frequency (train set)')
plt.xlabel('Count')
plt.ylabel('Tool name')
plt.tight_layout()
plt.show()


In [None]:
plt.figure(figsize=(8,4))
sns.histplot(functions_df['param_total'], bins=30, kde=True)
plt.title('Distribution of total parameter count per tool entry')
plt.xlabel('param_total')
plt.tight_layout()
plt.show()


In [None]:
instruction_lengths = [len(row.get('Instruction','')) for row in data]
plt.figure(figsize=(8,4))
sns.histplot(instruction_lengths, bins=30, kde=True)
plt.title('Instruction length distribution')
plt.xlabel('Number of characters')
plt.tight_layout()
plt.show()
