In [14]:
!git lfs install

Updated Git hooks.
Git LFS initialized.


In [15]:
!cd hermes-function-calling-v1
!git lfs pull

In [81]:
from datasets import load_dataset, Dataset, DatasetDict
from collections import defaultdict
import pandas as pd
import random
import datasets

In [63]:
def analyze_distribution(data: List[Dict]) -> Tuple[Dict, Dict]:
    """Analyze category and subcategory distribution."""
    category_counts = defaultdict(int)
    subcategory_counts = defaultdict(int)
    category_subcategory_counts = defaultdict(lambda: defaultdict(int))
    
    for item in data:
        category = item['category']
        subcategory = item['subcategory']
        
        category_counts[category] += 1
        subcategory_counts[subcategory] += 1
        category_subcategory_counts[category][subcategory] += 1
    
    return dict(category_counts), dict(subcategory_counts), dict(category_subcategory_counts)


In [93]:
# Load each dataset
agentic = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')
singleturn = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-singleturn.json')
func_calling = load_dataset('json', data_files='hermes-function-calling-v1/func-calling.json')
func_single = load_dataset('json', data_files='hermes-function-calling-v1/func-calling-singleturn.json')
glaive_ds = load_dataset('json', data_files='hermes-function-calling-v1/glaive-function-calling-5k.json')

In [64]:
def get_distributions(dataset, name):
    cat_counts = defaultdict(int)
    
    for item in dataset['train']:
        cat = item['category']
        cat_counts[cat] += 1
    
    print(f"\n=== {name} Distribution ===")
    df_cat = pd.DataFrame(list(cat_counts.items()), columns=['Category', 'Count'])
    df_cat['Percentage'] = df_cat['Count'] / df_cat['Count'].sum() * 100
    print(df_cat.sort_values('Count', ascending=False))
    return dict(cat_counts)

In [94]:
print("\nAnalyzing individual datasets:")
agentic_dist = get_distributions(agentic, "Agentic")
singleturn_dist = get_distributions(singleturn, "Singleturn")
func_dist = get_distributions(func_calling, "Function Calling")
func_single_dist = get_distributions(func_single, "Function Calling Singleturn")
glaive_ds_dist = get_distributions(glaive_ds, "Glaive dataset balance")


Analyzing individual datasets:

=== Agentic Distribution ===
                             Category  Count  Percentage
24                        DSPy Agents    457   34.053651
26                         LLM Agents     93    6.929955
8                    Simulacrum Agent     71    5.290611
39                  Instructor Agents     68    5.067064
23                   Agent Frameworks     64    4.769001
11                        JSON Schema     59    4.396423
0                     Simulacra Agent     59    4.396423
10                 Copilot Frameworks     47    3.502235
37                     Autogen Agents     45    3.353204
41                  LlamaIndex Agents     44    3.278689
33                   Langchain Agents     40    2.980626
1                      Embodied Agent     38    2.831595
29            Voyager MineCraft Agent     36    2.682563
32                     Copilot Agents     31    2.309985
35                   Guardrails Agent     28    2.086438
18                   WebBr

## lets start with agentic

In [68]:
# Category mapping for merging
CATEGORY_MAPPING = {
    # Simulacra merging
    'Simulacrum Agent': 'Simulacra Agents',
    'Simulacra Agent': 'Simulacra Agents',
    
    # Outlines merging
    'Outlines Agents': 'Outlines Agents',
    'Outlines Agent': 'Outlines Agents',
    
    # Minecraft merging
    'Minecraft Agent': 'Minecraft Agents',
    'Voyager MineCraft Agent': 'Minecraft Agents',
    
    # Framework merging
    'Agent Frameworks': 'Development Frameworks',
    'Copilot Frameworks': 'Development Frameworks',
    
    # Utility agents merging
    'AI Analysis Agent': 'Utility Agents',
    'Code Analysis Agent': 'Utility Agents',
    'File Management Agent': 'Utility Agents',
    'Utility Function': 'Utility Agents',
    'WebBrowser Agent': 'Utility Agents',
    
    # Data processing merging
    'Data Structures': 'Data Processing Agents',
    'Data Structure': 'Data Processing Agents',
    'Data Compression': 'Data Processing Agents',
    
    # Keep major categories as is
    'DSPy Agents': 'DSPy Agents',
    'LLM Agents': 'LLM Agents',
    'Instructor Agents': 'Instructor Agents',
    'Autogen Agents': 'Autogen Agents',
    'LlamaIndex Agents': 'LlamaIndex Agents',
    'Langchain Agents': 'Langchain Agents',
}

# Default category for any not explicitly mapped
DEFAULT_CATEGORY = 'Other Agents'

def balance_agentic_dataset(target_size=25):
    # Load dataset
    dataset = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')
    
    # Group examples by mapped categories
    category_groups = defaultdict(list)
    for item in dataset['train']:
        original_category = item['category']
        mapped_category = CATEGORY_MAPPING.get(original_category, DEFAULT_CATEGORY)
        category_groups[mapped_category].append(item)
    
    # Print original distribution after mapping
    print("\nOriginal distribution after category mapping:")
    for cat, items in category_groups.items():
        print(f"{cat}: {len(items)}")
    
    # Balance dataset
    balanced_data = []
    for category, items in category_groups.items():
        if len(items) > target_size:
            # Randomly sample target_size items
            sampled_items = random.sample(items, target_size)
            balanced_data.extend(sampled_items)
        else:
            # Keep all items if less than target_size
            balanced_data.extend(items)

In [69]:
DEFAULT_CATEGORY = 'Other Agents'


In [70]:
def balance_agentic_dataset(target_size=25):
    # Load dataset
    dataset = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-agentic.json')
    
    # Group examples by mapped categories
    category_groups = defaultdict(list)
    for item in dataset['train']:
        original_category = item['category']
        mapped_category = CATEGORY_MAPPING.get(original_category, DEFAULT_CATEGORY)
        category_groups[mapped_category].append(item)
    
    # Print original distribution after mapping
    print("\nOriginal distribution after category mapping:")
    for cat, items in category_groups.items():
        print(f"{cat}: {len(items)}")
    
    # Balance dataset
    balanced_data = []
    for category, items in category_groups.items():
        if len(items) > target_size:
            # Randomly sample target_size items
            sampled_items = random.sample(items, target_size)
            balanced_data.extend(sampled_items)
        else:
            # Keep all items if less than target_size
            balanced_data.extend(items)
    
    # Print final distribution
    final_distribution = defaultdict(int)
    for item in balanced_data:
        mapped_category = CATEGORY_MAPPING.get(item['category'], DEFAULT_CATEGORY)
        final_distribution[mapped_category] += 1
    
    print("\nFinal distribution:")
    df_final = pd.DataFrame(list(final_distribution.items()), 
                          columns=['Category', 'Count'])
    df_final['Percentage'] = df_final['Count'] / len(balanced_data) * 100
    print(df_final.sort_values('Count', ascending=False))
    
    print(f"\nOriginal dataset size: {len(dataset['train'])}")
    print(f"Balanced dataset size: {len(balanced_data)}")
    
    return balanced_data


In [71]:
balanced_data = balance_agentic_dataset(25)


Original distribution after category mapping:
Simulacra Agents: 130
Other Agents: 241
Data Processing Agents: 3
Utility Agents: 30
Development Frameworks: 111
Minecraft Agents: 54
Outlines Agents: 26
DSPy Agents: 457
LLM Agents: 93
Langchain Agents: 40
Autogen Agents: 45
Instructor Agents: 68
LlamaIndex Agents: 44

Final distribution:
                  Category  Count  Percentage
0         Simulacra Agents     25    8.250825
1             Other Agents     25    8.250825
3           Utility Agents     25    8.250825
4   Development Frameworks     25    8.250825
5         Minecraft Agents     25    8.250825
9         Langchain Agents     25    8.250825
6          Outlines Agents     25    8.250825
7              DSPy Agents     25    8.250825
8               LLM Agents     25    8.250825
11       Instructor Agents     25    8.250825
10          Autogen Agents     25    8.250825
12       LlamaIndex Agents     25    8.250825
2   Data Processing Agents      3    0.990099

Original dataset 

In [82]:
balanced_dataset = Dataset.from_list(balanced_data)

In [83]:
dataset_dict = DatasetDict({
        'train': balanced_dataset
    })

In [84]:
dataset_dict.save_to_disk("balanced-json-modeagentic")

Saving the dataset (0/1 shards):   0%|          | 0/303 [00:00<?, ? examples/s]

In [86]:
dataset_dict['train']

Dataset({
    features: ['id', 'conversations', 'category', 'subcategory', 'schema'],
    num_rows: 303
})

## Func-Calling

In [87]:
def balance_func_dataset(target_size=25):
    # Load dataset
    dataset = load_dataset('json', data_files='hermes-function-calling-v1/func-calling.json')
    
    # Group examples by category
    category_groups = defaultdict(list)
    for item in dataset['train']:
        category_groups[item['category']].append(item)
    
    # Print original distribution
    print("\nOriginal distribution:")
    for cat, items in category_groups.items():
        print(f"{cat}: {len(items)}")
    
    # Balance dataset - cap at target_size but keep smaller categories as is
    balanced_data = []
    for category, items in category_groups.items():
        if len(items) > target_size:
            # Randomly sample target_size items
            sampled_items = random.sample(items, target_size)
            balanced_data.extend(sampled_items)
        else:
            # Keep all items if less than target_size
            balanced_data.extend(items)
    
    # Print final distribution
    final_distribution = defaultdict(int)
    for item in balanced_data:
        final_distribution[item['category']] += 1
    
    print("\nFinal distribution:")
    df_final = pd.DataFrame(list(final_distribution.items()), 
                          columns=['Category', 'Count'])
    df_final['Percentage'] = df_final['Count'] / len(balanced_data) * 100
    print(df_final.sort_values('Count', ascending=False))
    
    print(f"\nOriginal dataset size: {len(dataset['train'])}")
    print(f"Balanced dataset size: {len(balanced_data)}")
    
    return balanced_data

In [88]:
def save_as_hf_dataset(balanced_data, output_path='balanced_func_calling'):
    """Save the balanced dataset as a Hugging Face dataset."""
    # Convert to Dataset format
    balanced_dataset = Dataset.from_list(balanced_data)
    
    # Create DatasetDict with train split
    dataset_dict = DatasetDict({
        'train': balanced_dataset
    })
    
    # Save dataset
    dataset_dict.save_to_disk(output_path)
    print(f"\nSaved balanced dataset to {output_path}")
    return dataset_dict

# Run the balancing and save
balanced_data = balance_func_dataset(25)
dataset_dict = save_as_hf_dataset(balanced_data)


Original distribution:
IoT and Home Automation: 15
Quantum Computing: 8
Services Industry Software: 17
IoT Platforms: 8
Communication Services Software: 37
Materials Software: 9
E-commerce Platforms: 23
Blockchain Integration: 5
Data Centers and High Performance Computing: 5
Information Extraction: 796
Data Privacy: 10
Annotation: 4
Model APIs: 23
Project Management: 3
Financial Services Apps: 29
Decentralized Apps (DApps): 3
Use Apps: 37
Office Administration: 4
Document Ranking: 1
CRM: 3
Marketing: 1
Code Interpreters: 9
Algorithmic Trading: 39
Energy Software: 9
Utilities Software: 52
Information Technology Software: 35
Data Processing: 19
Database Management: 10
Operating System Functions: 14
Text Classification: 3
Robotics and Automation: 8
Networking and Cybersecurity: 6
Consumer Discretionary Software: 25
Named Entity Recognition: 8
Information Retrieval (RAG): 20
Productivity Tools Integration: 8
Web APIs: 9
Low-Code Enterprise Platforms: 19
Robotic Process Automation (RPA): 5

Saving the dataset (0/1 shards):   0%|          | 0/819 [00:00<?, ? examples/s]


Saved balanced dataset to balanced_func_calling


## Merge both Single-Turn

In [91]:
from datasets import load_dataset, Dataset, DatasetDict
import random
from collections import defaultdict
import pandas as pd
import math

def downsample_and_tag_dataset(dataset, source_name, target_total=150):
    # Group examples by category
    category_groups = defaultdict(list)
    for item in dataset['train']:
        category_groups[item['category']].append(item)
    
    num_categories = len(category_groups)
    # Calculate samples per category to achieve target total
    samples_per_category = max(1, math.floor(target_total / num_categories))
    
    print(f"\n{source_name}:")
    print(f"Number of categories: {num_categories}")
    print(f"Samples per category: {samples_per_category}")
    
    # Balance dataset
    balanced_data = []
    for category, items in category_groups.items():
        if len(items) > samples_per_category:
            sampled_items = random.sample(items, samples_per_category)
            balanced_data.extend(sampled_items)
        else:
            # For categories with fewer examples than target, keep all
            balanced_data.extend(items)
    
    # Add source tag to each example
    for item in balanced_data:
        item['dataset_source'] = source_name
    
    # Print distribution
    final_distribution = defaultdict(int)
    for item in balanced_data:
        final_distribution[item['category']] += 1
    
    print(f"\nDistribution for {source_name}:")
    df_final = pd.DataFrame(list(final_distribution.items()), 
                          columns=['Category', 'Count'])
    df_final['Percentage'] = df_final['Count'] / len(balanced_data) * 100
    print(df_final.sort_values('Count', ascending=False))
    
    print(f"\n{source_name} final size: {len(balanced_data)}")
    return balanced_data

def merge_and_save_datasets(target_per_dataset=150):
    # Load datasets
    func_single = load_dataset('json', data_files='hermes-function-calling-v1/func-calling-singleturn.json')
    json_single = load_dataset('json', data_files='hermes-function-calling-v1/json-mode-singleturn.json')
    
    # Print original sizes
    print(f"Original func_single size: {len(func_single['train'])}")
    print(f"Original json_single size: {len(json_single['train'])}")
    
    # Downsample and tag each dataset
    func_balanced = downsample_and_tag_dataset(func_single, 'func_calling_singleturn', target_per_dataset)
    json_balanced = downsample_and_tag_dataset(json_single, 'json_mode_singleturn', target_per_dataset)
    
    # Merge datasets
    merged_data = func_balanced + json_balanced
    
    # Create and save merged dataset
    merged_dataset = Dataset.from_list(merged_data)
    dataset_dict = DatasetDict({
        'train': merged_dataset
    })
    
    # Print final statistics
    print("\nFinal merged dataset statistics:")
    print(f"Total examples: {len(merged_data)}")
    print(f"From func_calling_singleturn: {len(func_balanced)}")
    print(f"From json_mode_singleturn: {len(json_balanced)}")
    
    # Save dataset
    output_path = 'balanced_singleturn_merged'
    dataset_dict.save_to_disk(output_path)
    print(f"\nSaved merged dataset to {output_path}")
    
    return dataset_dict

# Run the merging process
merged_dataset = merge_and_save_datasets(150)

Original func_single size: 1893
Original json_single size: 1241

func_calling_singleturn:
Number of categories: 63
Samples per category: 2

Distribution for func_calling_singleturn:
                                Category  Count  Percentage
0                IoT and Home Automation      2    1.639344
1                      Quantum Computing      2    1.639344
2             Services Industry Software      2    1.639344
3                          IoT Platforms      2    1.639344
4        Communication Services Software      2    1.639344
..                                   ...    ...         ...
58  Identity and Access Management (IAM)      2    1.639344
18                      Document Ranking      1    0.819672
20                             Marketing      1    0.819672
45                 Business Intelligence      1    0.819672
50                       Topic Modelling      1    0.819672

[63 rows x 3 columns]

func_calling_singleturn final size: 122

json_mode_singleturn:
Number of c

Saving the dataset (0/1 shards):   0%|          | 0/260 [00:00<?, ? examples/s]


Saved merged dataset to balanced_singleturn_merged


In [92]:
merged_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'conversations', 'category', 'subcategory', 'task', 'dataset_source'],
        num_rows: 260
    })
})

## Glaive

In [95]:
def create_category_mapping():
    """Create mapping from original categories to new merged categories."""
    return {
        # Technology & Computing
        'Technology': 'tech_computing',
        'Programming Concepts': 'tech_computing',
        'Programming and Computer Science Questions': 'tech_computing',
        'Web Development and Design': 'tech_computing',
        'Database and SQL': 'tech_computing',
        'Swift Programming': 'tech_computing',
        'Cybersecurity and Encryption': 'tech_computing',
        
        # Data & Analytics
        'Data Science': 'data_analytics',
        'Data Analysis and Programming': 'data_analytics',
        'Machine Learning': 'data_analytics',
        'Natural Language Processing': 'data_analytics',
        
        # Finance & Business
        'Stocks and Orders': 'finance_business',
        'Loan and Financial Calculations': 'finance_business',
        'Finance & Economics': 'finance_business',
        'Business Strategies': 'finance_business',
        
        # Science & Education
        'Science Education': 'science_education',
        'Science and Nature Exploration': 'science_education',
        'Quantum Physics': 'science_education',
        'Climate and Environmental Solutions': 'science_education',
        
        # Services & Productivity
        'Flight Services': 'services_productivity',
        'Location Services': 'services_productivity',
        'Productivity': 'services_productivity',
        'Request Management': 'services_productivity',
        
        # Knowledge & Culture
        'History and Culture': 'knowledge_culture',
        'Book Search': 'knowledge_culture',
        'Literary Analysis': 'knowledge_culture',
        'Language and Linguistics': 'knowledge_culture',
        'Language and Logic': 'knowledge_culture'
    }

def balance_dataset(dataset: List[Dict], target_size: int = 500) -> List[Dict]:
    """
    Balance the dataset by:
    1. Merging categories
    2. Downsampling to achieve balanced distribution
    3. Ensuring total dataset size meets target
    """
    category_mapping = create_category_mapping()
    
    # Group data by new categories
    new_categories = defaultdict(list)
    for item in dataset:
        if item['category'] in category_mapping:
            new_cat = category_mapping[item['category']]
            new_categories[new_cat].append(item)
    
    # Calculate target size per category
    n_categories = len(new_categories)
    target_per_category = target_size // n_categories
    
    # Balance categories
    balanced_data = []
    for category, items in new_categories.items():
        # Downsample if necessary
        if len(items) > target_per_category:
            sampled_items = random.sample(items, target_per_category)
        else:
            # If we have fewer items than target, use all of them
            sampled_items = items
            
        # Update category name in items
        for item in sampled_items:
            item['category'] = category
            balanced_data.append(item)
    
    return balanced_data

def analyze_balanced_dataset(balanced_data: List[Dict]):
    """Print distribution of balanced dataset."""
    category_counts = defaultdict(int)
    for item in balanced_data:
        category_counts[item['category']] += 1
    
    print("\n=== Balanced Dataset Distribution ===")
    for category, count in sorted(category_counts.items()):
        print(f"{category}: {count}")
    print(f"\nTotal samples: {sum(category_counts.values())}")

In [96]:
balanced_data = balance_dataset(glaive_ds['train'])

In [97]:
analyze_balanced_dataset(balanced_data)


=== Balanced Dataset Distribution ===
data_analytics: 83
finance_business: 83
knowledge_culture: 83
science_education: 83
services_productivity: 83
tech_computing: 83

Total samples: 498


In [99]:
balanced_data[24]

{'id': '15d547ad-bc66-4b75-8d07-3258b424c023',
 'conversations': [{'from': 'system',
   'value': "You are a function calling AI model. You are provided with function signatures within <tools></tools> XML tags.You may call one or more functions to assist with the user query. Don't make assumptions about what values to plug into functions.Here are the available tools:<tools> [{'type': 'function', 'function': {'name': 'get_stock_price', 'description': 'Get the current stock price of a company', 'parameters': {'type': 'object', 'properties': {'company': {'type': 'string', 'description': 'The name of the company'}, 'symbol': {'type': 'string', 'description': 'The stock symbol of the company'}}, 'required': ['company', 'symbol']}}}, {'type': 'function', 'function': {'name': 'search_movies', 'description': 'Search for movies based on title or genre', 'parameters': {'type': 'object', 'properties': {'title': {'type': 'string', 'description': 'The title of the movie'}, 'genre': {'type': 'string'

In [100]:
balanced_dataset = datasets.Dataset.from_list(balanced_data)

In [101]:
dataset_dict = datasets.DatasetDict({
        'train': balanced_dataset
    })

In [103]:
dataset_dict.save_to_disk("glaive-balanced")

Saving the dataset (0/1 shards):   0%|          | 0/498 [00:00<?, ? examples/s]

In [104]:
#fin

In [105]:
!ls

architecture_diagram.png    hermes-function-calling-v1
balanced_func_calling	    llama-official-user-prompt-3.1-with-CoT
balanced-json-modeagentic   llama-user-prompt
balanced_singleturn_merged  llama-user-prompt-3.1
bfcl			    LOG_GUIDE.md
bfcl.egg-info		    Notebooks
CHANGELOG.md		    openfunctions_evaluation.py
CONTRIBUTING.md		    Pre-Process-Downsampled-Nous.ipynb
conversation_patterns.csv   pyproject.toml
CoT-ToolAce.ipynb	    qwen-re-run
data			    README.md
data_live.csv		    re-run-70B-FC
data_multi_turn.csv	    re-score
data_non_live.csv	    result
data_overall.csv	    score
dataset_info.json	    state.json
Detailed-EDA-ToolACE.ipynb  SUPPORTED_MODELS.md
Detailed-EDA-XLAM.ipynb     test_case_ids_to_generate.json
Down-sample-Nous.ipynb	    TEST_CATEGORIES.md
ft-data			    transformed_toolace
glaive-balanced		    utils
