In [1]:
from datasets import load_dataset, load_from_disk, concatenate_datasets
from datasets import Dataset, DatasetDict
import json
from tqdm import tqdm

In [96]:
def find_consecutive_indices(dataset):
    indices_to_drop = set()
    num_consecutive = 0
    num_missing_assistant = 0
    
    for idx in tqdm(range(len(dataset))):
        conversations = dataset[idx]['cot_conversations']
        
        # Check for consecutive assistant messages
        for i in range(len(conversations) - 1):
            current = conversations[i]['from']
            next_msg = conversations[i + 1]['from']
            
            if ((current in ['gpt', 'assistant']) and 
                (next_msg in ['gpt', 'assistant'])):
                num_consecutive += 1
                indices_to_drop.add(idx)
                break
        
        # Check for tool messages without preceding assistant
        for i in range(len(conversations)):
            current = conversations[i]
            if current['from'] in ['tool', 'ipython']:
                if i == 0 or conversations[i-1]['from'] not in ['gpt', 'assistant']:
                    num_missing_assistant += 1
                    indices_to_drop.add(idx)
                    break
    
    return list(indices_to_drop), num_consecutive, num_missing_assistant

100%|███████████████████████████████████████████████████████████████████████████| 11300/11300 [00:00<00:00, 20878.02it/s]


Found 45 instances of consecutive assistant messages
Found 0 instances where tool message wasn't preceded by assistant
Will drop 45 conversations total

Original dataset size: 11300
Cleaned dataset size: 11255





In [88]:
ds = load_dataset("json",data_files="/home/sanyambhutani/task_datasets/1_Downloaded/hermes-function-calling-v1/func-calling.json")

In [89]:
ds_toolace

Dataset({
    features: ['id', 'conversations', 'cot_conversations'],
    num_rows: 11300
})

In [90]:
ds_toolace = load_from_disk("/home/sanyambhutani/task_datasets/3_CoT_added/ToolACE/")

In [91]:
ds_single_turn = load_from_disk("/home/sanyambhutani/task_datasets/3_CoT_added/hermes-function-calling-v1/func-calling-single-turn/")

In [92]:
ds_multi_turn = load_from_disk("/home/sanyambhutani/task_datasets/3_CoT_added/hermes-function-calling-v1/func-calling-multi-turn/")

In [93]:
ds_glaive = load_from_disk("/home/sanyambhutani/task_datasets/3_CoT_added/hermes-function-calling-v1/glaive-function-calling/")

In [94]:
ds_toolace

Dataset({
    features: ['id', 'conversations', 'cot_conversations'],
    num_rows: 11300
})

In [100]:
indices_to_drop, num_consecutive, num_missing_assistant = find_consecutive_indices(ds_toolace)
print(f"\nFound {num_consecutive} instances of consecutive assistant messages")
print(f"Found {num_missing_assistant} instances where tool message wasn't preceded by assistant")
print(f"Will drop {len(indices_to_drop)} conversations total")

# Create new dataset without the problematic conversations
keep_indices = [i for i in range(len(ds_toolace)) if i not in indices_to_drop]
ds_toolace_cleaned = ds_toolace.select(keep_indices)
print(f"\nOriginal dataset size: {len(ds_toolace)}")
print(f"Cleaned dataset size: {len(ds_toolace_cleaned)}")

100%|███████████████████████████████████████████████████████████████████████████| 11300/11300 [00:00<00:00, 19691.10it/s]


Found 45 instances of consecutive assistant messages
Found 0 instances where tool message wasn't preceded by assistant
Will drop 45 conversations total

Original dataset size: 11300
Cleaned dataset size: 11255





In [101]:
indices_to_drop, num_consecutive, num_missing_assistant = find_consecutive_indices(ds_single_turn)
print(f"\nFound {num_consecutive} instances of consecutive assistant messages")
print(f"Found {num_missing_assistant} instances where tool message wasn't preceded by assistant")
print(f"Will drop {len(indices_to_drop)} conversations total")

# Create new dataset without the problematic conversations
keep_indices = [i for i in range(len(ds_single_turn)) if i not in indices_to_drop]
ds_single_turn_cleaned = ds_single_turn.select(keep_indices)
print(f"\nOriginal dataset size: {len(ds_single_turn)}")
print(f"Cleaned dataset size: {len(ds_single_turn_cleaned)}")

100%|███████████████████████████████████████████████████████████████████████████████| 912/912 [00:00<00:00, 19699.99it/s]


Found 5 instances of consecutive assistant messages
Found 0 instances where tool message wasn't preceded by assistant
Will drop 5 conversations total

Original dataset size: 912
Cleaned dataset size: 907





In [102]:
indices_to_drop, num_consecutive, num_missing_assistant = find_consecutive_indices(ds_multi_turn)
print(f"\nFound {num_consecutive} instances of consecutive assistant messages")
print(f"Found {num_missing_assistant} instances where tool message wasn't preceded by assistant")
print(f"Will drop {len(indices_to_drop)} conversations total")

# Create new dataset without the problematic conversations
keep_indices = [i for i in range(len(ds_multi_turn)) if i not in indices_to_drop]
ds_multi_turn_cleaned = ds_multi_turn.select(keep_indices)
print(f"\nOriginal dataset size: {len(ds_multi_turn)}")
print(f"Cleaned dataset size: {len(ds_multi_turn_cleaned)}")

100%|███████████████████████████████████████████████████████████████████████████████| 912/912 [00:00<00:00, 15763.84it/s]


Found 14 instances of consecutive assistant messages
Found 1 instances where tool message wasn't preceded by assistant
Will drop 15 conversations total

Original dataset size: 912
Cleaned dataset size: 897





In [104]:
indices_to_drop, num_consecutive, num_missing_assistant = find_consecutive_indices(ds_glaive)
print(f"\nFound {num_consecutive} instances of consecutive assistant messages")
print(f"Found {num_missing_assistant} instances where tool message wasn't preceded by assistant")
print(f"Will drop {len(indices_to_drop)} conversations total")

# Create new dataset without the problematic conversations
keep_indices = [i for i in range(len(ds_glaive)) if i not in indices_to_drop]
ds_glaive_cleaned = ds_glaive.select(keep_indices)
print(f"\nOriginal dataset size: {len(ds_glaive)}")
print(f"Cleaned dataset size: {len(ds_glaive_cleaned)}")

100%|█████████████████████████████████████████████████████████████████████████████| 2641/2641 [00:00<00:00, 12092.32it/s]


Found 14 instances of consecutive assistant messages
Found 0 instances where tool message wasn't preceded by assistant
Will drop 14 conversations total

Original dataset size: 2641
Cleaned dataset size: 2627





In [105]:
final_ds = concatenate_datasets([ds_glaive_cleaned, ds_multi_turn_cleaned, ds_toolace_cleaned])

In [106]:
final_ds

Dataset({
    features: ['id', 'conversations', 'cot_conversations'],
    num_rows: 14779
})

In [107]:
!ls ../scripts/finetuning

fft.py		 new-toolcall.py  toolcall.py
ft-config.yaml	 prep-for-FT.py   train_data.json
new-config.yaml  __pycache__	  train_final_mix.json


In [108]:
final_ds.to_json("../scripts/finetuning/train_final_mix.json")

Creating json from Arrow format:   0%|          | 0/15 [00:00<?, ?ba/s]

112301178

In [109]:
ds = load_dataset("json",data_files="../scripts/finetuning/train_final_mix.json")

Generating train split: 0 examples [00:00, ? examples/s]

In [110]:
ds

DatasetDict({
    train: Dataset({
        features: ['id', 'conversations', 'cot_conversations'],
        num_rows: 14779
    })
})

In [71]:
ds['train']['cot_conversations'][0]

[{'from': 'system',
  'value': 'You are an expert in composing functions. You are given a question and a set of possible functions.\nBased on the question, you will need to make one or more function/tool calls to achieve the purpose.\nIf none of the function can be used, point it out. If the given question lacks the parameters required by the function,\nalso point it out. You should only return the function call in tools call sections.\n\nIf you decide to invoke any of the function(s), you MUST put it in the format of [func_name1(params_name1=params_value1, params_name2=params_value2...), func_name2(params)]\n\nHere is a list of functions in JSON format that you can invoke[{"type": "function", "function": {"name": "get_stock_price", "description": "Get the current stock price", "parameters": {"type": "object", "properties": {"stock_symbol": {"type": "string", "description": "The symbol of the stock"}}, "required": ["stock_symbol"]}}}, {"type": "function", "function": {"name": "create_t

In [333]:
def save_dataset_to_json(dataset, split, output_file):
    data_list = [dict(item) for item in dataset[split]]
    with open(output_file, 'w', encoding='utf-8') as f:
        json.dump(data_list, f, ensure_ascii=False, indent=2)

save_dataset_to_json(final_ds, 'cot_conversations', '/home/sanyambhutani/task_datasets/output.json')

In [334]:
ds = load_dataset("json",data_files="/home/sanyambhutani/task_datasets/")

Generating train split: 0 examples [00:00, ? examples/s]

In [335]:
ds

DatasetDict({
    train: Dataset({
        features: ['from'],
        num_rows: 15697
    })
})

In [72]:
def check_consecutive_gpt(conversations):
    consecutive_count = 0
    for i in range(len(conversations) - 1):
        if conversations[i]['from'] == 'gpt' and conversations[i + 1]['from'] == 'gpt':
            consecutive_count += 1
    return consecutive_count

# Count across all examples
total_consecutive = 0
examples_with_consecutive = 0

for example in ds['train']['cot_conversations']:
    count = check_consecutive_gpt(example)
    if count > 0:
        examples_with_consecutive += 1
        total_consecutive += count

print(f"Total number of consecutive gpt->gpt pairs: {total_consecutive}")
print(f"Number of examples with consecutive gpt messages: {examples_with_consecutive}")
print(f"Percentage of examples with consecutive gpt messages: {(examples_with_consecutive/len(ds['train']))*100:.2f}%")

# Let's also get a specific example to examine
for idx, example in enumerate(ds['train']['cot_conversations']):
    count = check_consecutive_gpt(example)
    if count > 0:
        print(f"\nFirst example found at index {idx}:")
        for i in range(len(example) - 1):
            if example[i]['from'] == 'gpt' and example[i + 1]['from'] == 'gpt':
                print("\nFirst message:", example[i]['value'])
                print("\nSecond message:", example[i + 1]['value'])
        break

Total number of consecutive gpt->gpt pairs: 0
Number of examples with consecutive gpt messages: 0
Percentage of examples with consecutive gpt messages: 0.00%
