In [9]:
import json
from tqdm import tqdm

# Transform dataset to an Axolotl-supported format
def convert_to_sharegpt(json_file, output_file, id_prefix, start_id=0):
    """Convert a JSON list of instruction/response entries to ShareGPT-style JSONL.

    Each input entry becomes one JSON object per output line, containing a
    ``conversation_id``, the raw ``instruction``/``response`` pair, a two-turn
    ``conversations`` list (``human`` then ``gpt``), the generation configs and
    the annotation fields copied from the entry.

    Args:
        json_file: Path to a JSON file holding a list of entry dicts.
        output_file: Path of the JSONL file to write (overwritten if present).
        id_prefix: Prefix for ``conversation_id``; it must also occur in every
            entry's ``gen_input_configs['input_generator']``.
        start_id: Offset added to each entry's ``id`` when building
            ``conversation_id``.

    Returns:
        The number of entries converted.

    Raises:
        ValueError: If an entry's input and output generators differ, or the
            input generator does not contain ``id_prefix``.
        KeyError: If an entry lacks one of the required fields.
    """
    # Explicit UTF-8 so the result does not depend on the platform locale.
    with open(json_file, 'r', encoding='utf-8') as file:
        data = json.load(file)

    # Annotation fields copied through unchanged, in output order.
    passthrough_keys = (
        'intent',
        'knowledge',
        'difficulty',
        'difficulty_generator',
        'input_quality',
        'quality_explanation',
        'quality_generator',
        'task_category',
        'other_task_category',
        'task_category_generator',
        'llama_guard_2',
        'instruct_reward',
        'reward_model',
    )

    # Serialize everything before touching the output file, so that a bad
    # entry aborts the conversion without leaving a truncated file behind.
    lines = []
    for entry in data:
        input_generator = entry['gen_input_configs']['input_generator']
        if input_generator != entry['gen_response_configs']['output_generator']:
            raise ValueError("Input and output generators must be the same")
        if id_prefix not in input_generator:
            raise ValueError(f"Input generator must contain {id_prefix}")

        sharegpt_entry = {
            "conversation_id": f"{id_prefix}_{entry['id'] + start_id}",
            "instruction": entry['instruction'],
            "response": entry['response'],
            "conversations": [
                {"from": "human", "value": entry['instruction']},
                {"from": "gpt", "value": entry['response']},
            ],
            # Copy rather than mutate the loaded entry's config dict.
            "gen_input_configs": dict(
                entry['gen_input_configs'],
                extract_input=entry['extract_input'],
            ),
        }
        sharegpt_entry.update((key, entry[key]) for key in passthrough_keys)

        lines.append(json.dumps(sharegpt_entry) + '\n')

    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(lines)

    print(f"Converted {len(data)} entries to {output_file}")
    return len(data)

In [10]:
# Input JSON files to convert (paths are relative to the working directory)
input_files = [
    "Qwen2-72B-Instruct_topp1_temp1_1718659695/Magpie_Qwen2-72B-Instruct_100000_1718659695_ins_res_difficulty_quality_category_safety_reward.json",
    "Qwen2-72B-Instruct_topp1_temp1_1718693980/Magpie_Qwen2-72B-Instruct_100000_1718693980_ins_res_difficulty_quality_category_safety_reward.json",
    "Qwen2-72B-Instruct_topp1_temp1_1718702029/Magpie_Qwen2-72B-Instruct_100000_1718702029_ins_res_difficulty_quality_category_safety_reward.json",
    "Qwen2-72B-Instruct_topp1_temp1.1_1718775995/Magpie_Qwen2-72B-Instruct_100000_1718775995_ins_res_difficulty_quality_category_safety_reward.json",
    "Qwen2-72B-Instruct_topp1_temp1.1_1718776098/Magpie_Qwen2-72B-Instruct_100000_1718776098_ins_res_difficulty_quality_category_safety_reward.json",
    "Qwen2-72B-Instruct_topp1_temp1.1_1718818690/Magpie_Qwen2-72B-Instruct_100000_1718818690_ins_res_difficulty_quality_category_safety_reward.json",
    "Qwen2-72B-Instruct_topp1_temp1.1_1718868461/Magpie_Qwen2-72B-Instruct_100000_1718868461_ins_res_difficulty_quality_category_safety_reward.json",
    "Qwen2-72B-Instruct_topp0.99_temp1.2_1718868635/Magpie_Qwen2-72B-Instruct_100000_1718868635_ins_res_difficulty_quality_category_safety_reward.json",
    "Qwen2-72B-Instruct_topp0.99_temp1.2_1718897681/Magpie_Qwen2-72B-Instruct_100000_1718897681_ins_res_difficulty_quality_category_safety_reward.json",
    "Qwen2-72B-Instruct_topp0.99_temp1.2_1718950026/Magpie_Qwen2-72B-Instruct_100000_1718950026_ins_res_difficulty_quality_category_safety_reward.json",
]

# Convert every input file into its own shard. `idx` is the running count of
# entries converted so far and is passed on as the ID offset for the next shard.
id_prefix = "Qwen2-72B-Instruct"
idx = 0
converted_files = []
for i, source_path in enumerate(tqdm(input_files)):
    converted_file_name = f"{id_prefix}_sharegpt_shard{i}.jsonl"
    idx += convert_to_sharegpt(source_path, converted_file_name, id_prefix, idx)
    converted_files.append(converted_file_name)

# Stitch the shards together, in order, into a single JSONL file
output_file = f"{id_prefix}_sharegpt.jsonl"
with open(output_file, 'w') as outfile:
    for shard_path in tqdm(converted_files):
        with open(shard_path) as shard:
            outfile.writelines(shard)

print(f"Concatenated {len(converted_files)} files to {output_file}")

  0%|          | 0/10 [00:00<?, ?it/s]

 10%|█         | 1/10 [00:05<00:48,  5.35s/it]

Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard0.jsonl


 20%|██        | 2/10 [00:10<00:42,  5.32s/it]

Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard1.jsonl


 30%|███       | 3/10 [00:15<00:36,  5.23s/it]

Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard2.jsonl


 40%|████      | 4/10 [00:20<00:30,  5.16s/it]

Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard3.jsonl


 50%|█████     | 5/10 [00:26<00:26,  5.23s/it]

Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard4.jsonl


 60%|██████    | 6/10 [00:31<00:20,  5.18s/it]

Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard5.jsonl


 70%|███████   | 7/10 [00:36<00:15,  5.21s/it]

Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard6.jsonl


 80%|████████  | 8/10 [00:41<00:10,  5.23s/it]

Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard7.jsonl


 90%|█████████ | 9/10 [00:47<00:05,  5.23s/it]

Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard8.jsonl


100%|██████████| 10/10 [00:52<00:00,  5.23s/it]


Converted 100000 entries to Qwen2-72B-Instruct_sharegpt_shard9.jsonl


100%|██████████| 10/10 [00:07<00:00,  1.27it/s]

Concatenated 10 files to Qwen2-72B-Instruct_sharegpt.jsonl



