In [1]:
from transformers import AutoTokenizer
from datasets import Dataset, load_dataset

dataset_path = "/media/gronkomatic/Embiggen/ai-stuff/datasets/OpenHermes-2.5/openhermes2_5.json"
new_dataset_path = "/media/gronkomatic/Embiggen/ai-stuff/datasets/OpenHermes-2.5-chatML"
model_path = "/media/gronkomatic/Embiggen/ai-stuff/training-results/runs/run-20240315-211134/checkpoint-70180"

In [2]:
tokenizer = AutoTokenizer.from_pretrained(model_path)
tokenizer.chat_template = "{% if not add_generation_prompt is defined %}{% set add_generation_prompt = false %}{% endif %}{% for message in messages %}{{'<|im_start|>' + message['role'] + '\n' + message['content'] + '<|im_end|>' + '\n'}}{% endfor %}{% if add_generation_prompt %}{{ '<|im_start|>assistant\n' }}{% endif %}"

In [10]:
len(tokenizer)

32000

In [None]:
dataset = load_dataset(dataset_path)
dataset

In [5]:
dataset["train"][0]

{'idx': None,
 'topic': None,
 'model_name': None,
 'hash': None,
 'language': None,
 'custom_instruction': None,
 'id': None,
 'conversations': [{'from': 'human',
   'value': 'Every day, a tree drops 7 leaves. How many leaves would it drop in a month of February in a non-leap year? Include your logic.',
   'weight': None},
  {'from': 'gpt',
   'value': "Here's the logic behind this:\n\n1. We know that February has 28 days in a non-leap year.\n2. If the tree drops 7 leaves every day, then over the course of February, it would drop:\n   Leaves dropped in February = Leaves per day * Days in February\n   = 7 leaves * 28 days\n   = 196 leaves\n\nSo, the tree would drop 196 leaves in February in a non-leap year.",
   'weight': None}],
 'source': 'airoboros2.2',
 'system_prompt': None,
 'category': 'orca',
 'skip_prompt_formatting': False,
 'avatarUrl': None,
 'title': None,
 'views': None,
 'model': None}

In [4]:
dataset = load_dataset("json", data_files=dataset_path)
# dataset = dataset.shuffle()
dataset

DatasetDict({
    train: Dataset({
        features: ['idx', 'topic', 'model_name', 'hash', 'language', 'custom_instruction', 'id', 'conversations', 'source', 'system_prompt', 'category', 'skip_prompt_formatting', 'avatarUrl', 'title', 'views', 'model'],
        num_rows: 1001551
    })
})

In [None]:
dataset = dataset["train"].select(range(10))
dataset

In [6]:
# Define a function to rename the keys
def rename_keys(example):
    new_conversations = []
    for conv in example['conversations']:
        if conv["from"] == "gpt":
            conv["from"] = "assistant"
            
        new_conversations.append({
            "role": conv["from"],
            "content": conv["value"]
        })
    return {
        "conversations": new_conversations
    }


# Apply the function to the dataset
renamed_dataset = dataset.map(rename_keys)
renamed_dataset["train"]["conversations"][0]

[{'content': 'Every day, a tree drops 7 leaves. How many leaves would it drop in a month of February in a non-leap year? Include your logic.',
  'role': 'human'},
 {'content': "Here's the logic behind this:\n\n1. We know that February has 28 days in a non-leap year.\n2. If the tree drops 7 leaves every day, then over the course of February, it would drop:\n   Leaves dropped in February = Leaves per day * Days in February\n   = 7 leaves * 28 days\n   = 196 leaves\n\nSo, the tree would drop 196 leaves in February in a non-leap year.",
  'role': 'assistant'}]

In [7]:
dataset = renamed_dataset.map(lambda x: {"text": tokenizer.apply_chat_template(
    x["conversations"], tokenize=False, add_generation_prompt=False)}, remove_columns=["conversations"])
dataset

Map:   0%|          | 0/1001551 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['idx', 'topic', 'model_name', 'hash', 'language', 'custom_instruction', 'id', 'source', 'system_prompt', 'category', 'skip_prompt_formatting', 'avatarUrl', 'title', 'views', 'model', 'text'],
        num_rows: 1001551
    })
})

In [8]:
dataset["train"][0]

{'idx': None,
 'topic': None,
 'model_name': None,
 'hash': None,
 'language': None,
 'custom_instruction': None,
 'id': None,
 'source': 'airoboros2.2',
 'system_prompt': None,
 'category': 'orca',
 'skip_prompt_formatting': False,
 'avatarUrl': None,
 'title': None,
 'views': None,
 'model': None,
 'text': "<|im_start|>human\nEvery day, a tree drops 7 leaves. How many leaves would it drop in a month of February in a non-leap year? Include your logic.<|im_end|>\n<|im_start|>assistant\nHere's the logic behind this:\n\n1. We know that February has 28 days in a non-leap year.\n2. If the tree drops 7 leaves every day, then over the course of February, it would drop:\n   Leaves dropped in February = Leaves per day * Days in February\n   = 7 leaves * 28 days\n   = 196 leaves\n\nSo, the tree would drop 196 leaves in February in a non-leap year.<|im_end|>\n"}

In [9]:
dataset.save_to_disk(new_dataset_path)

Saving the dataset (0/4 shards):   0%|          | 0/1001551 [00:00<?, ? examples/s]