In [2]:
from transformers import AutoTokenizer

# Load the Falcon-180B-chat tokenizer only to read its end-of-sequence token;
# the formatting functions in this notebook append it after every segment.
# `token=True` reuses the locally stored Hugging Face credentials and replaces
# the deprecated `use_auth_token` argument.
tokenizer = AutoTokenizer.from_pretrained("tiiuae/falcon-180B-chat", token=True)
eos_token = tokenizer.eos_token
print(eos_token)



<|endoftext|>


In [11]:
from datasets import load_dataset

"""
We will combine the following datasets for instruction following and conversation:
1. timdettmers/openassistant-guanaco: 100%
2. GAIR/lima: 100%
3. garage-bAInd/Open-Platypus: 100%
4. Open-Orca/OpenOrca: 10k of GPT4 split
5. ehartford/dolphin: 10k of GPT4 split
6. stingning/ultrachat: 10k
7. jondurbin/airoboros-2.2: 10k while **filtering out** samples with `skip_prompt_formatting==True`
"""


"""
Format of the dataset:

<|system|> system message <|endoftext|> <|prompter|> Q1 <|endoftext|> <|assistant|> A1 <|endoftext|>
"""


# Default system message placed at the start of a formatted conversation
# whenever the source dataset does not supply its own.
system_prompt = (
    "You are an AI assistant that follows instruction extremely well. "
    "Help as much as you can."
)

print(system_prompt)

You are an AI assistant that follows instruction extremely well. Help as much as you can.


In [12]:
# LIMA dataset processing

def preprocess(samples):
    """Render a batch of LIMA conversations as single formatted strings.

    Args:
        samples: Batched mapping with a "conversations" column; each entry is
            a sequence of turns.

    Returns:
        ``{"content": [...]}`` with one string per conversation: the default
        system message followed by every turn, each segment terminated by the
        EOS token. Roles are assigned purely by position (even index ->
        prompter, odd index -> assistant).
    """
    role_tags = ("<|prompter|>", "<|assistant|>")
    header = f"<|system|> {system_prompt} {eos_token}"
    contents = []
    for conversation in samples["conversations"]:
        pieces = [header]
        for idx, turn in enumerate(conversation):
            pieces.append(f" {role_tags[idx % 2]} {turn} {eos_token}")
        contents.append("".join(pieces))
    return {"content": contents}
            
    


# Load LIMA and replace its raw columns with the single formatted "content" column.
lima = load_dataset("GAIR/lima")
lima = lima.map(preprocess, batched=True, remove_columns=lima["train"].column_names)

# Shuffle the training split with a fixed seed (the first positional
# argument of `shuffle` is the seed, spelled out here as a keyword).
lima["train"] = lima["train"].shuffle(seed=100)

print(lima)
print(lima["train"][0])


Map:   0%|          | 0/1030 [00:00<?, ? examples/s]

Map:   0%|          | 0/300 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 1030
    })
    test: Dataset({
        features: ['content'],
        num_rows: 300
    })
})
{'content': '<|system|> You are an AI assistant that follows instruction extremely well. Help as much as you can. <|endoftext|> <|prompter|> Are we alone in the universe? <|endoftext|> <|assistant|> Humanity has yet to find evidence for life outside planet Earth.\n\nThe current search for extraterrestrial life is largely focused on finding planets that are situated in an "habitable zone". Roughly put, a planet is in a habitable zone if its distance from nearby stars allows for liquid water to persist on its surface.\nSo far, a few such planets have been found, but none yet with liquid water on its surface. <|endoftext|>'}


In [13]:
# OpenAsst Guanaco dataset processing

# Speaker markers that delimit turns in the raw Guanaco "text" field;
# the preprocessing below splits each sample on them.
tokens = ["### Human:", "### Assistant:"]

import re

def split_on_multiple_tokens(input_string, tokens):
    """Split ``input_string`` on every occurrence of any delimiter in ``tokens``.

    Args:
        input_string: Text to split.
        tokens: Iterable of literal delimiter strings. They are regex-escaped,
            so special characters are matched verbatim.

    Returns:
        The non-empty, whitespace-stripped segments in their original order.
        With no usable delimiter the whole stripped string is returned as a
        single segment (or an empty list if it is blank).
    """
    # Empty delimiters would make the pattern match the empty string and split
    # the text into single characters, so drop them. Longer delimiters go first
    # so that a delimiter which is a prefix of another cannot shadow it in the
    # regex alternation.
    delimiters = sorted((token for token in tokens if token), key=len, reverse=True)

    if not delimiters:
        stripped = input_string.strip()
        return [stripped] if stripped else []

    # Combine the delimiters into one alternation pattern.
    pattern = "|".join(re.escape(delimiter) for delimiter in delimiters)

    # Strip each piece once and discard the blanks left by adjacent delimiters.
    stripped_parts = (part.strip() for part in re.split(pattern, input_string))
    return [part for part in stripped_parts if part]

def preprocess(samples):
    """Render a batch of Guanaco samples as single formatted strings.

    Args:
        samples: Batched mapping with a "text" column holding whole
            conversations in which turns are separated by the markers in
            ``tokens``.

    Returns:
        ``{"content": [...]}`` with one string per sample: the default system
        message followed by each extracted turn, every segment terminated by
        the EOS token. Roles are assigned by position after splitting (even
        index -> prompter, odd index -> assistant).
    """
    role_tags = ("<|prompter|>", "<|assistant|>")
    header = f"<|system|> {system_prompt} {eos_token}"
    contents = []
    for text in samples["text"]:
        turns = split_on_multiple_tokens(text, tokens)
        body = "".join(
            f" {role_tags[n % 2]} {turn} {eos_token}" for n, turn in enumerate(turns)
        )
        contents.append(header + body)
    return {"content": contents}
            
    


# Load Guanaco and replace its raw columns with the single formatted "content" column.
guanaco = load_dataset("timdettmers/openassistant-guanaco")
guanaco = guanaco.map(
    preprocess,
    batched=True,
    remove_columns=guanaco["train"].column_names
)

# Seeded so the row order is reproducible across runs (same seed as the LIMA shuffle).
guanaco["train"] = guanaco["train"].shuffle(seed=100)

print(guanaco)
print(guanaco["train"][0])


Downloading readme:   0%|          | 0.00/395 [00:00<?, ?B/s]

Repo card metadata block was not found. Setting CardData to empty.


Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/20.9M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.11M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/9846 [00:00<?, ? examples/s]

Map:   0%|          | 0/518 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 9846
    })
    test: Dataset({
        features: ['content'],
        num_rows: 518
    })
})
{'content': '<|system|> You are an AI assistant that follows instruction extremely well. Help as much as you can. <|endoftext|> <|prompter|> En un futuro no muy lejano, la humanidad ha descubierto la forma de crear robots con personalidades únicas y sentimientos. Un día, en un pequeño taller en la ciudad, un grupo de robots decide hacer una fiesta para celebrar su amistad. Mientras preparan la fiesta, discuten sobre qué tipo de comida deben preparar y qué juegos deben jugar. ¿Qué decisión tomarán y cómo se desarrollará la fiesta? ¡Sé creativo y haz que la fiesta sea divertida y llena de risas! <|endoftext|> <|assistant|> Mientras se desarrolla la fiesta, los robots se dan cuenta que el lugar no es muy grande y por ello deciden jugar juegos de mesa, uno de ellos, quien es el más precavido, trae consigo un juego

In [25]:
# Open Platypus
def preprocess(samples):
    """Render a batch of Open-Platypus rows as single formatted strings.

    Args:
        samples: Batched mapping with "instruction", "input" and "output"
            columns.

    Returns:
        ``{"content": [...]}`` with one string per row: the default system
        message, a prompter turn (the instruction, followed by the input when
        the input is non-empty) and an assistant turn holding the output, each
        segment terminated by the EOS token.
    """
    header = f"<|system|> {system_prompt} {eos_token}"
    rows = zip(samples["instruction"], samples["input"], samples["output"])
    contents = []
    for instruction, input_text, output_text in rows:
        if len(input_text) > 0:
            prompt = f" <|prompter|> {instruction} {input_text} {eos_token}"
        else:
            prompt = f" <|prompter|> {instruction} {eos_token}"
        answer = f" <|assistant|> {output_text} {eos_token}"
        contents.append(header + prompt + answer)
    return {"content": contents}
            
    


# Load Open-Platypus and replace its raw columns with the single formatted "content" column.
platypus = load_dataset("garage-bAInd/Open-Platypus")
platypus = platypus.map(
    preprocess,
    batched=True,
    remove_columns=platypus["train"].column_names
)

# Seeded so the shuffle and the held-out split are reproducible across runs
# (same seed as the LIMA shuffle).
platypus["train"] = platypus["train"].shuffle(seed=100)
# Hold out 2,500 rows of the train split as the test split.
platypus = platypus["train"].train_test_split(test_size=2500, seed=100)
print(platypus)
print(platypus["train"][0])

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 22426
    })
    test: Dataset({
        features: ['content'],
        num_rows: 2500
    })
})
{'content': '<|system|> You are an AI assistant that follows instruction extremely well. Help as much as you can. <|endoftext|> <|prompter|> Which is the smoothest?\nA: asphalt road\nB: rock wall\nC: ceramic mug <|endoftext|> <|assistant|> Smooth is a property. A smooth material is not rough or bumpy.\nLook at each picture, one at a time. Imagine touching the material shown in each picture.\nOf the choices, the ceramic mug is the smoothest. If you touch a piece of ceramic like this one, it will not feel rough. <|endoftext|>'}


In [27]:
# Dolphin

# Seeded throughout so the sampled subset is reproducible across runs
# (same seed as the LIMA shuffle).
dolphin = load_dataset("ehartford/dolphin",data_files="flan1m-alpaca-uncensored.jsonl")
dolphin = dolphin["train"].shuffle(seed=100)
# First split: the 10,000-row "test" part becomes the training subset below.
dolphin_subset = dolphin.train_test_split(test_size=10000, seed=100)

# Second split, taken from the remaining rows only, so the 5,000 evaluation
# rows cannot overlap with the 10,000 training rows.
test = dolphin_subset["train"].train_test_split(test_size=5000, seed=100)
dolphin_subset["train"] = dolphin_subset["test"]
dolphin_subset["test"] = test["test"]


def preprocess(samples):
    """Render a batch of Dolphin rows as single formatted strings.

    Args:
        samples: Batched mapping with "instruction", "input" and "output"
            columns.

    Returns:
        ``{"content": [...]}`` with one string per row, using the row's
        "instruction" as the system message, "input" as the prompter turn and
        "output" as the assistant turn, each terminated by the EOS token.
    """
    rows = zip(samples["instruction"], samples["input"], samples["output"])
    contents = [
        f"<|system|> {system_msg} {eos_token}"
        f" <|prompter|> {question} {eos_token}"
        f" <|assistant|> {answer} {eos_token}"
        for system_msg, question, answer in rows
    ]
    return {"content": contents}

# Replace the raw Dolphin columns with the single formatted "content" column.
dolphin_subset = dolphin_subset.map(
    preprocess, batched=True, remove_columns=dolphin_subset["train"].column_names
)

# Show the split sizes and one formatted example.
print(dolphin_subset)
print(dolphin_subset["train"][0])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['content'],
        num_rows: 5000
    })
})
{'content': '<|system|> You are an AI assistant that follows instruction extremely well. Help as much as you can. <|endoftext|> <|prompter|> Please answer the following question: Answer the question based on the following text.  Question:   Jan poured a liquid solution into a bottle and the solution was not dense, so in the bottle it took up more room or less room?    Text:  More dense water takes up less space than less dense water.\nA: <|endoftext|> <|assistant|> In the bottle, the solution took up more room as it was not dense. <|endoftext|>'}


In [30]:
# Open Orca
# Seeded throughout so the sampled subset is reproducible across runs
# (same seed as the LIMA shuffle).
orca = load_dataset("Open-Orca/OpenOrca",data_files="1M-GPT4-Augmented.parquet")
orca = orca["train"].shuffle(seed=100)
# First split: the 10,000-row "test" part becomes the training subset below.
orca_subset = orca.train_test_split(test_size=10000, seed=100)

# Second split, taken from the remaining rows only, so the 5,000 evaluation
# rows cannot overlap with the 10,000 training rows.
test = orca_subset["train"].train_test_split(test_size=5000, seed=100)
orca_subset["train"] = orca_subset["test"]
orca_subset["test"] = test["test"]


def preprocess(samples):
    """Render a batch of OpenOrca rows as single formatted strings.

    Args:
        samples: Batched mapping with "system_prompt", "question" and
            "response" columns.

    Returns:
        ``{"content": [...]}`` with one string per row, using the row's own
        "system_prompt" as the system message, "question" as the prompter turn
        and "response" as the assistant turn, each terminated by the EOS token.
    """
    contents = []
    rows = zip(samples["system_prompt"], samples["question"], samples["response"])
    for system_msg, question, answer in rows:
        segments = (
            f"<|system|> {system_msg} {eos_token}",
            f" <|prompter|> {question} {eos_token}",
            f" <|assistant|> {answer} {eos_token}",
        )
        contents.append("".join(segments))
    return {"content": contents}

# Replace the raw OpenOrca columns with the single formatted "content" column.
orca_subset = orca_subset.map(
    preprocess, batched=True, remove_columns=orca_subset["train"].column_names
)

# Show the split sizes and one formatted example.
print(orca_subset)
print(orca_subset["train"][0])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['content'],
        num_rows: 5000
    })
})
{'content': '<|system|> You are an AI assistant. User will you give you a task. Your goal is to complete the task as faithfully as you can. While performing the task think step-by-step and justify your steps. <|endoftext|> <|prompter|> What\'s the most logical way to complete this passage?\n\nBy Beth Stebner PUBLISHED: 14:24 EST, 18 March 2012 | UPDATED: 05:42 EST, 20 March 2012 For the second time during the primaries, Republican presidential hopeful Rick Santorum has made a blunder about Puerto Rico. On the day Puerto Ricans head to the polls to cast their votes for the nation\'s next president, Mr Santorum referred to the archipelago as a ‘Spanish-speaking country.’ In fact, Puerto Rico is not a country at all, but an unincorporated territory of the United States. Last week, the former Pennsylvania senato

In [33]:
# Ultra Chat

def preprocess(samples):
    """Render a batch of UltraChat dialogues as single formatted strings.

    Args:
        samples: Batched mapping with a "data" column; each entry is a
            sequence of utterances.

    Returns:
        ``{"content": [...]}`` with one string per dialogue: the default
        system message followed by every utterance, each segment terminated by
        the EOS token. Roles are assigned by position (1st, 3rd, ... ->
        prompter; 2nd, 4th, ... -> assistant).
    """
    header = f"<|system|> {system_prompt} {eos_token}"
    contents = []
    for dialogue in samples["data"]:
        rendered = header
        for position, utterance in enumerate(dialogue, start=1):
            speaker = "<|prompter|>" if position % 2 else "<|assistant|>"
            rendered = f"{rendered} {speaker} {utterance} {eos_token}"
        contents.append(rendered)
    return {"content": contents}

# Seeded throughout so the sampled subset is reproducible across runs
# (same seed as the LIMA shuffle).
ultrachat = load_dataset("stingning/ultrachat")
ultrachat = ultrachat["train"].shuffle(seed=100)
# First split: the 10,000-row "test" part becomes the training subset below.
ultrachat_subset = ultrachat.train_test_split(test_size=10000, seed=100)

# Second split, taken from the remaining rows only, so the 5,000 evaluation
# rows cannot overlap with the 10,000 training rows.
test = ultrachat_subset["train"].train_test_split(test_size=5000, seed=100)
ultrachat_subset["train"] = ultrachat_subset["test"]
ultrachat_subset["test"] = test["test"]

# Replace the raw columns with the single formatted "content" column.
ultrachat_subset = ultrachat_subset.map(
    preprocess,
    batched=True,
    remove_columns=ultrachat_subset["train"].column_names
)


print(ultrachat_subset)
print(ultrachat_subset["train"][0])

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['content'],
        num_rows: 5000
    })
})
{'content': "<|system|> You are an AI assistant that follows instruction extremely well. Help as much as you can. <|endoftext|> <|prompter|> Are there any markets in Thessaloniki that offer discounts or special deals on certain products? <|endoftext|> <|assistant|> As an AI language model, I don't have access to current information. However, several markets in Thessaloniki may offer discounts or special deals on certain products during seasonal sales or promotional events. Some popular markets in Thessaloniki include Modiano Market, Kapani Market, and the open-air market on Aristotelous Square. Visitors can check with local shops and vendors, look for signs advertising sales, or ask about discounts when making purchases. <|endoftext|> <|prompter|> Can you provide me with some tips on how to bargain for bette

In [46]:
# Airoboros 2.2

# Seeded throughout so the sampled subset is reproducible across runs
# (same seed as the LIMA shuffle).
airboros = load_dataset("jondurbin/airoboros-2.2")
airboros = airboros["train"].shuffle(seed=100)
# Keep only rows whose `skip_prompt_formatting` flag equals False.
airboros = airboros.filter(lambda example: example['skip_prompt_formatting']==False)
# First split: the 10,000-row "test" part becomes the training subset below.
airboros_subset = airboros.train_test_split(test_size=10000, seed=100)

# Second split, taken from the remaining rows only, so the 5,000 evaluation
# rows cannot overlap with the 10,000 training rows.
test = airboros_subset["train"].train_test_split(test_size=5000, seed=100)
airboros_subset["train"] = airboros_subset["test"]
airboros_subset["test"] = test["test"]

def preprocess(samples):
    """Render a batch of Airoboros rows as single formatted strings.

    Args:
        samples: Batched mapping with "system", "instruction" and "response"
            columns.

    Returns:
        ``{"content": [...]}`` with one string per row: a system message, the
        "instruction" as the prompter turn and the "response" as the assistant
        turn, each terminated by the EOS token. A row whose "system" text
        contains "A chat." gets the notebook's default system message instead
        of its own.
    """
    def render(system_msg, question, answer):
        # Swap the dataset's generic chat system text for the default one.
        if "A chat." in system_msg:
            system_msg = system_prompt
        return (
            f"<|system|> {system_msg} {eos_token}"
            f" <|prompter|> {question} {eos_token}"
            f" <|assistant|> {answer} {eos_token}"
        )

    rows = zip(samples["system"], samples["instruction"], samples["response"])
    return {"content": [render(*row) for row in rows]}

# Replace the raw Airoboros columns with the single formatted "content" column.
airboros_subset = airboros_subset.map(
    preprocess, batched=True, remove_columns=airboros_subset["train"].column_names
)

# Show the split sizes and one formatted example.
print(airboros_subset)
print(airboros_subset["train"][0])




Filter:   0%|          | 0/80677 [00:00<?, ? examples/s]

Map:   0%|          | 0/10000 [00:00<?, ? examples/s]

Map:   0%|          | 0/5000 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 10000
    })
    test: Dataset({
        features: ['content'],
        num_rows: 5000
    })
})
{'content': '<|system|> You are an AI assistant that follows instruction extremely well. Help as much as you can. <|endoftext|> <|prompter|> In virology, which of the following statements is true about the Baltimore classification system?\nA. It classifies viruses into seven groups based on their type of genome and method of replication.\nB. It classifies viruses based on their shape and size.\nC. It classifies viruses based on their host organism.\nD. It classifies viruses based on their pathogenicity.\nE. None of the above. <|endoftext|> <|assistant|> A. It classifies viruses into seven groups based on their type of genome and method of replication. <|endoftext|>'}


In [47]:
from datasets import concatenate_datasets, DatasetDict
# Concatenate the matching split of every prepared source into one DatasetDict.
# The order of the sources is the order in which their rows are appended.
full_dataset = DatasetDict(
    {
        split: concatenate_datasets(
            [
                source[split]
                for source in (
                    lima,
                    guanaco,
                    platypus,
                    dolphin_subset,
                    orca_subset,
                    ultrachat_subset,
                    airboros_subset,
                )
            ]
        )
        for split in ["train", "test"]
    }
)
full_dataset

DatasetDict({
    train: Dataset({
        features: ['content'],
        num_rows: 73302
    })
    test: Dataset({
        features: ['content'],
        num_rows: 23318
    })
})

In [49]:
# Interleave the sources so rows from one dataset are not contiguous.
# Seeded so the published row order is reproducible (same seed as the LIMA shuffle).
full_dataset["train"] = full_dataset["train"].shuffle(seed=100)
full_dataset["test"] = full_dataset["test"].shuffle(seed=100)

In [50]:
# Upload both splits to the Hub as a public (non-private) dataset repository.
full_dataset.push_to_hub(repo_id="chat-instruct-mixer", private=False)

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/74 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Pushing dataset shards to the dataset hub:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/24 [00:00<?, ?ba/s]

Deleting unused files from dataset repository:   0%|          | 0/1 [00:00<?, ?it/s]

Downloading metadata:   0%|          | 0.00/587 [00:00<?, ?B/s]