# Merge instruction-tuning (IT) datasets

## Czech

In [None]:
# On-disk locations of the Czech instruction-tuning sources, in the order in
# which they are loaded. The position of each entry is what ID2DATASET keys on.
IT_DATASETS_PATHS = [
    "datasets/instruction_tuning/" + relative_path
    for relative_path in (
        "bactrian-x/bactrian-x_cs_itformat",
        "muri-it/murit-it_cs_itformat",
        "oasst/oasst2_cs_itformat",
        # "share_gpt/ShareGPT_cs_itformat",  # currently excluded
        "ujc_cas/questions_ujc_cas_cs_itformat",
        "ask_library/ask_library_cs_itformat",
    )
]

In [None]:
# Position in IT_DATASETS_PATHS -> source name. The names are listed in the
# same order as the paths, so enumerate() yields the matching integer keys.
ID2DATASET = dict(
    enumerate(
        [
            "bactrian-x",
            "muri-it",
            "oasst2",
            # "share_gpt",  # disabled together with its path entry
            "questions_ujc_cas",
            "ask_library",
        ]
    )
)

In [None]:
from datasets import load_from_disk

# Load every source from disk, preserving the order of IT_DATASETS_PATHS so
# that list positions keep matching the ID2DATASET keys.
loaded_datasets = [load_from_disk(path) for path in IT_DATASETS_PATHS]

loaded_datasets

[Dataset({
     features: ['instruction', 'input', 'id', 'output', 'conversations'],
     num_rows: 67017
 }),
 DatasetDict({
     train: Dataset({
         features: ['input', 'output', 'dataset_name', 'subdataset_name', 'language', 'split', 'language_name', 'conversations'],
         num_rows: 15875
     })
     validation: Dataset({
         features: ['input', 'output', 'dataset_name', 'subdataset_name', 'language', 'split', 'language_name', 'conversations'],
         num_rows: 882
     })
     test: Dataset({
         features: ['input', 'output', 'dataset_name', 'subdataset_name', 'language', 'split', 'language_name', 'conversations'],
         num_rows: 881
     })
 }),
 Dataset({
     features: ['id', 'conversations'],
     num_rows: 3
 }),
 Dataset({
     features: ['question', 'specific_question', 'keyword', 'answer', 'last_usage', 'variants', 'id', 'conversations'],
     num_rows: 12363
 }),
 Dataset({
     features: ['title', 'question', 'answer', 'okres', 'library', 'categ

In [None]:
from datasets import Value

# Normalise every loaded source: keep the "conversations" column, add an
# "origin" label and a string "original_id", and drop the source-specific columns.
final_datasets = []
for i, dataset in enumerate(loaded_datasets):
    # The list position identifies the source (see ID2DATASET).
    dataset_name = ID2DATASET[i]

    if dataset_name == "bactrian-x":
        tmp_dataset = dataset
        tmp_dataset = tmp_dataset.remove_columns(["instruction", "input", "output"])
        tmp_dataset = tmp_dataset.rename_column("id", "original_id")
        # The part of the id before the first "-" becomes the origin suffix
        # (e.g. "alpaca-9861" -> "bactrian-x-alpaca").
        # Plain concatenation is used instead of an f-string that nests double
        # quotes, which is a SyntaxError on Python versions before 3.12.
        tmp_dataset = tmp_dataset.map(lambda x: {"origin": "bactrian-x-" + x["original_id"].split("-")[0]})
    elif dataset_name == "muri-it":
        tmp_dataset = dataset

        def muri_origin(x):
            """Set x["origin"] from dataset_name and, when present, subdataset_name."""
            if x["subdataset_name"] is None:
                x["origin"] = f"muri-it-{x['dataset_name']}"
            else:
                x["origin"] = f"muri-it-{x['dataset_name']}-{x['subdataset_name']}"
            return x

        # origin must be computed before the columns it reads are removed.
        tmp_dataset = tmp_dataset.map(muri_origin)
        tmp_dataset = tmp_dataset.remove_columns(["input", "output", "language", "split", "language_name", "dataset_name", "subdataset_name"])
        # This source has no "id" column, so every row gets the same placeholder id.
        tmp_dataset = tmp_dataset.map(lambda _: {"original_id": "murit-it"})

    elif dataset_name == "oasst2":
        tmp_dataset = dataset
        tmp_dataset = tmp_dataset.rename_column("id", "original_id")
        tmp_dataset = tmp_dataset.map(lambda _: {"origin": "oasst2"})

    elif dataset_name == "share_gpt":
        tmp_dataset = dataset
        tmp_dataset = tmp_dataset.rename_column("id", "original_id")
        tmp_dataset = tmp_dataset.map(lambda _: {"origin": "share_gpt"})

    elif dataset_name == "questions_ujc_cas":
        tmp_dataset = dataset
        tmp_dataset = tmp_dataset.rename_column("id", "original_id")
        # Cast the id to string so the column type matches the other sources.
        tmp_dataset = tmp_dataset.cast_column("original_id", Value("string"))
        tmp_dataset = tmp_dataset.map(lambda _: {"origin": "questions_ujc_cas"})
        tmp_dataset = tmp_dataset.remove_columns(["question", "specific_question", "keyword", "answer", "last_usage", "variants"])

    elif dataset_name == "ask_library":
        tmp_dataset = dataset
        tmp_dataset = tmp_dataset.rename_column("id", "original_id")
        tmp_dataset = tmp_dataset.map(lambda _: {"origin": "ask_library"})
        # Cast the id to string so the column type matches the other sources.
        tmp_dataset = tmp_dataset.cast_column("original_id", Value("string"))
        tmp_dataset = tmp_dataset.remove_columns(["title", "question", "answer", "okres", "library", "category", "used_sources", "date", "url", "language", 'question_diacritics_korektor', 'answer_diacritics_korektor', 'question_spell_korektor', 'answer_spell_korektor', 'diacritics_correct'])

    else:
        # Without this guard an unrecognised name would silently re-append the
        # previous iteration's tmp_dataset (or raise NameError on the first one).
        raise ValueError(f"No preprocessing defined for dataset {dataset_name!r} (index {i})")

    # Drop rows whose conversation list is empty.
    tmp_dataset = tmp_dataset.filter(lambda x: len(x["conversations"]) > 0)
    final_datasets.append(tmp_dataset)

Map:   0%|          | 0/15875 [00:00<?, ? examples/s]

Map:   0%|          | 0/882 [00:00<?, ? examples/s]

Map:   0%|          | 0/881 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15875 [00:00<?, ? examples/s]

Filter:   0%|          | 0/882 [00:00<?, ? examples/s]

Filter:   0%|          | 0/881 [00:00<?, ? examples/s]

In [None]:
# Display the normalised sources to check their columns and row counts.
final_datasets

[Dataset({
     features: ['original_id', 'conversations', 'origin'],
     num_rows: 67017
 }),
 DatasetDict({
     train: Dataset({
         features: ['conversations', 'origin', 'original_id'],
         num_rows: 15875
     })
     validation: Dataset({
         features: ['conversations', 'origin', 'original_id'],
         num_rows: 882
     })
     test: Dataset({
         features: ['conversations', 'origin', 'original_id'],
         num_rows: 881
     })
 }),
 Dataset({
     features: ['original_id', 'conversations', 'origin'],
     num_rows: 3
 }),
 Dataset({
     features: ['original_id', 'conversations', 'origin'],
     num_rows: 12336
 }),
 Dataset({
     features: ['original_id', 'conversations', 'origin'],
     num_rows: 13374
 })]

In [None]:
from datasets import DatasetDict, Dataset, concatenate_datasets

# Give every source a train/validation/test layout.
split_datasets = []

for dataset in final_datasets:
    # Anything that is not a plain Dataset (e.g. a DatasetDict) is passed through untouched.
    if not isinstance(dataset, Dataset):
        split_datasets.append(dataset)
        continue

    # The held-out fraction shrinks as the source gets larger.
    test_size = 0.2 if dataset.num_rows < 1000 else (0.05 if dataset.num_rows < 20000 else 0.02)

    # First carve off the test split, then carve validation out of the remaining
    # training rows using the same fraction and the same seed.
    outer_split = dataset.train_test_split(test_size=test_size, shuffle=True, seed=42)
    inner_split = outer_split["train"].train_test_split(test_size=test_size, shuffle=True, seed=42)

    split_datasets.append(
        DatasetDict(
            {
                "train": inner_split["train"],
                "validation": inner_split["test"],
                "test": outer_split["test"],
            }
        )
    )

split_datasets

[DatasetDict({
     train: Dataset({
         features: ['original_id', 'conversations', 'origin'],
         num_rows: 64362
     })
     validation: Dataset({
         features: ['original_id', 'conversations', 'origin'],
         num_rows: 1314
     })
     test: Dataset({
         features: ['original_id', 'conversations', 'origin'],
         num_rows: 1341
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['conversations', 'origin', 'original_id'],
         num_rows: 15875
     })
     validation: Dataset({
         features: ['conversations', 'origin', 'original_id'],
         num_rows: 882
     })
     test: Dataset({
         features: ['conversations', 'origin', 'original_id'],
         num_rows: 881
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['original_id', 'conversations', 'origin'],
         num_rows: 1
     })
     validation: Dataset({
         features: ['original_id', 'conversations', 'origin'],
         num_rows: 1
     })
  

In [None]:
# Collect the same-named split from every source ...
train_datasets = [parts["train"] for parts in split_datasets]
validation_datasets = [parts["validation"] for parts in split_datasets]
test_datasets = [parts["test"] for parts in split_datasets]

# ... and concatenate each group into one dataset per split.
train, validation, test = (
    concatenate_datasets(group)
    for group in (train_datasets, validation_datasets, test_datasets)
)

final_dataset = DatasetDict({"train": train, "validation": validation, "test": test})
final_dataset

DatasetDict({
    train: Dataset({
        features: ['original_id', 'conversations', 'origin'],
        num_rows: 103440
    })
    validation: Dataset({
        features: ['original_id', 'conversations', 'origin'],
        num_rows: 3419
    })
    test: Dataset({
        features: ['original_id', 'conversations', 'origin'],
        num_rows: 3509
    })
})

In [None]:
# Count the training examples per origin label.
final_dataset["train"].to_pandas()["origin"].value_counts()

origin
bactrian-x-alpaca                                                       49930
bactrian-x-dolly                                                        14432
ask_library                                                             12069
questions_ujc_cas                                                       11133
muri-it-MRI-wikipedia                                                    6793
muri-it-MRI-culturax/mC4                                                 4732
muri-it-WikiHow                                                          2064
muri-it-MRI-culturax/OSCAR-2019                                           743
muri-it-MRI-culturax/OSCAR-2109                                           580
muri-it-MRI-culturax/OSCAR-2301                                           434
muri-it-MRI-culturax/OSCAR-2201                                           218
muri-it-SuperNaturalInstructionv2-task842_para_pdt_cs_en_translation      178
muri-it-SuperNaturalInstructionv2-task1374_newscomm_trans

In [None]:
# Provenance metadata per origin: who produced the instruction and the output
# ("human" or a model name) and whether each side was translated.
# Origins that share the same provenance are listed together; every origin
# still gets its own independent dict.
origin2info = {
    origin: {
        "instruction": instruction,
        "instruction_translated": instruction_translated,
        "output": output,
        "output_translated": output_translated,
    }
    for instruction, instruction_translated, output, output_translated, origins in (
        (
            "human", False, "human", False,
            (
                "questions_ujc_cas",
                "ask_library",
                "oasst2",
                "muri-it-WikiHow",
                "muri-it-SuperNaturalInstructionv2-task842_para_pdt_cs_en_translation",
                "muri-it-SuperNaturalInstructionv2-task1374_newscomm_translation",
                "muri-it-SuperNaturalInstructionv2-task1370_newscomm_classification",
            ),
        ),
        ("gpt-3.5", True, "gpt-3.5-turbo", False, ("bactrian-x-alpaca",)),
        ("human", True, "gpt-3.5-turbo", False, ("bactrian-x-dolly",)),
        (
            "Mixtral-8x7B", True, "human", False,
            (
                "muri-it-MRI-wikipedia",
                "muri-it-MRI-culturax/mC4",
                "muri-it-MRI-culturax/OSCAR-2019",
                "muri-it-MRI-culturax/OSCAR-2109",
                "muri-it-MRI-culturax/OSCAR-2301",
                "muri-it-MRI-culturax/OSCAR-2201",
            ),
        ),
    )
    for origin in origins
}


def create_info(x):
    """Add the provenance columns for one example, looked up by its "origin".

    Intended as the function passed to the dataset's ``map``. Sets
    ``instruction_type``, ``instruction_translated``, ``output_type`` and
    ``output_translated`` on ``x`` in place and returns ``x``.

    Raises:
        KeyError: if ``x["origin"]`` has no entry in ``origin2info``.
    """
    info = origin2info[x["origin"]]
    x.update(
        {
            "instruction_type": info["instruction"],
            "instruction_translated": info["instruction_translated"],
            "output_type": info["output"],
            "output_translated": info["output_translated"],
        }
    )
    return x

# Apply create_info to every example, adding the columns instruction_type,
# instruction_translated, output_type and output_translated.
final_dataset = final_dataset.map(create_info)


Map:   0%|          | 0/103440 [00:00<?, ? examples/s]

Map:   0%|          | 0/3419 [00:00<?, ? examples/s]

Map:   0%|          | 0/3509 [00:00<?, ? examples/s]

In [None]:
# Show the first training example to check the added metadata columns.
final_dataset["train"][0]

{'original_id': 'alpaca-9861',
 'conversations': [{'content': 'Označte entity ve větě.\n\nBill Gates je zakladatelem společnosti Microsoft.',
   'role': 'user'},
  {'content': '- Bill Gates\n- společnost Microsoft', 'role': 'assistant'}],
 'origin': 'bactrian-x-alpaca',
 'instruction_type': 'gpt-3.5',
 'instruction_translated': True,
 'output_type': 'gpt-3.5-turbo',
 'output_translated': False}

In [None]:
# Upload the merged Czech collection to the Hugging Face Hub as a private dataset repository.
final_dataset.push_to_hub("ctu-aic/cs_instruction_tuning_collection", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/104 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ctu-aic/cs_instruction_tuning_collection/commit/307050a1cb7384ff98126b57270f1392b9d3274d', commit_message='Upload dataset', commit_description='', oid='307050a1cb7384ff98126b57270f1392b9d3274d', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ctu-aic/cs_instruction_tuning_collection', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ctu-aic/cs_instruction_tuning_collection'), pr_revision=None, pr_num=None)

In [None]:
# Save the merged Czech collection locally; the English section reloads this
# directory to reuse its split assignment.
final_dataset.save_to_disk("data/it_datasets/cs_instruction_tuning_collection")

Saving the dataset (0/1 shards):   0%|          | 0/103440 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3419 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/3509 [00:00<?, ? examples/s]

## English

In [1]:
# On-disk locations of the English instruction-tuning sources, in the order in
# which they are loaded. The position of each entry is what ID2DATASET keys on.
IT_DATASETS_PATHS = [
    "datasets/instruction_tuning/" + relative_path
    for relative_path in (
        "bactrian-x/bactrian-x_en_itformat",
        "muri-it/murit-it_en_itformat",
    )
]

In [2]:
# Position in IT_DATASETS_PATHS -> source name. The names are listed in the
# same order as the paths, so enumerate() yields the matching integer keys.
ID2DATASET = dict(enumerate(["bactrian-x", "muri-it"]))

In [3]:
from datasets import load_from_disk

# Load every source from disk, preserving the order of IT_DATASETS_PATHS so
# that list positions keep matching the ID2DATASET keys.
loaded_datasets = [load_from_disk(path) for path in IT_DATASETS_PATHS]

loaded_datasets

[Dataset({
     features: ['instruction', 'input', 'id', 'output', 'conversations'],
     num_rows: 67017
 }),
 DatasetDict({
     train: Dataset({
         features: ['input', 'output', 'dataset_name', 'subdataset_name', 'language', 'split', 'language_name', 'conversations'],
         num_rows: 113395
     })
     validation: Dataset({
         features: ['input', 'output', 'dataset_name', 'subdataset_name', 'language', 'split', 'language_name', 'conversations'],
         num_rows: 6300
     })
     test: Dataset({
         features: ['input', 'output', 'dataset_name', 'subdataset_name', 'language', 'split', 'language_name', 'conversations'],
         num_rows: 6300
     })
 })]

In [4]:
from datasets import Value

# Normalise every loaded source: keep the "conversations" column, add an
# "origin" label and a string "original_id", and drop the source-specific columns.
# Branches for sources that are not listed in ID2DATASET are simply never taken.
final_datasets = []
for i, dataset in enumerate(loaded_datasets):
    # The list position identifies the source (see ID2DATASET).
    dataset_name = ID2DATASET[i]

    if dataset_name == "bactrian-x":
        tmp_dataset = dataset
        tmp_dataset = tmp_dataset.remove_columns(["instruction", "input", "output"])
        tmp_dataset = tmp_dataset.rename_column("id", "original_id")
        # The part of the id before the first "-" becomes the origin suffix
        # (e.g. an id starting with "alpaca-" yields "bactrian-x-alpaca").
        # Plain concatenation is used instead of an f-string that nests double
        # quotes, which is a SyntaxError on Python versions before 3.12.
        tmp_dataset = tmp_dataset.map(lambda x: {"origin": "bactrian-x-" + x["original_id"].split("-")[0]})
    elif dataset_name == "muri-it":
        tmp_dataset = dataset

        def muri_origin(x):
            """Set x["origin"] from dataset_name and, when present, subdataset_name."""
            if x["subdataset_name"] is None:
                x["origin"] = f"muri-it-{x['dataset_name']}"
            else:
                x["origin"] = f"muri-it-{x['dataset_name']}-{x['subdataset_name']}"
            return x

        # origin must be computed before the columns it reads are removed.
        tmp_dataset = tmp_dataset.map(muri_origin)
        tmp_dataset = tmp_dataset.remove_columns(["input", "output", "language", "split", "language_name", "dataset_name", "subdataset_name"])
        # This source has no "id" column, so every row gets the same placeholder id.
        tmp_dataset = tmp_dataset.map(lambda _: {"original_id": "murit-it"})

    elif dataset_name == "oasst2":
        tmp_dataset = dataset
        tmp_dataset = tmp_dataset.rename_column("id", "original_id")
        tmp_dataset = tmp_dataset.map(lambda _: {"origin": "oasst2"})

    elif dataset_name == "share_gpt":
        tmp_dataset = dataset
        tmp_dataset = tmp_dataset.rename_column("id", "original_id")
        tmp_dataset = tmp_dataset.map(lambda _: {"origin": "share_gpt"})

    elif dataset_name == "questions_ujc_cas":
        tmp_dataset = dataset
        tmp_dataset = tmp_dataset.rename_column("id", "original_id")
        # Cast the id to string so the column type matches the other sources.
        tmp_dataset = tmp_dataset.cast_column("original_id", Value("string"))
        tmp_dataset = tmp_dataset.map(lambda _: {"origin": "questions_ujc_cas"})
        tmp_dataset = tmp_dataset.remove_columns(["question", "specific_question", "keyword", "answer", "last_usage", "variants"])

    elif dataset_name == "ask_library":
        tmp_dataset = dataset
        tmp_dataset = tmp_dataset.rename_column("id", "original_id")
        tmp_dataset = tmp_dataset.map(lambda _: {"origin": "ask_library"})
        # Cast the id to string so the column type matches the other sources.
        tmp_dataset = tmp_dataset.cast_column("original_id", Value("string"))
        tmp_dataset = tmp_dataset.remove_columns(["title", "question", "answer", "okres", "library", "category", "used_sources", "date", "url", "language", 'question_diacritics_korektor', 'answer_diacritics_korektor', 'question_spell_korektor', 'answer_spell_korektor', 'diacritics_correct'])

    else:
        # Without this guard an unrecognised name would silently re-append the
        # previous iteration's tmp_dataset (or raise NameError on the first one).
        raise ValueError(f"No preprocessing defined for dataset {dataset_name!r} (index {i})")

    # Drop rows whose conversation list is empty.
    tmp_dataset = tmp_dataset.filter(lambda x: len(x["conversations"]) > 0)
    final_datasets.append(tmp_dataset)

In [5]:
# Display the normalised sources to check their columns and row counts.
final_datasets

[Dataset({
     features: ['original_id', 'conversations', 'origin'],
     num_rows: 67017
 }),
 DatasetDict({
     train: Dataset({
         features: ['conversations', 'origin', 'original_id'],
         num_rows: 113395
     })
     validation: Dataset({
         features: ['conversations', 'origin', 'original_id'],
         num_rows: 6300
     })
     test: Dataset({
         features: ['conversations', 'origin', 'original_id'],
         num_rows: 6300
     })
 })]

In [6]:
# Split the 0th dataset into train, validation and test by reusing the split
# assignment of the Czech collection: a row goes to the split whose Czech
# counterpart contains the same original_id.
from datasets import DatasetDict

czech_dataset = load_from_disk("data/it_datasets/cs_instruction_tuning_collection")

# Sets give constant-time membership tests. Checking every row against a plain
# list of ids would scan the whole list for each of the rows being filtered.
train_ids = set(czech_dataset["train"]["original_id"])
validation_ids = set(czech_dataset["validation"]["original_id"])
test_ids = set(czech_dataset["test"]["original_id"])

to_split_dataset = final_datasets[0]

train_dataset = to_split_dataset.filter(lambda x: x["original_id"] in train_ids)
validation_dataset = to_split_dataset.filter(lambda x: x["original_id"] in validation_ids)
test_dataset = to_split_dataset.filter(lambda x: x["original_id"] in test_ids)

splitted_dataset = DatasetDict({"train": train_dataset, "validation": validation_dataset, "test": test_dataset})

# The second source is used with the splits it already has.
split_datasets = [splitted_dataset, final_datasets[1]]
split_datasets


[DatasetDict({
     train: Dataset({
         features: ['original_id', 'conversations', 'origin'],
         num_rows: 64362
     })
     validation: Dataset({
         features: ['original_id', 'conversations', 'origin'],
         num_rows: 1314
     })
     test: Dataset({
         features: ['original_id', 'conversations', 'origin'],
         num_rows: 1341
     })
 }),
 DatasetDict({
     train: Dataset({
         features: ['conversations', 'origin', 'original_id'],
         num_rows: 113395
     })
     validation: Dataset({
         features: ['conversations', 'origin', 'original_id'],
         num_rows: 6300
     })
     test: Dataset({
         features: ['conversations', 'origin', 'original_id'],
         num_rows: 6300
     })
 })]

In [7]:
from datasets import concatenate_datasets

# Collect the same-named split from every source ...
train_datasets = [parts["train"] for parts in split_datasets]
validation_datasets = [parts["validation"] for parts in split_datasets]
test_datasets = [parts["test"] for parts in split_datasets]

# ... and concatenate each group into one dataset per split.
train, validation, test = (
    concatenate_datasets(group)
    for group in (train_datasets, validation_datasets, test_datasets)
)

final_dataset = DatasetDict({"train": train, "validation": validation, "test": test})
final_dataset

DatasetDict({
    train: Dataset({
        features: ['original_id', 'conversations', 'origin'],
        num_rows: 177757
    })
    validation: Dataset({
        features: ['original_id', 'conversations', 'origin'],
        num_rows: 7614
    })
    test: Dataset({
        features: ['original_id', 'conversations', 'origin'],
        num_rows: 7641
    })
})

In [8]:
# Count the training examples per origin label.
final_dataset["train"].to_pandas()["origin"].value_counts()

origin
muri-it-flan_v2_tulu               90000
bactrian-x-alpaca                  49930
bactrian-x-dolly                   14432
muri-it-MRI-wikipedia               6657
muri-it-MRI-culturax/mC4            4967
muri-it-xP3                         3600
muri-it-WikiHow                     3163
muri-it-OpenAssistant               3132
muri-it-MRI-culturax/OSCAR-2109      889
muri-it-MRI-culturax/OSCAR-2019      496
muri-it-MRI-culturax/OSCAR-2301      399
muri-it-MRI-culturax/OSCAR-2201       92
Name: count, dtype: int64

In [10]:
# Provenance metadata per origin: who produced the instruction and the output
# ("human" or a model name) and whether each side was translated.
# Origins that share the same provenance are listed together; every origin
# still gets its own independent dict.
origin2info = {
    origin: {
        "instruction": instruction,
        "instruction_translated": instruction_translated,
        "output": output,
        "output_translated": output_translated,
    }
    for instruction, instruction_translated, output, output_translated, origins in (
        (
            "human", False, "human", False,
            (
                "muri-it-OpenAssistant",
                "muri-it-WikiHow",
                "muri-it-flan_v2_tulu",
                "muri-it-xP3",
            ),
        ),
        ("gpt-3.5", False, "gpt-3.5-turbo", False, ("bactrian-x-alpaca",)),
        ("human", False, "gpt-3.5-turbo", False, ("bactrian-x-dolly",)),
        (
            "Mixtral-8x7B", False, "human", False,
            (
                "muri-it-MRI-wikipedia",
                "muri-it-MRI-culturax/mC4",
                "muri-it-MRI-culturax/OSCAR-2019",
                "muri-it-MRI-culturax/OSCAR-2109",
                "muri-it-MRI-culturax/OSCAR-2301",
                "muri-it-MRI-culturax/OSCAR-2201",
            ),
        ),
    )
    for origin in origins
}


def create_info(x):
    """Add the provenance columns for one example, looked up by its "origin".

    Intended as the function passed to the dataset's ``map``. Sets
    ``instruction_type``, ``instruction_translated``, ``output_type`` and
    ``output_translated`` on ``x`` in place and returns ``x``.

    Raises:
        KeyError: if ``x["origin"]`` has no entry in ``origin2info``.
    """
    info = origin2info[x["origin"]]
    x.update(
        {
            "instruction_type": info["instruction"],
            "instruction_translated": info["instruction_translated"],
            "output_type": info["output"],
            "output_translated": info["output_translated"],
        }
    )
    return x

# Apply create_info to every example, adding the columns instruction_type,
# instruction_translated, output_type and output_translated.
final_dataset = final_dataset.map(create_info)


Map:   0%|          | 0/177757 [00:00<?, ? examples/s]

Map:   0%|          | 0/7614 [00:00<?, ? examples/s]

Map:   0%|          | 0/7641 [00:00<?, ? examples/s]

In [12]:
# Upload the merged English collection to the Hugging Face Hub as a private dataset repository.
final_dataset.push_to_hub("ctu-aic/en_instruction_tuning_collection", private=True)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/178 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/8 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/ctu-aic/en_instruction_tuning_collection/commit/26be1a9f770442513519650ea1d0738419544977', commit_message='Upload dataset', commit_description='', oid='26be1a9f770442513519650ea1d0738419544977', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/ctu-aic/en_instruction_tuning_collection', endpoint='https://huggingface.co', repo_type='dataset', repo_id='ctu-aic/en_instruction_tuning_collection'), pr_revision=None, pr_num=None)

In [13]:
# Save the merged English collection to a local directory.
final_dataset.save_to_disk("data/it_datasets/en_instruction_tuning_collection")

Saving the dataset (0/1 shards):   0%|          | 0/177757 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7614 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/7641 [00:00<?, ? examples/s]