This notebook creates three datasets and uploads them to the Hugging Face Hub:

- `stratified_equitoken_10m_curriculum`
- `stratified_10m_curriculum` 
- `babylm_2024_10m_curriculum`





In [None]:
# Number of worker processes handed to the `datasets` map/save calls
# (expect about 30 min with a single process).
NUM_PROC_MAP = 150
# When true, `save` also pushes the finished DatasetDict to the hub.
PUSH_TO_HF = True

In [None]:
import os

from dotenv import load_dotenv
load_dotenv()
import datasets
from datasets import DatasetDict
from datasets import load_dataset
import datasets
import torch
import json
from huggingface_hub import HfApi

As in the official [preprocessing repo](https://github.com/babylm/babylm_data_preprocessing), we get the number of words via `line.split()`:

In [None]:
def word_count(d):
    """Return the number of whitespace-separated words in the string `d`."""
    return len(d.split())


def word_count_dataset(dataset):
    """Return the total number of words over all strings in `dataset["text"]`."""
    return sum(word_count(text) for text in dataset["text"])

In [None]:
# Create the output folders; exist_ok keeps this cell re-runnable.
os.makedirs("curricula", exist_ok=True)
os.makedirs("configs", exist_ok=True)

In [None]:
import os.path


In [None]:
def save(dataset, dataset_eval, curriculum, args=None):
    """Bundle the train and validation splits, store them on disk and optionally push them to the hub.

    Args:
        dataset: split stored under the "train" key.
        dataset_eval: split stored under the "validation" key.
        curriculum: not used by this function. When `args` is omitted, the
            value passed in this position is interpreted as `args`, so the
            three-argument form `save(dataset, dataset_eval, args)` works too.
        args: mapping with the keys "dataset_folder" (target directory of
            `save_to_disk`) and "name" (repo id used for `push_to_hub`).
    """
    if args is None:
        # Called as save(dataset, dataset_eval, args).
        args = curriculum

    ds = DatasetDict({
        "train": dataset,
        "validation": dataset_eval,
    })
    ds.save_to_disk(args["dataset_folder"], num_proc=NUM_PROC_MAP)

    if PUSH_TO_HF:
        ds.push_to_hub(repo_id=args["name"], private=True)
     

def add_source(entry, source, stage):
    """Tag `entry` in place with its origin (`source`) and curriculum `stage`, then return it."""
    for key, value in (("source", source), ("stage", stage)):
        entry[key] = value
    return entry

### Curriculum with 10M 2024 dataset

Extract the `.train` files to a folder named `train_10M`.

In [None]:
# Settings for the curriculum built from the BabyLM 2024 10M release.
args = {
    # where the finished DatasetDict is written
    "dataset_folder": "./curricula/datasets/curriculum_10M_2024",
    "name": "babylm_2024_10m_curriculum",
    # folders holding the extracted `.train` files
    "raw_dataset_folder_babylm": "./train_10M",
    "raw_eval_dataset_folder_babylm": "./train_100M",
    "epochs_per_stage": 2,
    # stage name -> files that make up the stage
    "curriculum": {
        "C1: Child Directed Speech": ["childes.train"],
        "C2: Unscripted Dialogue": ["switchboard.train", "bnc_spoken.train"],
        "C3: Scripted Dialogue": ["open_subtitles.train"],
        "C4: Wiki": ["simple_wiki.train"],
        "C5: Written English": ["gutenberg.train"],
    },
}

In [None]:
def create_dataset(curriculum, raw_dataset_folder):
    """Load every curriculum stage from text files and return one shuffled dataset per stage.

    `curriculum` maps a stage name to the file names (relative to
    `raw_dataset_folder`) belonging to that stage. Every row is tagged with
    its file and stage through `add_source`; the rows of a stage are then
    concatenated and shuffled with a fixed seed.

    Returns:
        A list with one dataset per stage, in the iteration order of `curriculum`.
    """
    datasets_stages = []
    for stage, files in curriculum.items():
        parts = []
        for file in files:
            path = os.path.join(raw_dataset_folder, file)
            raw = load_dataset("text", data_files=path, download_mode="force_redownload")["train"]
            tagged = raw.map(lambda entry: add_source(entry, file, stage), num_proc=NUM_PROC_MAP)
            parts.append(tagged)
        # rows are shuffled within their stage only
        stage_dataset = datasets.concatenate_datasets(parts).shuffle(seed=42)
        datasets_stages.append(stage_dataset)

    return datasets_stages

In [None]:


        


# pretraining data
torch.manual_seed(0)
datasets_stages = create_dataset(args["curriculum"], args["raw_dataset_folder_babylm"])
dataset = datasets.concatenate_datasets(datasets_stages)

# eval data is a split of (100M dataset - 10M dataset)
# of size 0.05*len(10M dataset)
torch.manual_seed(0)
# create_dataset returns a single list with one dataset per stage, so it must not be unpacked
eval_datasets_stages = create_dataset(args["curriculum"], args["raw_eval_dataset_folder_babylm"])
dataset_eval = datasets.concatenate_datasets(eval_datasets_stages)
dataset_set = set(dataset["text"])  # to speed up lookup
# remove all strings that are in the train dataset
dataset_eval = dataset_eval.filter(lambda x: x["text"] not in dataset_set)


# do a stratified split, requires casting to class
def copy_source_col(x):
    """Duplicate the "stage" column into "stage_", which is class-encoded for the stratified split."""
    x["stage_"] = x["stage"]
    return x


dataset_eval = dataset_eval.map(copy_source_col, num_proc=NUM_PROC_MAP)
dataset_eval = dataset_eval.class_encode_column("stage_")
dataset_eval = dataset_eval.train_test_split(
    test_size=int(len(dataset) * 0.05),
    seed=42,
    stratify_by_column="stage_",
)["test"]
dataset_eval = dataset_eval.remove_columns("stage_")
dataset_eval = dataset_eval.shuffle(seed=42)
# converting to dict and back to speed up substantially (or substantial slowdown caused by conversions above)
dataset_eval = datasets.Dataset.from_dict(dataset_eval.to_dict())

# save() takes (dataset, dataset_eval, curriculum, args)
save(dataset, dataset_eval, args["curriculum"], args)

    


### Stratified 10M Curriculum 

This creates 5 stages of equal size (measured in words, counted via `line.split()`), totaling 10M words, from the raw datasets.

In [None]:
!mkdir babylm_data_preprocessing/tmp
!git clone https://github.com/babylm/babylm_data_preprocessing.git
!git clone https://github.com/pgcorpus/gutenberg.git babylm_data_preprocessing/tmp/gutenberg


First, download the raw datasets as described in the official [preprocessing repo](https://github.com/babylm/babylm_data_preprocessing). Note that the link to `simplewiki` has expired, so we use a more recent dump.

In [None]:


# !mkdir babylm_data_preprocessing/preprocessed_data

# !curl https://raw.githubusercontent.com/phueb/BabyBERTa/master/data/corpora/aochildes.txt > babylm_data_preprocessing/preprocessed_data/aochildes.txt
# !curl https://raw.githubusercontent.com/NathanDuran/Switchboard-Corpus/master/swda_data/full_set.txt > babylm_data_preprocessing/preprocessed_data/switchboard.txt
# !curl https://raw.githubusercontent.com/NathanDuran/Switchboard-Corpus/master/swda_data/full_set.txt > babylm_data_preprocessing/preprocessed_data/switchboard.txt

# !curl http://www.thespermwhale.com/jaseweston/babi/CBTest.tgz > babylm_data_preprocessing/preprocessed_data/CBTest.tgz
# !tar -xvzf babylm_data_preprocessing/preprocessed_data/CBTest.tgz -C babylm_data_preprocessing/preprocessed_data
# !mv babylm_data_preprocessing/preprocessed_data/CBTest/data/cbt_*  babylm_data_preprocessing/preprocessed_data/
# !rm -rf babylm_data_preprocessing/preprocessed_data/CBTest babylm_data_preprocessing/preprocessed_data/CBTest.tgz

# !gdown 1nbUCWCAvtqI1-WQxzmyqQmddgsZtzdpR
# !unzip -o children_stories.txt.zip -d babylm_data_preprocessing/preprocessed_data
# !rm children_stories.txt.zip

# !gdown 1vW0o7K6Gj_IYTzriWEjmCnrylCWb8DbY
# !unzip -o open_subtitles.txt.zip -d babylm_data_preprocessing/preprocessed_data
# !rm open_subtitles.txt.zip

# !gdown 19GipY95MW3LrfO_kArmIC0KYy7mfCb1l
# !unzip -o wikipedia.txt.zip -d babylm_data_preprocessing/preprocessed_data
# !rm wikipedia.txt.zip
# !gdown 1R2xWtNeVX48RiFA7vErL1pNtws3XEsYP
# !unzip -o qed.zip -d babylm_data_preprocessing/tmp
# !rm qed.zip

# %run babylm_data_preprocessing/preprocess_qed.py babylm_data_preprocessing/tmp/en babylm_data_preprocessing/tmp/qed
# # !cat babylm_data_preprocessing/tmp/qed/* >> babylm_data_preprocessing/preprocessed_data/qed.txt
# !find babylm_data_preprocessing/tmp/qed/ -type f -exec cat {} + > babylm_data_preprocessing/preprocessed_data/qed.txt
# !rm -rf babylm_data_preprocessing/tmp/qed babylm_data_preprocessing/tmp/en 



# # simplewiki
# !curl https://dumps.wikimedia.org/simplewiki/20241101/simplewiki-20241101-pages-articles.xml.bz2 > babylm_data_preprocessing/tmp/wiki.bz2 # TODO backup
# !bzip2 -d babylm_data_preprocessing/tmp/wiki.bz2
# !python -m wikiextractor.WikiExtractor babylm_data_preprocessing/tmp/wiki -o babylm_data_preprocessing/tmp/wiki_txt

# # https://github.com/babylm/babylm_data_preprocessing/blob/main/preprocess_simple_wiki.py
# # have to change working dir
# import os
# import re

# out_file = open(os.path.join("babylm_data_preprocessing", "preprocessed_data/simple_wiki.txt"), "w")
# wiki_dir = os.path.join("babylm_data_preprocessing","tmp", "wiki_txt")
# for d1 in os.listdir(wiki_dir):
# 	for f in os.listdir(os.path.join(wiki_dir, d1)):
# 		with open(os.path.join(wiki_dir, d1, f)) as input:
# 			title = None
# 			doc = []
# 			for line in input:
# 				if line.startswith("<doc"):
# 					line = next(input)
# 					title = line
# 				elif re.match(r"^\s*$", line):
# 					continue
# 				elif "</doc>" in line:
# 					if len(doc) > 0:
# 						out_file.write(title)
# 						out_file.write("".join(doc))
# 						out_file.write("\n")
# 						doc = []
# 				else:
# 					doc.append(line)
# !rm -rf babylm_data_preprocessing/tmp/wiki babylm_data_preprocessing/tmp/wiki_txt					

# !mkdir babylm_data_preprocessing/tmp/bnc_spoken
# !curl https://llds.ling-phil.ox.ac.uk/llds/xmlui/bitstream/handle/20.500.14106/2554/2554.zip > babylm_data_preprocessing/tmp/bnc_spoken/bnc.zip
# !unzip -q babylm_data_preprocessing/tmp/bnc_spoken/bnc.zip -d babylm_data_preprocessing/tmp/bnc_spoken/
# !(for z in babylm_data_preprocessing/tmp/bnc_spoken/download/Texts/*; do for y in $z/*; do for x in $y/*; do sed '2q;d' $x | grep "^<stext" -q && cp $x babylm_data_preprocessing/tmp/bnc_spoken/; done; done; done)
# %run babylm_data_preprocessing/preprocess_bnc.py babylm_data_preprocessing/tmp/bnc_spoken/ babylm_data_preprocessing/preprocessed_data/bnc_spoken.txt
# !rm -rf babylm_data_preprocessing/tmp/bnc_spoken


# # the get_data.py script ignores the `metadata` param
# %cd babylm_data_preprocessing/tmp/gutenberg 
# !mkdir metadata

# # this can take a day or two...
# %run get_data.py 

# %cd babylm_data_preprocessing/tmp/gutenberg 
# !mkdir metadata
# # the repo contains a tokenizer but for an unspecified version of nltk
# # we download a current one
# # see https://github.com/pgcorpus/gutenberg/issues/5
# import nltk
# nltk.data.path=["src/nltk_data"]
# nltk.download('punkt_tab',download_dir='src/nltk_data')
# %run process_data.py
# %cd ../../..
# # from the babylm preprocessing repo: https://github.com/babylm/babylm_data_preprocessing/blob/main/get_gutenberg_modern_en.py
# import pandas as pd
# import shutil
# import os

# df = pd.read_csv("babylm_data_preprocessing/tmp/gutenberg/metadata/metadata.csv")
# df_modern_en = df[(df["language"] == "['en']") & (df["authoryearofbirth"] > 1850)]
# modern_en_ids = set(df_modern_en["id"])

# os.makedirs("babylm_data_preprocessing/tmp/gutenberg_modern_en")
# for f in os.listdir("babylm_data_preprocessing/tmp/gutenberg/data/text"): 
#     if f.split("_")[0] in modern_en_ids:
#         shutil.copyfile("babylm_data_preprocessing/tmp/gutenberg/data/text/" + f, "babylm_data_preprocessing/tmp/gutenberg_modern_en/" + f)
# !find babylm_data_preprocessing/tmp/gutenberg_modern_en/ -type f -exec cat {} + > babylm_data_preprocessing/preprocessed_data/gutenberg.txt



# you may delete the tmp/gutenberg dir now 
# # !rm -rf babylm_data_preprocessing/tmp/gutenberg
# # !rm -rf metadata 

In [None]:
args["name"] = "stratified_10m_curriculum"
# Use a dedicated output folder; otherwise save() writes this dataset into the
# folder that already holds the previously saved curriculum.
args["dataset_folder"] = "./curricula/datasets/stratified_10m_curriculum"

args["epochs_per_stage"] = 10
args["raw_dataset_folder"] = "./babylm_data_preprocessing/preprocessed_data"
# stage name -> preprocessed text files that make up the stage
args["curriculum"] = {
    "C1: Child Directed Speech" : ["aochildes.txt", ],
    "C2: Children's Books": ["children_stories.txt", "cbt_test.txt", "cbt_train.txt","cbt_valid.txt"],
    "C3: Dialogue": ["switchboard.txt", "bnc_spoken.txt", "open_subtitles.txt"],
    "C4: Educational": ["qed.txt", "simple_wiki.txt"],
    "C5: Written English": ["wikipedia.txt", "gutenberg.txt"]

} 

In [None]:
from datasets import load_dataset,Features,Value

In [None]:

# Per-stage train and eval datasets, appended to by the loop later in this cell.
datasets_stages, datasets_stages_eval = [], []

# Word budget of one stage: 10M words divided evenly among the curriculum stages.
BUDGET_PER_STAGE = 10_000_000 // len(args["curriculum"])
# Eval budget, expressed as a fraction of the per-stage train budget.
SIZE_EVAL_SPLIT = 0.05

def get_train_test_splits_for_stage(d):
    """Split one stage into a train part and an eval part by word budget.

    The stage is shuffled with a fixed seed. Documents are then taken in order
    for the train split for as long as their cumulative word count stays
    within BUDGET_PER_STAGE. The first document that does not fit is skipped,
    and the eval split is filled the same way from the documents after it,
    with a budget of BUDGET_PER_STAGE * SIZE_EVAL_SPLIT. Neither split exceeds
    its budget; a stage that runs out of documents simply yields smaller
    (possibly empty) splits.

    Returns:
        A `(train, eval)` tuple of `datasets.Dataset` objects.
    """
    d = d.shuffle(seed=42)
    n_docs = len(d)

    def fill_budget(start, budget):
        """Return the exclusive end index of the run beginning at `start` that fits into `budget` words."""
        used = 0
        end = start
        # Check the bound before indexing so that stages smaller than the
        # budget do not raise an IndexError.
        while end < n_docs:
            wc = word_count(d[end]["text"])
            if used + wc > budget:
                break
            used += wc
            end += 1
        return end

    train_end = fill_budget(0, BUDGET_PER_STAGE)
    # skip the document that did not fit into the train budget
    eval_start = min(train_end + 1, n_docs)
    eval_end = fill_budget(eval_start, BUDGET_PER_STAGE * SIZE_EVAL_SPLIT)

    #      pretrain                                         eval
    return datasets.Dataset.from_dict(d[0:train_end]), datasets.Dataset.from_dict(d[eval_start:eval_end])


torch.manual_seed(0)
# stage name -> all rows of the stage, each tagged with its source file and stage
chunks = {
    stage: datasets.concatenate_datasets(
        [
            load_dataset("text", data_files=os.path.join(args["raw_dataset_folder"], file), download_mode="force_redownload")["train"]
            .map(lambda entry: add_source(entry, file, stage), num_proc=NUM_PROC_MAP)
            for file in files
        ]
    )
    for stage, files in args["curriculum"].items()
}

for name, d in chunks.items():
    print("Processing", name)
    d, d_eval = get_train_test_splits_for_stage(d)
    datasets_stages.append(d)
    datasets_stages_eval.append(d_eval)


# pretraining data
dataset = datasets.concatenate_datasets(datasets_stages)


# eval data
torch.manual_seed(0)
dataset_eval = datasets.concatenate_datasets(datasets_stages_eval)
dataset_eval = dataset_eval.shuffle(seed=42)


# save() takes (dataset, dataset_eval, curriculum, args)
save(dataset, dataset_eval, args["curriculum"], args)


In [None]:
import os
from datasets import load_dataset, concatenate_datasets

chunks = {}

# Iterate through each stage and its corresponding files in the curriculum
for stage, files in args["curriculum"].items():
    datasets_list = []  # List to store datasets for concatenation

    for file in files:
        file_path = os.path.join(args["raw_dataset_folder"], file)
        print(file)
        print(file_path)
        # Load dataset from the text file. A dedicated name is used here:
        # assigning to `dataset` would replace the pretraining dataset built
        # earlier in the notebook with a single raw file.
        file_dataset = load_dataset(
            "text",
            data_files=file_path,
            download_mode="force_redownload"  # Ensures fresh download if needed
        )["train"]  # Access the 'train' split
        print(len(file_dataset))
        datasets_list.append(file_dataset)  # Add dataset to the list

    # Concatenate all datasets for the current stage
    chunks[stage] = concatenate_datasets(datasets_list)


## Equal Sized Documents version

Create documents with length between 300-310 words.

Note that we cannot use a tokenizer trained on external data for the BabyLM challenge (or on any data that is not part of our final dataset).

The eval set is made from raw documents/reused from the other curriculum.

In [None]:
args["name"] = "stratified_equitoken_10m_curriculum"
# Use a dedicated output folder; otherwise save() writes this dataset into the
# folder that already holds a previously saved curriculum.
args["dataset_folder"] = "./curricula/datasets/stratified_equitoken_10m_curriculum"

args["epochs_per_stage"] = 10
args["raw_dataset_folder"] = "./babylm_data_preprocessing/preprocessed_data"
# stage name -> preprocessed text files that make up the stage
args["curriculum"] = {
    "C1: Child Directed Speech" : ["aochildes.txt", ],
    "C2: Children's Books": ["children_stories.txt", "cbt_test.txt", "cbt_train.txt","cbt_valid.txt"],
    "C3: Dialogue": ["switchboard.txt", "bnc_spoken.txt", "open_subtitles.txt"],
    "C4: Educational": ["qed.txt", "simple_wiki.txt"],
    "C5: Written English": ["wikipedia.txt", "gutenberg.txt"]

} 

In [None]:
# Per-stage result lists for this section.
datasets_stages, datasets_stages_eval = [], []

# Word budget of one stage: 10M words divided evenly among the curriculum stages.
BUDGET_PER_STAGE = 10_000_000 // len(args["curriculum"])
# Eval budget, expressed as a fraction of the per-stage train budget.
SIZE_EVAL_SPLIT = 0.05

def get_train_split_for_stage_equitoken(d):
    """Pack the rows of one stage into documents of equal word count.

    Rows are appended, in order, to a running document that is capped at
    MAX_LENGTH words (words beyond the cap are dropped). Once the running
    document holds at least MIN_LENGTH words it is emitted together with the
    sorted, comma-separated names of the sources that contributed to it and
    the stage of the last contributing row. Packing stops when another
    MIN_LENGTH words would exceed BUDGET_PER_STAGE or when the rows run out;
    an unfinished trailing document is discarded.

    Returns:
        A `datasets.Dataset` with the columns "text", "source" and "stage",
        shuffled with a fixed seed.
    """
    # d = d.shuffle(seed=42)
    # NOTE(review): the notebook text for this section mentions 300-310 words,
    # but both bounds are 100 here; confirm which one is intended.
    MAX_LENGTH = 100
    MIN_LENGTH = 100

    def extend(doc, new, max_length=MAX_LENGTH):
        """Append `new` to `doc` and cut trailing words until at most `max_length` words remain."""
        # The joining space is added even when `doc` is empty, so every
        # emitted document starts with a space.
        result = doc + " " + new
        while word_count(result) > max_length:
            result = result[0:result.rindex(" ")]
        return result

    train = []
    doc = ""
    sources = set()
    words_in_train_split = 0
    n_rows = len(d)
    i = 0
    while (BUDGET_PER_STAGE >= (words_in_train_split + MIN_LENGTH)) and (i < n_rows):
        row = d[i]  # fetch the row once instead of once per column
        doc = extend(doc, row["text"])
        sources.add(row["source"])
        if word_count(doc) >= MIN_LENGTH:
            # sorted() makes the "source" string independent of set iteration order
            train.append({"text": doc, "source": ", ".join(sorted(sources)), "stage": row["stage"]})
            words_in_train_split += word_count(doc)
            doc = ""
            sources = set()
        i += 1

    #      pretrain
    return datasets.Dataset.from_list(train).shuffle(seed=42)


torch.manual_seed(0)
# stage name -> all rows of the stage, each tagged with its source file and stage
chunks = {
    stage: datasets.concatenate_datasets(
        [
            load_dataset("text", data_files=os.path.join(args["raw_dataset_folder"], file), download_mode="force_redownload")["train"]
            .map(lambda entry: add_source(entry, file, stage), num_proc=NUM_PROC_MAP)
            for file in files
        ]
    )
    for stage, files in args["curriculum"].items()
}

for name, d in chunks.items():
    print("Processing", name)
    d = get_train_split_for_stage_equitoken(d)
    datasets_stages.append(d)


# pretraining data
dataset = datasets.concatenate_datasets(datasets_stages)


# eval data: the validation split of the stratified curriculum is loaded from the hub
torch.manual_seed(0)
dataset_eval = load_dataset("loris3/stratified_10m_curriculum")["validation"]


# save() takes (dataset, dataset_eval, curriculum, args)
save(dataset, dataset_eval, args["curriculum"], args)
