In [1]:
import os
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from tokenizers import Tokenizer

# Disable the tokenizers library's internal thread pool.
# NOTE(review): presumably set because later cells tokenize with
# multi-process `Dataset.map(num_proc=...)` — confirm.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Location of the serialized tokenizer JSON. Set the TOKENIZER_FILE environment
# variable to run this notebook on another machine; when it is unset the
# original hard-coded path is used, so existing behaviour is unchanged.
tokenizer_file = os.environ.get(
    "TOKENIZER_FILE",
    '/home/user/Desktop/Python_Scripts/gpt_course/gpt_dt/tokenizer/tokenizer-01.json',
)

# NOTE(review): this constant is not referenced by any visible cell, and the
# dataset actually loaded below is "open-web-math/open-web-math" — confirm
# whether it is stale before relying on it.
HF_DATASET = "openwebtext"

# Tokenizer shared by every later cell in the notebook.
tokenizer = Tokenizer.from_file(tokenizer_file)

In [2]:
# Load the full "train" split of open-web-math, using 12 worker processes.
dataset = load_dataset(
    "open-web-math/open-web-math",
    split="train",
    num_proc=12,
)
# Bare expression so the notebook displays the dataset summary.
dataset

Resolving data files:   0%|          | 0/114 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/114 [00:00<?, ?it/s]

Dataset({
    features: ['url', 'text', 'date', 'metadata'],
    num_rows: 6315233
})

In [3]:
# Fixed seed so the 90/10 train/test partition is reproducible across runs.
seed_value = 42

# Replace the single Dataset with a DatasetDict holding "train" and "test".
# NOTE: this rebinds `dataset`, so the cell must not be re-run without first
# re-running the loading cell (a DatasetDict has no `train_test_split`).
dataset = dataset.train_test_split(
    test_size=0.10,
    seed=seed_value
)
# The previous bare `dataset.flatten()` call was removed: `flatten()` returns a
# new DatasetDict rather than modifying in place, so its result was discarded.
dataset

DatasetDict({
    train: Dataset({
        features: ['url', 'text', 'date', 'metadata'],
        num_rows: 5683709
    })
    test: Dataset({
        features: ['url', 'text', 'date', 'metadata'],
        num_rows: 631524
    })
})

In [4]:
def tokenize_function(examples):
    """Encode a batch of examples with the notebook-level `tokenizer`.

    Args:
        examples: Mapping with a "text" entry holding the batch of strings
            to encode (the form passed by a batched `Dataset.map`).

    Returns:
        A dict with a single key, "ids", holding one token-id sequence per
        input text, in the same order as the inputs.
    """
    encodings = tokenizer.encode_batch(examples["text"])
    token_ids = []
    for encoding in encodings:
        token_ids.append(encoding.ids)
    return {"ids": token_ids}

In [None]:
# Number of rows pulled out of the tokenized dataset per slice while counting
# and writing, so only one slice of token lists is held in memory at a time
# instead of the whole "ids" column.
WRITE_BATCH_SIZE = 10_000

# Token ids are stored as uint16, so every id must fit in 0..65535. Check the
# vocabulary size up front instead of silently wrapping ids in the output file.
# (Assumes ids lie in range(vocab_size), added tokens included.)
UINT16_CAPACITY = np.iinfo(np.uint16).max + 1
vocab_size = tokenizer.get_vocab_size()
if vocab_size > UINT16_CAPACITY:
    raise ValueError(
        f"Tokenizer vocab size {vocab_size} does not fit in uint16 "
        f"(max {UINT16_CAPACITY} distinct ids)"
    )


def _iter_id_batches(dset, batch_size=WRITE_BATCH_SIZE):
    """Yield the "ids" column of `dset` in slices of at most `batch_size` rows."""
    for start in range(0, len(dset), batch_size):
        yield dset[start : start + batch_size]["ids"]


for split, dset in dataset.items():

    # Tokenize the split; every original column is dropped so only "ids" remains.
    tokenized_dset = dset.map(
        tokenize_function,
        batched=True,
        num_proc=12,
        batch_size=1000,
        keep_in_memory=False,
        load_from_cache_file=False,
        remove_columns=dset.column_names,
        desc=f"Tokenizing {split} split"
    )

    filename = f"{split}_data.bin"

    # First pass: the memmap needs its final size before anything is written.
    print(f"Calculating total length for {filename}...")
    total_len = sum(
        len(ids)
        for id_batch in _iter_id_batches(tokenized_dset)
        for ids in id_batch
    )

    # Using uint16 for efficiency (vocab size was validated above).
    arr = np.memmap(filename, dtype=np.uint16, mode='w+', shape=(total_len,))

    # Second pass: concatenate each slice into one array and copy it into the
    # memmap at the running offset; documents are written back to back.
    idx = 0
    with tqdm(total=len(tokenized_dset), desc=f"Writing {filename}") as progress:
        for id_batch in _iter_id_batches(tokenized_dset):
            flat = np.concatenate(
                [np.asarray(ids, dtype=np.uint16) for ids in id_batch]
            )
            arr[idx : idx + len(flat)] = flat
            idx += len(flat)
            progress.update(len(id_batch))

    arr.flush()
    print(f"Finished {filename}. Total tokens: {total_len}")

Tokenizing train split (num_proc=12):   0%|          | 0/5683709 [00:00<?, ? examples/s]