In [1]:
import os
import time
import datasets
import tiktoken
import numpy as np
from datasets import load_dataset

In [2]:
# Hugging Face dataset coordinates (repo, config, split) and the local
# directory that the tokenized shards are written to.
dataset_hf_path = "HuggingFaceFW/fineweb-edu"
dataset_hf_name = "sample-10BT"
dataset_hf_split = "train"
data_save_folder = "../data/fineweb-edu-sample-10BT"

# Sanity check: the requested config must be one the dataset repo exposes.
dataset_hf_names = datasets.get_dataset_config_names(dataset_hf_path)
print(f"Dataset configs: {dataset_hf_names[:5]}")  # only preview the first five
assert dataset_hf_name in dataset_hf_names

# Sanity check: the requested split must exist for that config.
dataset_hf_splits = datasets.get_dataset_split_names(
    dataset_hf_path,
    config_name=dataset_hf_name,
)
print(f"Dataset splits: {dataset_hf_splits}")
assert dataset_hf_split in dataset_hf_splits

Resolving data files:   0%|          | 0/2410 [00:00<?, ?it/s]

Dataset configs: ['default', 'sample-10BT', 'sample-100BT', 'sample-350BT', 'CC-MAIN-2025-05']


Resolving data files:   0%|          | 0/2410 [00:00<?, ?it/s]

Dataset splits: ['train']


In [3]:
# Create the output directory (and any missing parents) up front;
# exist_ok=True makes this cell safe to re-run.
os.makedirs(data_save_folder, exist_ok=True)

In [4]:
# Load the selected config/split as a regular (non-streaming) dataset.
# Passing streaming=True here is the alternative that was left disabled.
fw = load_dataset(dataset_hf_path, name=dataset_hf_name, split=dataset_hf_split)

Resolving data files:   0%|          | 0/2410 [00:00<?, ?it/s]

sample/10BT/000_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/001_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/002_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/003_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/004_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/005_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/006_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/007_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/008_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/009_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/010_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/011_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/012_00000.parquet:   0%|          | 0.00/2.15G [00:00<?, ?B/s]

sample/10BT/013_00000.parquet:   0%|          | 0.00/541M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/9672101 [00:00<?, ? examples/s]

Loading dataset shards:   0%|          | 0/98 [00:00<?, ?it/s]

In [5]:
# GPT-2 BPE tokenizer; every token id it produces fits in a uint16.
tokenizer = tiktoken.get_encoding("gpt2")
# Id of the '<|endoftext|>' special token, used as the document delimiter.
# Read it through the public `eot_token` property rather than the private
# `_special_tokens` mapping.
eot = tokenizer.eot_token

In [6]:
def save_buff(tokens_shard, shard_idx):
    """Write one shard of token ids into ``data_save_folder`` as a uint16 ``.npy`` file.

    Shard index 0 is written under the 'val' split name; every other index is
    written under 'train'.

    Args:
        tokens_shard: Sequence of integer token ids, each expected in [0, 2**16).
        shard_idx: Integer shard index; zero-padded to six digits in the file name.

    Raises:
        AssertionError: If any token id lies outside the uint16 range.
    """
    # Go through int32 first so ids outside the uint16 range can be detected
    # by the check below before the narrowing cast.
    wide = np.array(tokens_shard, dtype=np.int32)
    assert np.all(wide >= 0) and np.all(wide < 2**16)
    tokens_np = wide.astype(np.uint16)

    if shard_idx == 0:
        split = 'val'
    else:
        split = 'train'
    data_filepath = os.path.join(
        data_save_folder,
        f"edufineweb_{split}_{shard_idx:06d}.npy",
    )
    np.save(data_filepath, tokens_np)
    print(f"Saved {data_filepath} with shape {tokens_np.shape} and dtype {tokens_np.dtype}")

In [7]:
# Tokenize every document in `fw` and write the resulting token stream to disk
# in fixed-size shards via save_buff().
tokens_running_buff = []  # tokens produced so far that are not yet in a shard
shard_idx = 0
shard_size = 100_000_000  # tokens per shard
assert shard_size > 0, "shard_size must be positive or the flush loop below never ends"
total_num_tokens = 0
total_expected_tokens = 10_000_000_000  # 10 billion tokens; only used for the ETA estimate
log_interval_sec = 15  # minimum time between progress lines
time_start = time.time()
ts = time_start  # time of the last progress line
for item in fw:
    # Prefix each document with the end-of-text token so that document
    # boundaries are preserved in the flat token stream.
    tokens = [eot] + tokenizer.encode_ordinary(item['text'])
    tokens_running_buff.extend(tokens)
    total_num_tokens += len(tokens)

    # Print progress periodically
    tnow = time.time()
    if tnow - ts > log_interval_sec:
        ts = tnow

        tok_per_sec = total_num_tokens / (tnow - time_start)
        # The expected total is only an estimate; clamp so the ETA never goes negative.
        remaining_tokens = max(total_expected_tokens - total_num_tokens, 0)
        print(f"Processed {total_num_tokens:,} / {total_expected_tokens:,} tokens so far, {tok_per_sec:,.0f} tok/sec, ETA: {remaining_tokens / tok_per_sec / 3600:.2f} hours")

    # Flush every complete shard. `while` (not `if`) so that a document which
    # leaves more than one shard's worth of tokens buffered is fully drained
    # and no shard ever exceeds shard_size.
    while len(tokens_running_buff) >= shard_size:
        save_buff(tokens_running_buff[:shard_size], shard_idx)
        # Drop the saved prefix in place instead of rebuilding the remainder list.
        del tokens_running_buff[:shard_size]
        shard_idx += 1

# Save any remaining tokens as a final, shorter shard
if len(tokens_running_buff) > 0:
    save_buff(tokens_running_buff, shard_idx)

Processed 34,945,541 / 10,000,000,000 tokens so far, 2,329,667 tok/sec, ETA: 1.19 hours
Processed 67,498,162 / 10,000,000,000 tokens so far, 2,249,907 tok/sec, ETA: 1.23 hours
Saved ../data/fineweb-edu-sample-10BT/edufineweb_val_000000.npy with shape (100000000,) and dtype uint16
Processed 100,003,591 / 10,000,000,000 tokens so far, 1,888,544 tok/sec, ETA: 1.46 hours
Processed 133,949,951 / 10,000,000,000 tokens so far, 1,971,220 tok/sec, ETA: 1.39 hours
Processed 167,995,258 / 10,000,000,000 tokens so far, 2,025,187 tok/sec, ETA: 1.35 hours
Saved ../data/fineweb-edu-sample-10BT/edufineweb_train_000001.npy with shape (100000000,) and dtype uint16
Processed 200,000,746 / 10,000,000,000 tokens so far, 1,942,744 tok/sec, ETA: 1.40 hours
Processed 235,371,321 / 10,000,000,000 tokens so far, 1,995,558 tok/sec, ETA: 1.36 hours
Processed 270,688,704 / 10,000,000,000 tokens so far, 2,036,048 tok/sec, ETA: 1.33 hours
Saved ../data/fineweb-edu-sample-10BT/edufineweb_train_000002.npy with shape (