In [1]:
import os
from pathlib import Path
from datasets import load_dataset


local_dir =  "data" 
DATA_CACHE_DIR = os.path.join(Path(), local_dir)
os.makedirs(DATA_CACHE_DIR, exist_ok=True)

# download the dataset
dataset = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT", split="train", cache_dir=DATA_CACHE_DIR)


README.md:   0%|          | 0.00/25.5k [00:00<?, ?B/s]

Resolving data files:   0%|          | 0/2110 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/98 [00:00<?, ?it/s]

In [22]:
import time
import math
from functools import partial

import multiprocessing
#multiprocessing.set_start_method('fork')   
import concurrent.futures as cf
import numpy as np

import tiktoken

num_cpus = os.cpu_count()
print(f"""
sytem statistics:
-----------------
cpu count: {num_cpus}""")
total_docs = len(dataset)
docs_per_cpu = int(math.ceil(total_docs/num_cpus))
print(f"""
dataset statistics
------------------
documents: {total_docs:,}
docs_per_cpu: {docs_per_cpu}""")

enc = tiktoken.get_encoding("gpt2")

def count_tokens(dataset, enc, idx):
    tokens = enc.encode_ordinary(dataset[idx]['text'])
    return 1

f = partial(count_tokens, dataset, enc)


with cf.ProcessPoolExecutor(max_workers = num_cpus) as ex:
    start = time.time()
    documents = 0
    tokens = 0
    
    for result in ex.map(f, list(range(len(dataset))), chunksize=docs_per_cpu):
        documents += 1
        tokens += result
        documents % 100000 and print(f"processed {documents:,}", end="\r")
        
    print(f"Processed documents in {time.time()-start:0.2f} seconds")
    print(f"Total tokens: {tokens:,}")
    print(f"Total documents: {documents:,}")   


sytem statistics:
-----------------
cpu count: 30

dataset statistics
------------------
documents: 9,672,101
docs_per_cpu: 322404
processed 150,463

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 546,527

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 1,045,739

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 1,535,576

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 2,059,108

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 2,582,119

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 3,106,877

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 3,631,030

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 4,153,166

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 4,674,850

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 5,196,935

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 5,719,124

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 6,239,533

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 6,758,199

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 7,247,007

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 7,772,239

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 8,266,824

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 8,793,236

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



processed 9,317,536

IOPub data rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_data_rate_limit`.

Current values:
ServerApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
ServerApp.rate_limit_window=3.0 (secs)



In [19]:

SHARD_SIZE = int(1e8)
output_dir = "shards"

def tokenize(docs_per_cpu, shard_size, n):
    #print(shard_size, n)
    import os
    import gc
    
    from pathlib import Path
    from datasets import load_dataset
    import tiktoken
    
    # set up tokenizer
    enc = tiktoken.get_encoding("gpt2")
    eot = enc._special_tokens['<|endoftext|>']
    
    # load the dataset
    dataset = load_dataset("HuggingFaceFW/fineweb-edu", "sample-10BT", split="train", cache_dir=os.path.join(Path(), "data"))

    # create the np array
    shard = np.array([eot]*shard_size)
    shard_idx = 1

    docs_processed = 0
    shards = 0
    tokens = 0
    start = n*docs_per_cpu
    if len(dataset) - (n*docs_per_cpu) < docs_per_cpu:
        end = len(dataset)
    else:
        end = start + docs_per_cpu
    #print(start, end)
    for idx, d in enumerate(dataset[start:end]['text']):
        #print(f"{idx:,}", end="\r")
        new_tokens = enc.encode_ordinary(d)
        tokens += len(new_tokens)
        if shard_idx + len(new_tokens) > shard_size:
            #np.savez(os.path.join(Path(), "shards", f"fineweb_{n}_{shards}.npz"), shard)
            shard_idx = 1
            shards += 1
        else:
            shard_idx += len(new_tokens)
            shard[idx:idx+len(new_tokens)] = new_tokens
        docs_processed += 1
    return docs_processed, shards, tokens

t = partial(tokenize, docs_per_cpu, SHARD_SIZE)


with cf.ProcessPoolExecutor(max_workers = num_cpus) as ex:
    start = time.time()
    
    docs_processed = 0
    shards_written = 0
    tokens_generated = 0
    for d, s, t in ex.map(t, range(num_cpus)):
        docs_processed += d
        shards_written += s
        tokens_generated += t
        print(f"docs: {d:,}, shards: {s:,}, tokens: {t:,}")
        
    print(f"Processed documents in {time.time()-start:.2f} seconds")
    assert(sum([r[0] for r in results]) == total_docs)
    print(f"total shards written: {shards_written:,}")
    print(f"total tokens: {tokens_generated:,}")

Exception ignored in: <function tqdm.__del__ at 0x7379a9f12700>Exception ignored in: 
Exception ignored in: Exception ignored in: Exception ignored in: <function tqdm.__del__ at 0x7379a9f12700><function tqdm.__del__ at 0x7379a9f12700><function tqdm.__del__ at 0x7379a9f12700>Exception ignored in: Exception ignored in: Traceback (most recent call last):
Exception ignored in: <function tqdm.__del__ at 0x7379a9f12700>Exception ignored in: Exception ignored in: 
Exception ignored in: Exception ignored in: Exception ignored in: Exception ignored in: <function tqdm.__del__ at 0x7379a9f12700>Exception ignored in: 
<function tqdm.__del__ at 0x7379a9f12700>Exception ignored in: Exception ignored in: <function tqdm.__del__ at 0x7379a9f12700>Exception ignored in: Exception ignored in: Traceback (most recent call last):

Traceback (most recent call last):
<function tqdm.__del__ at 0x7379a9f12700><function tqdm.__del__ at 0x7379a9f12700><function tqdm.__del__ at 0x7379a9f12700><function tqdm.__del__

KeyboardInterrupt: 