In [None]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/5aa5257608b61e8fcc828e99fbd070d5ca7358e3/mp.py

In [2]:
from glob import glob
from tqdm import tqdm
import json
import os
import numpy as np
from transformers import AutoTokenizer
from streaming import MDSWriter
from tqdm import tqdm
import msgspec

In [3]:
from streaming.base.format.mds.encodings import Encoding, _encodings

class UInt16(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint16)

_encodings['uint16'] = UInt16

In [4]:
columns = {
    'input_ids': 'uint16',
}
compression = 'zstd'
hashes = 'sha1', 'xxh64'

In [5]:
split_by = 5000000

In [6]:
!mkdir partitions

mkdir: cannot create directory ‘partitions’: File exists


In [7]:
index = 0
count = 0
a = open(f'partitions/combined-lm-{index}.jsonl', 'w')

with open('prepare-dedup-text-dataset.jsonl') as fopen:
    for l in tqdm(fopen):
        a.write(l)
        a.flush()
        count += 1
        if count >= split_by:
            a.close()
            index += 1
            count = 0
            a = open(f'partitions/combined-lm-{index}.jsonl', 'w')
            
a.close()

194211216it [31:11, 103777.81it/s]


In [8]:
block_size = 4096

def read_dataset(train_file, block_size = block_size):
    
    tokenizer = AutoTokenizer.from_pretrained(
        'malaysia-ai/bpe-tokenizer',
    )
    tokenizer.add_bos_token = False
    tokenizer.add_eos_token = False
    text_column_name = 'text'
    temp = []
    with open(train_file) as fopen:
        for l in fopen:
            l = msgspec.json.decode(l)
            tokenized = tokenizer(l['text'])['input_ids']
            temp.extend(tokenized)
            while len(temp) >= block_size:
                block = temp[:block_size]
                temp = temp[block_size:]
                if len(block) == block_size:
                    yield np.array(block).astype(np.uint16)

In [9]:
files = sorted(glob('partitions/combined-lm-*.jsonl'), key = lambda x: int(x.split('-')[-1].replace('.jsonl', '')))
files

['partitions/combined-lm-0.jsonl',
 'partitions/combined-lm-1.jsonl',
 'partitions/combined-lm-2.jsonl',
 'partitions/combined-lm-3.jsonl',
 'partitions/combined-lm-4.jsonl',
 'partitions/combined-lm-5.jsonl',
 'partitions/combined-lm-6.jsonl',
 'partitions/combined-lm-7.jsonl',
 'partitions/combined-lm-8.jsonl',
 'partitions/combined-lm-9.jsonl',
 'partitions/combined-lm-10.jsonl',
 'partitions/combined-lm-11.jsonl',
 'partitions/combined-lm-12.jsonl',
 'partitions/combined-lm-13.jsonl',
 'partitions/combined-lm-14.jsonl',
 'partitions/combined-lm-15.jsonl',
 'partitions/combined-lm-16.jsonl',
 'partitions/combined-lm-17.jsonl',
 'partitions/combined-lm-18.jsonl',
 'partitions/combined-lm-19.jsonl',
 'partitions/combined-lm-20.jsonl',
 'partitions/combined-lm-21.jsonl',
 'partitions/combined-lm-22.jsonl',
 'partitions/combined-lm-23.jsonl',
 'partitions/combined-lm-24.jsonl',
 'partitions/combined-lm-25.jsonl',
 'partitions/combined-lm-26.jsonl',
 'partitions/combined-lm-27.jsonl',
 '

In [10]:
next(read_dataset(files[1]))

array([    1,   224, 22783, ..., 17580,  1156,   235], dtype=uint16)

In [11]:
!rm -rf tokenized_indexes
!mkdir tokenized_indexes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
def loop(files):
    files, index = files
    out_root = f'tokenized_indexes/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    with MDSWriter(out=out_root, columns=columns, compression=compression, hashes=hashes) as out:
        for f in files:
            for block in tqdm(read_dataset(train_file = f)):
                sample = {
                    'input_ids': block
                }
                out.write(sample)

In [13]:
import mp
mp.multiprocessing(files, loop, cores = 20, returned = False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

693053it [1:26:16, 140.96it/s]

In [None]:
from streaming import StreamingDataset

total = 0
for f in glob('tokenized_indexes/tokenized-*'):
    dataset = StreamingDataset(local = f)
    total += len(dataset)
total * block_size