In [1]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/5aa5257608b61e8fcc828e99fbd070d5ca7358e3/mp.py

import os
import shutil
from glob import glob

import msgspec
import numpy as np
from streaming import MDSWriter
from tqdm import tqdm
from transformers import AutoTokenizer

import mp

In [2]:
# Input corpus in JSON Lines format (one JSON document per line).
train_file = 'combine-lm.jsonl'

In [3]:
# Maximum number of corpus lines written to a single partition file.
split_by = 5_000_000

In [4]:
# Start from an empty `partitions` directory: delete any previous one, then recreate it.
!rm -rf partitions
!mkdir partitions

In [5]:
# Split the corpus into `partitions/combined-lm-<index>.jsonl` files holding at
# most `split_by` lines each, so that later cells can process them separately.
index = 0   # number of the partition currently being filled
count = 0   # lines written to the current partition so far
a = None    # handle of the current partition; None while no partition is open

try:
    with open(train_file) as fopen:
        for l in tqdm(fopen):
            # Open a partition only once there is a line to put in it. This
            # avoids leaving an empty trailing file when the corpus length is
            # an exact multiple of `split_by` (an empty corpus yields no files).
            if a is None:
                a = open(f'partitions/combined-lm-{index}.jsonl', 'w')
            # No per-line flush: close() below flushes the buffered data.
            a.write(l)
            count += 1
            if count >= split_by:
                a.close()
                a = None
                index += 1
                count = 0
finally:
    # Close the last (possibly partial) partition even if the loop failed.
    if a is not None:
        a.close()

1870895it [02:54, 650.87it/s]  

In [6]:
block_size = 4096

def read_dataset(train_file, block_size = block_size):
    """
    Tokenize a JSON Lines file and yield fixed-length blocks of token ids.

    Each non-blank line is decoded as a JSON object and its 'text' field is
    tokenized with the TinyLlama tokenizer, whose `add_bos_token` and
    `add_eos_token` flags are switched off. The ids of consecutive lines are
    concatenated as-is (this function inserts no separator between documents)
    and cut into blocks of exactly `block_size` ids.

    Parameters
    ----------
    train_file : str
        Path of the JSONL file to read.
    block_size : int
        Number of token ids per yielded block. Defaults to the module-level
        `block_size` at definition time.

    Yields
    ------
    np.ndarray
        1-D array of dtype uint16 with exactly `block_size` elements.

    Notes
    -----
    Ids left over at the end of the file (fewer than `block_size`) are dropped.
    The uint16 cast is unchecked, so this assumes every token id fits in 16
    bits -- TODO confirm if the tokenizer is ever swapped for a larger one.
    """
    tokenizer = AutoTokenizer.from_pretrained(
        'TinyLlama/TinyLlama-1.1B-intermediate-step-955k-token-2T',
    )
    tokenizer.add_bos_token = False
    tokenizer.add_eos_token = False
    text_column_name = 'text'
    pending = []
    with open(train_file) as fopen:
        for l in fopen:
            # Tolerate blank lines (e.g. a trailing newline) instead of
            # letting the JSON decoder fail on them.
            if not l.strip():
                continue
            row = msgspec.json.decode(l)
            pending.extend(tokenizer(row[text_column_name])['input_ids'])
            # Walk over the buffer with an offset and trim it once per line,
            # rather than copying the whole remainder for every block emitted.
            consumed = 0
            while len(pending) - consumed >= block_size:
                block = pending[consumed:consumed + block_size]
                consumed += block_size
                yield np.array(block).astype(np.uint16)
            if consumed:
                del pending[:consumed]

In [7]:
def _partition_number(path):
    """Return the integer between the last '-' and '.jsonl' in a partition path."""
    tail = path.split('-')[-1]
    return int(tail.replace('.jsonl', ''))

# Partition files in numeric (not lexicographic) order of their index.
files = glob('partitions/combined-lm-*.jsonl')
files.sort(key = _partition_number)
files

In [8]:
# Smoke test: pull the first token block out of the first partition file.
next(read_dataset(files[0]))

In [9]:
from streaming.base.format.mds.encodings import Encoding, _encodings

class UInt16(Encoding):
    """Column encoding that stores an array as its raw bytes and reads it back as uint16."""

    def encode(self, obj) -> bytes:
        """Return the raw buffer of `obj` as produced by its `tobytes()` method."""
        raw = obj.tobytes()
        return raw

    def decode(self, data: bytes):
        """Reinterpret `data` as a flat NumPy array of uint16 values."""
        return np.frombuffer(data, dtype=np.uint16)

# Register the class under the column type name 'uint16'.
_encodings['uint16'] = UInt16

In [10]:
# Writer schema: a single 'input_ids' column stored with the 'uint16' encoding.
columns = dict(input_ids='uint16')
# NOTE(review): `loop` passes compression=None to MDSWriter, so this value is
# not applied there -- confirm which of the two is intended.
compression = 'zstd'
# Hash algorithm names handed to MDSWriter.
hashes = ('sha1', 'xxh64')

In [11]:
# Start from an empty `tokenized_indexes` output directory: delete any previous one, then recreate it.
!rm -rf tokenized_indexes
!mkdir tokenized_indexes

In [12]:
def loop(files):
    """
    Tokenize one chunk of partition files into its own MDS output directory.

    Parameters
    ----------
    files : tuple
        A `(paths, index)` pair: `paths` is an iterable of JSONL partition
        paths and `index` names the output directory
        `tokenized_indexes/tokenized-<index>`. Presumably this is the shape
        `mp.multiprocessing` hands to each worker -- verify against mp.py.

    Returns
    -------
    None
        Results are written to disk only.
    """
    # Unpack into fresh names rather than shadowing the `files` parameter.
    paths, index = files
    out_root = f'tokenized_indexes/tokenized-{index}'
    # Drop any output of a previous run; a missing directory is not an error.
    shutil.rmtree(out_root, ignore_errors=True)
    # NOTE(review): compression is None here although the notebook defines
    # compression = 'zstd' -- left unchanged; confirm which is intended.
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for f in paths:
            for block in tqdm(read_dataset(train_file = f)):
                out.write({'input_ids': block})

In [13]:
import mp

# At most one worker per partition file, capped at 30 processes.
worker_count = min(len(files), 30)
# `loop` writes its results to disk, so nothing is collected from the workers.
mp.multiprocessing(files, loop, cores = worker_count, returned = False)

In [1]:
# Report the total on-disk size of the tokenized output directory.
!du -hs tokenized_indexes

25G	tokenized_indexes
