In [1]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/5aa5257608b61e8fcc828e99fbd070d5ca7358e3/mp.py

from glob import glob
from tqdm import tqdm
import msgspec
import mp
import os
import numpy as np
from streaming import MDSWriter
from transformers import AutoTokenizer

In [2]:
# Path to the combined JSONL corpus that this notebook partitions and tokenizes.
train_file = 'combine-lm.jsonl'

In [3]:
# Maximum number of lines written to each partition file.
split_by = 500000

In [4]:
# Start from an empty `partitions` directory so files left over from an
# earlier run are not matched by the later glob.
!rm -rf partitions
!mkdir partitions

In [5]:
# Split `train_file` into partitions/combined-lm-{index}.jsonl, each holding
# at most `split_by` lines, copying lines verbatim and in order.
index = 0
count = 0
part_file = None

try:
    with open(train_file) as fopen:
        for line in tqdm(fopen):
            # Open the partition lazily, only once there is a line to put in
            # it; this avoids leaving an empty trailing file when the total
            # line count is an exact multiple of `split_by`.
            if part_file is None:
                part_file = open(f'partitions/combined-lm-{index}.jsonl', 'w')
            # No per-line flush: close() flushes the buffer, and flushing on
            # every line only adds a system call per record.
            part_file.write(line)
            count += 1
            if count >= split_by:
                part_file.close()
                part_file = None
                index += 1
                count = 0
finally:
    # Make sure the partition being written is closed even if the copy fails.
    if part_file is not None:
        part_file.close()

24067009it [01:59, 201443.48it/s]


In [7]:
# List the partition files produced by the split.
!ls partitions

combined-lm-0.jsonl   combined-lm-24.jsonl  combined-lm-4.jsonl
combined-lm-1.jsonl   combined-lm-25.jsonl  combined-lm-40.jsonl
combined-lm-10.jsonl  combined-lm-26.jsonl  combined-lm-41.jsonl
combined-lm-11.jsonl  combined-lm-27.jsonl  combined-lm-42.jsonl
combined-lm-12.jsonl  combined-lm-28.jsonl  combined-lm-43.jsonl
combined-lm-13.jsonl  combined-lm-29.jsonl  combined-lm-44.jsonl
combined-lm-14.jsonl  combined-lm-3.jsonl   combined-lm-45.jsonl
combined-lm-15.jsonl  combined-lm-30.jsonl  combined-lm-46.jsonl
combined-lm-16.jsonl  combined-lm-31.jsonl  combined-lm-47.jsonl
combined-lm-17.jsonl  combined-lm-32.jsonl  combined-lm-48.jsonl
combined-lm-18.jsonl  combined-lm-33.jsonl  combined-lm-5.jsonl
combined-lm-19.jsonl  combined-lm-34.jsonl  combined-lm-6.jsonl
combined-lm-2.jsonl   combined-lm-35.jsonl  combined-lm-7.jsonl
combined-lm-20.jsonl  combined-lm-36.jsonl  combined-lm-8.jsonl
combined-lm-21.jsonl  combined-lm-37.jsonl  combined-lm-9.jsonl
combined-lm-22.j

In [8]:
# Number of token ids packed into every emitted sample.
block_size = 8192


def read_dataset(train_file, block_size = block_size):
    """Tokenize a JSONL file and yield fixed-length blocks of token ids.

    Each non-blank line is decoded as JSON and its ``'text'`` field is
    tokenized. Token ids from consecutive lines are concatenated into one
    running buffer, so a block may span record boundaries. Tokens left in the
    buffer at end of file that do not fill a whole block are discarded.

    The tokenizer is loaded on every call, so each generator owns its own
    tokenizer instance.

    Args:
        train_file: Path to a JSONL file whose records have a ``'text'`` key.
        block_size: Number of token ids per yielded block; must be positive.

    Yields:
        1-D ``np.uint32`` arrays of exactly ``block_size`` token ids.

    Raises:
        ValueError: On first iteration, if ``block_size`` is not positive
            (a zero block size would otherwise yield empty blocks forever).
    """
    if block_size <= 0:
        raise ValueError(f'block_size must be positive, got {block_size}')

    tokenizer = AutoTokenizer.from_pretrained(
        'meta-llama/Meta-Llama-3-8B',
    )
    # NOTE(review): the sample output in this notebook still begins with token
    # 128000 (decoded later as <|begin_of_text|>), so these flags do not appear
    # to suppress the BOS token for this tokenizer. Confirm that is intended.
    tokenizer.add_bos_token = False
    tokenizer.add_eos_token = False
    text_column_name = 'text'

    pending = []
    with open(train_file) as fopen:
        for line in fopen:
            # A blank line is not a JSON document; skip it instead of letting
            # the decoder raise.
            if not line.strip():
                continue
            record = msgspec.json.decode(line)
            pending.extend(tokenizer(record[text_column_name])['input_ids'])

            # Emit every complete block currently buffered, then drop the
            # consumed prefix once rather than re-slicing after each block.
            full_len = len(pending) - len(pending) % block_size
            for start in range(0, full_len, block_size):
                yield np.array(pending[start:start + block_size], dtype=np.uint32)
            del pending[:full_len]

In [9]:
def _partition_index(path):
    """Return the integer N from a path ending in ``-N.jsonl``."""
    suffix = path.split('-')[-1]
    return int(suffix.replace('.jsonl', ''))


# Order partitions numerically; a plain string sort would put
# `combined-lm-10` ahead of `combined-lm-2`.
files = sorted(glob('partitions/combined-lm-*.jsonl'), key=_partition_index)
files

['partitions/combined-lm-0.jsonl',
 'partitions/combined-lm-1.jsonl',
 'partitions/combined-lm-2.jsonl',
 'partitions/combined-lm-3.jsonl',
 'partitions/combined-lm-4.jsonl',
 'partitions/combined-lm-5.jsonl',
 'partitions/combined-lm-6.jsonl',
 'partitions/combined-lm-7.jsonl',
 'partitions/combined-lm-8.jsonl',
 'partitions/combined-lm-9.jsonl',
 'partitions/combined-lm-10.jsonl',
 'partitions/combined-lm-11.jsonl',
 'partitions/combined-lm-12.jsonl',
 'partitions/combined-lm-13.jsonl',
 'partitions/combined-lm-14.jsonl',
 'partitions/combined-lm-15.jsonl',
 'partitions/combined-lm-16.jsonl',
 'partitions/combined-lm-17.jsonl',
 'partitions/combined-lm-18.jsonl',
 'partitions/combined-lm-19.jsonl',
 'partitions/combined-lm-20.jsonl',
 'partitions/combined-lm-21.jsonl',
 'partitions/combined-lm-22.jsonl',
 'partitions/combined-lm-23.jsonl',
 'partitions/combined-lm-24.jsonl',
 'partitions/combined-lm-25.jsonl',
 'partitions/combined-lm-26.jsonl',
 'partitions/combined-lm-27.jsonl',
 '

In [10]:
# Smoke test: pull the first packed block from the first partition.
next(read_dataset(files[0]))

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


array([128000, 100022,  15790, ...,     64,    908,   9379], dtype=uint32)

In [11]:
from streaming.base.format.mds.encodings import Encoding, _encodings

class UInt32(Encoding):
    """MDS column encoding that stores an array as raw bytes read back as uint32.

    ``encode`` does not check the dtype, so callers must pass ``np.uint32``
    arrays for ``decode`` to reproduce the original values.
    """

    def encode(self, obj) -> bytes:
        """Serialize ``obj`` through its ``tobytes()`` method."""
        raw = obj.tobytes()
        return raw

    def decode(self, data: bytes):
        """Interpret ``data`` as a flat array of ``np.uint32`` values."""
        return np.frombuffer(data, dtype=np.uint32)


# Register the encoding under the name used in the writer's column schema.
_encodings.update({'uint32': UInt32})

In [12]:
# MDS schema: a single column serialized with the 'uint32' encoding.
columns = dict(input_ids='uint32')
# NOTE(review): `loop` passes compression=None to MDSWriter, so this value is
# not what the writer actually uses. Confirm which one is intended.
compression = 'zstd'
# Hash algorithms handed to MDSWriter through its `hashes` argument.
hashes = ('sha1', 'xxh64')

In [13]:
# Recreate `tokenized_indexes` empty before the writers populate it.
!rm -rf tokenized_indexes
!mkdir tokenized_indexes

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [14]:
def loop(files):
    """Tokenize one group of partition files into its own MDS directory.

    Args:
        files: A ``(paths, index)`` pair. ``paths`` is an iterable of JSONL
            partition paths and ``index`` selects the output directory
            ``tokenized_indexes/tokenized-{index}``.

    Any existing output directory for ``index`` is removed first, then every
    block yielded by ``read_dataset`` for each path is written as one sample
    under the ``'input_ids'`` column.
    """
    import shutil

    shard_files, index = files
    out_root = f'tokenized_indexes/tokenized-{index}'
    # Clear leftovers from an earlier run without spawning a shell; a missing
    # directory is not an error.
    shutil.rmtree(out_root, ignore_errors=True)
    # NOTE(review): compression is passed as None here even though the
    # notebook defines `compression = 'zstd'`. Confirm this is deliberate.
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for f in shard_files:
            for block in tqdm(read_dataset(train_file = f)):
                out.write({'input_ids': block})

In [15]:
import mp
# Use one worker per partition file, capped at 30.
n_workers = min(len(files), 30)
mp.multiprocessing(files, loop, cores=n_workers, returned=False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [16]:
# Report the total on-disk size of the tokenized output.
!du -hs tokenized_indexes

32G	tokenized_indexes


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
from streaming import StreamingDataset

# Open the `tokenized-0` output directory locally to spot-check what was written.
dataset = StreamingDataset(local = 'tokenized_indexes/tokenized-0')

Because `predownload` was not specified, it will default to 8*batch_size if batch_size is not None, otherwise 64. Prior to Streaming v0.7.0, `predownload` defaulted to max(batch_size, 256 * batch_size // num_canonical_nodes).


In [21]:
# Number of samples the dataset reports for this output directory.
len(dataset)

11787

In [22]:
from transformers import AutoTokenizer

# Load the same pretrained tokenizer that `read_dataset` uses, so the stored
# ids can be decoded back to text.
tokenizer = AutoTokenizer.from_pretrained(
    'meta-llama/Meta-Llama-3-8B',
)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
# Decode the first stored block back to text as a sanity check.
tokenizer.decode(dataset[0]['input_ids'])

'<|begin_of_text|>Bahasa Melayu (Tulisan Jawi: بهاس ملايو; Rejang: ꤷꥁꤼ ꤸꥍꤾꤿꥈ) ialah salah satu daripada bahasa-bahasa Melayu-Polinesia di bawah keluarga bahasa Austronesia, yang merupakan bahasa rasmi di Brunei, Indonesia, Malaysia dan Singapura, serta dituturkan di Timor Leste dan sebahagian wilayah di Kemboja, Filipina dan Thailand. Jumlah penutur bahasa Melayu mencakupi lebih daripada 290 juta penutur (seramai 260 juta orang bertutur bahasa Indonesia) merentasi kawasan maritim Asia Tenggara. Sebagai salah satu daripada bahasa-bahasa yang paling luas digunakan di Asia Tenggara, bahasa Melayu mempunyai istilah perundangan yang berbeza di negara-negara terlibat bergantung pada sejarah dan budaya penggunaan bahasa Melayu di negara-negara tersebut. Di Malaysia, istilah "bahasa Melayu" ialah istilah "de jure" untuk pentakrifan rasmi bahasa kebangsaan negara Malaysia, manakala istilah "bahasa Malaysia" atau "bahasa Melayu Malaysia" seringkali digunakan mewakili perkara yang sama secara tid