In [1]:
import shutil
from glob import glob

import numpy as np
from streaming import MDSWriter
from streaming import LocalDataset, StreamingDataset
from tqdm import tqdm
from transformers import default_data_collator, DataCollatorForLanguageModeling

In [2]:
# Gather the main tokenized shard directories and order them by the integer
# that follows the last '-' in each path.
index_paths = glob('tokenized_indexes/tokenized-*')
folders = sorted(index_paths, key=lambda path: int(path.rsplit('-', 1)[-1]))

In [3]:
# Append the extra shard directories (ordered by the integer after the last '-')
# behind the ones already collected in `folders`.
# Paths that are already present are skipped so that re-running this cell does
# not queue the same directory twice, which would duplicate its samples when
# the folders are merged.
extra_folders = sorted(glob('tokenized_extra/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders.extend([path for path in extra_folders if path not in folders])

In [4]:
from streaming.base.format.mds.encodings import Encoding, _encodings

class UInt32(Encoding):
    """Column encoding that stores an array as raw ``uint32`` bytes.

    Only the flat element data is serialized; the shape is not recorded, so
    ``decode`` always yields a one-dimensional array.
    """

    def encode(self, obj) -> bytes:
        """Serialize ``obj`` as the raw bytes of a ``uint32`` array.

        The input is cast to ``np.uint32`` first so the bytes always match what
        ``decode`` expects. Without the cast, an array of another dtype (for
        example int64) would be dumped with its own item size and come back as
        garbage. An input that is already ``uint32`` is serialized unchanged.

        Args:
            obj: Array (or array-like) of token ids.

        Returns:
            The elements as ``uint32`` values in native byte order.
        """
        return np.asarray(obj, dtype=np.uint32).tobytes()

    def decode(self, data: bytes):
        """Rebuild the 1-D ``uint32`` array from bytes produced by ``encode``.

        Args:
            data: Raw ``uint32`` bytes.

        Returns:
            A one-dimensional ``np.uint32`` array backed by ``data`` (no copy).
        """
        return np.frombuffer(data, np.uint32)

# Register the encoding under the name used in the `columns` schema.
# NOTE(review): this writes into streaming's private `_encodings` table and
# replaces any existing 'uint32' entry — confirm against the installed version.
_encodings['uint32'] = UInt32

In [5]:
# Writer configuration.
# `columns` maps each sample field to the name of the encoding used to store it.
columns = dict(input_ids='uint32')

# Compression setting and the hash algorithm names for the writer.
compression = 'zstd'
hashes = ('sha1', 'xxh64')

In [6]:
# Remove any previous merge output so the writer starts from an empty directory.
# Done in Python rather than via a shell escape so it does not depend on `rm`;
# ignore_errors=True keeps it silent when the directory does not exist yet.
shutil.rmtree('combine-all', ignore_errors=True)

In [11]:
# Merge every source folder into a single MDS dataset under 'combine-all'.
# NOTE(review): compression is passed as None although a `compression` value is
# defined in the config cell — confirm that uncompressed shards are intended.
failed_folders = []
with MDSWriter(out='combine-all', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = StreamingDataset(local=f)
            # Label each progress bar with its folder so slow or failing inputs can be identified.
            for i in tqdm(range(len(dataset)), desc=f):
                out.write(dataset[i])
        except Exception as e:
            # Best effort: carry on with the remaining folders, but record which
            # one failed instead of printing an anonymous message. Samples
            # written before the error remain in the output.
            print(f'failed to merge {f}: {e!r}')
            failed_folders.append(f)

# Make an incomplete merge visible at the end of the cell output.
if failed_folders:
    print(f'{len(failed_folders)} of {len(folders)} folders were not merged completely: {failed_folders}')

100%|██████████| 189063/189063 [00:48<00:00, 3928.03it/s]
100%|██████████| 45042/45042 [01:13<00:00, 609.41it/s] 
100%|██████████| 26774/26774 [01:02<00:00, 431.83it/s] 
100%|██████████| 29944/29944 [00:30<00:00, 986.33it/s] 
100%|██████████| 29622/29622 [00:19<00:00, 1485.15it/s]
100%|██████████| 14849/14849 [00:20<00:00, 721.23it/s] 
100%|██████████| 9988/9988 [00:17<00:00, 555.16it/s] 
100%|██████████| 17118/17118 [00:17<00:00, 996.42it/s] 
100%|██████████| 26311/26311 [00:18<00:00, 1398.41it/s]
100%|██████████| 26304/26304 [00:20<00:00, 1268.09it/s]
100%|██████████| 73067/73067 [00:32<00:00, 2232.94it/s]
100%|██████████| 25969/25969 [00:23<00:00, 1119.99it/s]
100%|██████████| 25777/25777 [00:42<00:00, 612.10it/s] 
100%|██████████| 66546/66546 [00:35<00:00, 1850.69it/s]
100%|██████████| 75575/75575 [00:51<00:00, 1461.03it/s]
100%|██████████| 36659/36659 [00:41<00:00, 892.79it/s] 
100%|██████████| 110169/110169 [00:56<00:00, 1961.70it/s]
100%|██████████| 69283/69283 [01:02<00:00, 110

In [19]:
# Open the merged output directory for indexed access.
dataset = LocalDataset(local='combine-all')

In [21]:
# Estimated corpus size in billions of tokens, taking every sample to hold
# 16384 tokens.
tokens_per_sample = 16384  # presumably the packed sequence length — TODO confirm
total_tokens = len(dataset) * tokens_per_sample
total_tokens / 1e9

15.860596736

In [22]:
from transformers import AutoTokenizer

# Load the tokenizer for 'Qwen/Qwen1.5-0.5B' and set both special-token flags
# to False.
# NOTE(review): whether this tokenizer class actually consults these attributes
# is not visible here — confirm.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen1.5-0.5B')
for flag_name in ('add_bos_token', 'add_eos_token'):
    setattr(tokenizer, flag_name, False)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [23]:
# Sanity check: turn the first merged sample's token ids back into text.
first_sample = dataset[0]
tokenizer.decode(first_sample['input_ids'])

'<|endoftext|>Bahasa Melayu (Tulisan Jawi: بهاس ملايو; Rejang: ꤷꥁꤼ ꤸꥍꤾꤿꥈ) ialah salah satu daripada bahasa-bahasa Melayu-Polinesia di bawah keluarga bahasa Austronesia, yang merupakan bahasa rasmi di Brunei, Indonesia, Malaysia dan Singapura, serta dituturkan di Timor Leste dan sebahagian wilayah di Kemboja , Filipina dan Thailand. Jumlah penutur bahasa Melayu mencakupi lebih daripada 290 juta penutur (seramai 260 juta orang bertutur bahasa Indonesia) merentasi kawasan maritim Asia Tenggara. Sebagai salah satu daripada bahasa-bahasa yang paling luas digunakan di Asia Tenggara, bahasa Melayu mempunyai istilah perundangan yang berbeza di negara-negara terlibat bergantung pada sejarah dan budaya penggunaan bahasa Melayu di negara-negara tersebut. Di Malaysia, istilah "bahasa Melayu" ialah istilah "de jure" untuk pentakrifan rasmi bahasa kebangsaan negara Malaysia, manakala istilah "bahasa Malaysia" atau "bahasa Melayu Malaysia" seringkali digunakan mewakili perkara yang sama secara tidak 