In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('google/gemma-3-27b-it')

In [2]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
}
hashes = 'sha1', 'xxh64'

In [3]:
!rm -rf tokenized-8k
!mkdir tokenized-8k

In [4]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/combine/combined-malaysian-sft.jsonl

In [5]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-SFT/resolve/main/extra/translation-instructions.json

In [6]:
combine = []
with open('combined-malaysian-sft.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        combine.append(l)

len(combine)

1294946

In [7]:
with open('translation-instructions.json') as fopen:
    translation = json.load(fopen)
    
for d in translation:
    combine.append([
        {'role': 'user', 'content': d['input']},
        {'role': 'assistant', 'content': d['output']}
    ])
    
len(combine)

1364946

In [8]:
row = combine[-1]
prompt = tokenizer.apply_chat_template(row, tokenize=False)
outputs = tokenizer(prompt, add_special_tokens = False)

In [9]:
prompt

'<bos><start_of_turn>user\nterjemah ke bahasa melayu `MRI atau Magnetic Resonance Imaging adalah teknik pencitraan medis yang memanfaatkan magnet kuat dan gelombang radio untuk menghasilkan gambaran detail struktur tubuh internal. Metode ini memiliki kemampuan untuk membuat gambar tiga dimensi dengan detail yang lebih akurat dan lebih tajam dari teknik pencitraan medis lainnya seperti x-ray dan CT Scan.\n\nPada dasarnya, MRI menggunakan satuan magnetik bernama tesla untuk membentuk gambaran. Alat MRI berisi magnet superkonduktor yang menghasilkan medan magnet kuat yang melalui tubuh pasien. Keberadaan medan magnet ini akan menyebabkan medan magnetik pada proton-proton di dalam tubuh yang mendukung pembentukan gambar. Ketika pasien berada di dalam ruang magnetik, gelombang radio dengan frekuensi tertentu ditembakkan ke tubuh, melalui sebuah koil, yang kemudian akan merangsang proton-proton tersebut sehingga terbentuk sinyal. Informasi ini kemudian diolah oleh komputer yang menghasilkan 

In [10]:
import gc

def collator(batch, batch_position_ids):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
    }

def slice_and_balance(nested_list, size):
    first = []
    balance = []
    current_size = 0

    for sublist in nested_list:
        if current_size < size:
            remaining_space = size - current_size
            if len(sublist) <= remaining_space:
                first.append(sublist)
                current_size += len(sublist)
            else:
                first.append(sublist[:remaining_space])
                balance.append(sublist[remaining_space:])
                current_size = size
        else:
            balance.append(sublist)
    
    return first, balance

In [11]:
import time

def loop(files, block_size = 8192):
    rows, index = files
    out_root = f'tokenized-8k/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):
            prompt = tokenizer.apply_chat_template(row, tokenize=False)
            outputs = tokenizer(prompt, add_special_tokens = False)
            temp.append(outputs['input_ids'])
            position_ids.append(range(len(outputs['input_ids'])))
            count += len(outputs['input_ids'])
            while count >= block_size:
                block, temp = slice_and_balance(temp, block_size)
                block_position, position_ids = slice_and_balance(position_ids, block_size)
                count = count - block_size
                o = collator(block, block_position)
                last_block = block
                last_position_block = block_position
                out.write(o)
                
        block, _ = slice_and_balance(last_block, block_size - count)
        block_position, _ = slice_and_balance(last_position_block, block_size - count)

        block.extend(temp)
        block_position.extend(position_ids)

        o = collator(block, block_position)
        if len(o['input_ids']) == block_size:
            out.write(o)
            return o

In [13]:
loop((combine[:1000], 0))

100%|███████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 1418.27it/s]


{'input_ids': array([236764,   2868,   5995, ..., 236761,    106,    107], dtype=uint32),
 'position_ids': array([102, 103, 104, ..., 287, 288, 289], dtype=uint32),
 'attention_mask': array([485, 254,  94, 518, 210, 491, 685, 192, 439, 581, 221, 487, 462,
        771, 687, 292, 440, 593, 290], dtype=uint32)}

In [14]:
from multiprocess import Pool

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

chunks = chunks(combine, 50000)
pool = Pool(10)
pooled = pool.map(loop, chunks)
pool.close()
pool.join()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [15]:
folders = sorted(glob('tokenized-8k/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders

['tokenized-8k/tokenized-0',
 'tokenized-8k/tokenized-1',
 'tokenized-8k/tokenized-2',
 'tokenized-8k/tokenized-3',
 'tokenized-8k/tokenized-4',
 'tokenized-8k/tokenized-5',
 'tokenized-8k/tokenized-6',
 'tokenized-8k/tokenized-7',
 'tokenized-8k/tokenized-8',
 'tokenized-8k/tokenized-9',
 'tokenized-8k/tokenized-10',
 'tokenized-8k/tokenized-11',
 'tokenized-8k/tokenized-12',
 'tokenized-8k/tokenized-13',
 'tokenized-8k/tokenized-14',
 'tokenized-8k/tokenized-15',
 'tokenized-8k/tokenized-16',
 'tokenized-8k/tokenized-17',
 'tokenized-8k/tokenized-18',
 'tokenized-8k/tokenized-19',
 'tokenized-8k/tokenized-20',
 'tokenized-8k/tokenized-21',
 'tokenized-8k/tokenized-22',
 'tokenized-8k/tokenized-23',
 'tokenized-8k/tokenized-24',
 'tokenized-8k/tokenized-25',
 'tokenized-8k/tokenized-26',
 'tokenized-8k/tokenized-27']

In [16]:
!rm -rf packing-8k

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
with MDSWriter(out='packing-8k', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

100%|███████████████████████████████████████████████████████████████████████████████████| 3319/3319 [00:00<00:00, 5083.50it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 3774/3774 [00:00<00:00, 5890.06it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 6353/6353 [00:01<00:00, 4505.00it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 4910/4910 [00:00<00:00, 5772.00it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5269/5269 [00:01<00:00, 4408.69it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5006/5006 [00:00<00:00, 5538.79it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 8482/8482 [00:01<00:00, 4782.61it/s]
100%|███████████████████████████████████████████████████████████████████████████████████| 5242/5242 [00:01<00:0

In [21]:
dataset = LocalDataset('packing-8k')
(len(dataset) * 8192) / 1e9

0.938491904

In [22]:
tokenizer.decode(dataset[-3]['input_ids'])

'kan seperti lautan yang gelap" adalah contoh lain dari bagaimana kata sifat "lautan" dapat digunakan untuk menggambarkan sesuatu yang lain.` terjemah ke malay<end_of_turn>\n<start_of_turn>model\nTubuh air yang boleh digunakan sebagai kata sifat untuk menggambarkan sesuatu yang lain ialah "lautan". Sebagai contoh, istilah "warna laut" boleh digunakan untuk menggambarkan warna biru kehijauan yang serupa dengan warna air laut. Ungkapan "suara ombak" boleh digunakan untuk menggambarkan bunyi gemuruh perasaan yang serupa dengan bunyi yang dihasilkan oleh gelombang laut. "Tenang seperti laut", "mendalam seperti laut", dan "menakutkan seperti lautan gelap" adalah satu lagi contoh bagaimana kata sifat "laut" boleh digunakan untuk menggambarkan sesuatu yang lain.<end_of_turn>\n<bos><start_of_turn>user\n`Untuk menghitung dan memplotting spektrum frekuensi gelombang suara dengan menggunakan transformasi Fourier di Python, Anda dapat mengikuti langkah-langkah berikut:\n\n1. Mengimpor modul numpy 