In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen3-14B')

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
}
hashes = 'sha1', 'xxh64'

In [3]:
!rm -rf tokenized-8k-qwen
!mkdir tokenized-8k-qwen

In [6]:
combine = []
with open('combined-malaysian-sft.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        combine.append(l)

len(combine)

1327479

In [7]:
with open('translation-instructions.json') as fopen:
    translation = json.load(fopen)
    
for d in translation:
    combine.append([
        {'role': 'user', 'content': d['input']},
        {'role': 'assistant', 'content': d['output']}
    ])
    
len(combine)

1397479

In [8]:
row = combine[-1]
prompt = tokenizer.apply_chat_template(row, tokenize=False)
outputs = tokenizer(prompt, add_special_tokens = False)

In [9]:
import gc

def collator(batch, batch_position_ids):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
    }

def slice_and_balance(nested_list, size):
    first = []
    balance = []
    current_size = 0

    for sublist in nested_list:
        if current_size < size:
            remaining_space = size - current_size
            if len(sublist) <= remaining_space:
                first.append(sublist)
                current_size += len(sublist)
            else:
                first.append(sublist[:remaining_space])
                balance.append(sublist[remaining_space:])
                current_size = size
        else:
            balance.append(sublist)
    
    return first, balance

In [10]:
import time

def loop(files, block_size = 8192):
    rows, index = files
    out_root = f'tokenized-8k-qwen/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):
            prompt = tokenizer.apply_chat_template(row, tokenize=False)
            outputs = tokenizer(prompt, add_special_tokens = False)
            temp.append(outputs['input_ids'])
            position_ids.append(range(len(outputs['input_ids'])))
            count += len(outputs['input_ids'])
            while count >= block_size:
                block, temp = slice_and_balance(temp, block_size)
                block_position, position_ids = slice_and_balance(position_ids, block_size)
                count = count - block_size
                o = collator(block, block_position)
                last_block = block
                last_position_block = block_position
                out.write(o)
                
        block, _ = slice_and_balance(last_block, block_size - count)
        block_position, _ = slice_and_balance(last_position_block, block_size - count)

        block.extend(temp)
        block_position.extend(position_ids)

        o = collator(block, block_position)
        if len(o['input_ids']) == block_size:
            out.write(o)
            return o

In [11]:
loop((combine[:1000], 0))

100%|██████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:01<00:00, 864.55it/s]


{'input_ids': array([  1833,  64907,     88, ...,     13, 151645,    198], dtype=uint32),
 'position_ids': array([161, 162, 163, ..., 357, 358, 359], dtype=uint32),
 'attention_mask': array([468, 448, 479, 375, 542, 486, 635, 286, 614, 545, 864, 742, 335,
        480, 677,  72, 144], dtype=uint32)}

In [14]:
from multiprocess import Pool

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)

chunks = chunks(combine, 50000)
pool = Pool(10)
pooled = pool.map(loop, chunks)
pool.close()
pool.join()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [15]:
folders = sorted(glob('tokenized-8k-qwen/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders

['tokenized-8k-qwen/tokenized-0',
 'tokenized-8k-qwen/tokenized-1',
 'tokenized-8k-qwen/tokenized-2',
 'tokenized-8k-qwen/tokenized-3',
 'tokenized-8k-qwen/tokenized-4',
 'tokenized-8k-qwen/tokenized-5',
 'tokenized-8k-qwen/tokenized-6',
 'tokenized-8k-qwen/tokenized-7',
 'tokenized-8k-qwen/tokenized-8',
 'tokenized-8k-qwen/tokenized-9',
 'tokenized-8k-qwen/tokenized-10',
 'tokenized-8k-qwen/tokenized-11',
 'tokenized-8k-qwen/tokenized-12',
 'tokenized-8k-qwen/tokenized-13',
 'tokenized-8k-qwen/tokenized-14',
 'tokenized-8k-qwen/tokenized-15',
 'tokenized-8k-qwen/tokenized-16',
 'tokenized-8k-qwen/tokenized-17',
 'tokenized-8k-qwen/tokenized-18',
 'tokenized-8k-qwen/tokenized-19',
 'tokenized-8k-qwen/tokenized-20',
 'tokenized-8k-qwen/tokenized-21',
 'tokenized-8k-qwen/tokenized-22',
 'tokenized-8k-qwen/tokenized-23',
 'tokenized-8k-qwen/tokenized-24',
 'tokenized-8k-qwen/tokenized-25',
 'tokenized-8k-qwen/tokenized-26',
 'tokenized-8k-qwen/tokenized-27']

In [16]:
!rm -rf packing-8k-qwen

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [17]:
with MDSWriter(out='packing-8k-qwen', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

100%|█████████████████████████████████████████████████████████████████████████████████| 4198/4198 [00:00<00:00, 5033.42it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 1547/1547 [00:00<00:00, 7048.21it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 8934/8934 [00:01<00:00, 5024.22it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5087/5087 [00:01<00:00, 5016.71it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 5585/5585 [00:01<00:00, 5446.94it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 6521/6521 [00:01<00:00, 4882.48it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 9833/9833 [00:01<00:00, 5451.16it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 8839/8839 [00:01<00:00, 4977.41it/s]


In [18]:
dataset = LocalDataset('packing-8k-qwen')
(len(dataset) * 8192) / 1e9

1.11521792

In [19]:
tokenizer.decode(dataset[-3]['input_ids'])

' mental, hubungan sosial, dan waktu untuk mengejar minat dan hobi pribadi. \n\nUntuk dapat mencapai keseimbangan yang baik antara pekerjaan dan kehidupan, penting untuk menetapkan prioritas dan membangun rutinitas yang sehat. Berikut adalah beberapa tips yang dapat membantu:\n\n1. Tetapkan batasan waktu kerja: Tetapkan jam kerja yang tepat dan usahakan untuk menepatinya. Jangan membawa pekerjaan ke rumah atau memeriksa email kerja di waktu-waktu yang seharusnya untuk bersantai.\n\n2. Buat jadwal rutin: Buat jadwal rutin yang mencakup waktu untuk pekerjaan, istirahat, aktivitas fisik, dan kegiatan sosial. Tetapkan waktu untuk memasak, belanja, dan rutinitas rumah lainnya agar tidak terbebani selama hari kerja.\n\n3. Prioritaskan kesehatan fisik: Selalu usahakan mengatur jadwal rutin untuk olahraga dan waktu istirahat agar tidak terlalu capek karena pekerjaan. Ini akan membantu menjaga kesehatan fisik dan juga memberi waktu untuk merenung dan merilekskan diri.\n\n4. Gunakan alat bantu: 