In [1]:
from transformers import AutoTokenizer

# Load the Qwen2.5 instruct tokenizer; later cells use it to render each conversation
# with its chat template, tokenize the result, and decode packed blocks for inspection.
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2.5-1.5B-Instruct')

In [2]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json

class UInt32(Encoding):
    """Column codec that stores a NumPy array as raw ``uint32`` bytes.

    The array is written without any header, so its shape is not recorded:
    ``decode`` always returns a flat 1-D array, in the machine's native byte order.
    """

    def encode(self, obj) -> bytes:
        """Serialize ``obj`` as a run of uint32 values.

        Args:
            obj: array-like of non-negative integers that fit in 32 bits.

        Returns:
            The raw bytes of the values, 4 bytes per element.
        """
        # Cast before serializing: decode() unconditionally reads uint32, so writing an
        # array of any other dtype (e.g. int64) would be read back as garbage.
        # For an array that is already uint32 this is a no-op view, not a copy.
        return np.asarray(obj, dtype=np.uint32).tobytes()

    def decode(self, data: bytes):
        """Rebuild the flat uint32 array from bytes produced by ``encode``.

        Returns:
            A 1-D ``np.uint32`` array backed by ``data`` (read-only, since it is a
            view over an immutable ``bytes`` object).
        """
        return np.frombuffer(data, np.uint32)

# Register the custom codec under the name 'uint32' so column specs can refer to it.
# NOTE(review): `_encodings` is a private (underscore-prefixed) registry of the streaming
# package — confirm it still exists and has the same meaning when upgrading the library.
_encodings['uint32'] = UInt32

# Schema of every stored sample: all three fields use the 'uint32' encoding name.
columns = dict.fromkeys(('input_ids', 'position_ids', 'attention_mask'), 'uint32')

# Hash algorithm names passed to the MDS writers through their `hashes` argument.
hashes = ('sha1', 'xxh64')

In [3]:
# Load the combined SFT corpus: one JSON document per line.
# presumably each document is a list of chat messages, since rows are later passed to
# tokenizer.apply_chat_template — verify against the file.
#
# The encoding is given explicitly because open() otherwise falls back to the locale's
# default, which is not guaranteed to be UTF-8 and would corrupt or reject non-ASCII text.
with open('combined-malaysian-sft.jsonl', encoding='utf-8') as fopen:
    # Skip blank lines (e.g. a stray trailing newline) instead of crashing in json.loads.
    combine = [json.loads(line) for line in fopen if line.strip()]

len(combine)

1294946

In [4]:
import gc

def collator(batch, batch_position_ids):
    """Flatten a packed block of token segments into the three stored arrays.

    Args:
        batch: sequence of token-id sequences (one per segment in the block).
        batch_position_ids: sequence of position-id iterables, indexed in parallel
            with ``batch``.

    Returns:
        dict of ``np.uint32`` arrays:
          * ``'input_ids'``      - all segments' token ids concatenated in order,
          * ``'position_ids'``   - all segments' position ids concatenated in order,
          * ``'attention_mask'`` - the length of each segment. Note this is a list of
            segment lengths, not a 0/1 mask, despite the key name.
    """
    flat_tokens, flat_positions, segment_lengths = [], [], []

    for idx, tokens in enumerate(batch):
        flat_tokens += tokens
        flat_positions += batch_position_ids[idx]
        segment_lengths.append(len(tokens))

    fields = {
        'input_ids': flat_tokens,
        'position_ids': flat_positions,
        'attention_mask': segment_lengths,
    }
    return {name: np.array(values).astype(np.uint32) for name, values in fields.items()}

def slice_and_balance(nested_list, size):
    """Split a list of sequences into a head holding ``size`` items and the rest.

    Sequences are consumed in order. Whole sequences go into the head while they
    fit; the first sequence that does not fit is cut in two, its prefix filling the
    head exactly and its suffix starting the remainder. Everything after that goes
    to the remainder untouched.

    Args:
        nested_list: list of sliceable sequences (lists, ranges, ...).
        size: total number of items the head may hold.

    Returns:
        ``(first, balance)``: the head pieces and the leftover pieces. The head holds
        fewer than ``size`` items only when the input itself is shorter.
    """
    head, tail = [], []
    room = size  # free slots still available in the head

    for piece in nested_list:
        if room <= 0:
            # Head already full: everything else is carried over.
            tail.append(piece)
            continue

        piece_len = len(piece)
        if piece_len <= room:
            head.append(piece)
            room -= piece_len
        else:
            # Piece straddles the boundary: cut it exactly at the remaining room.
            head.append(piece[:room])
            tail.append(piece[room:])
            room = 0

    return head, tail

In [5]:
import time

def loop(files, block_size = 3072):
    """Tokenize one chunk of conversations and pack it into fixed-size blocks.

    Every row is rendered with the tokenizer's chat template, tokenized, and appended
    to a running buffer. Whenever the buffer holds at least ``block_size`` tokens, a
    block of exactly ``block_size`` tokens is cut off (splitting a conversation across
    blocks if needed) and written to ``tokenized-4k/tokenized-{index}``.

    The tokens left over at the end (fewer than ``block_size``) are topped up with the
    leading tokens of the last written block so the final sample is full-size too.

    Args:
        files: ``(rows, index)`` pair - the conversations to process and the shard
            number used in the output directory name.
        block_size: number of tokens per written sample.

    Returns:
        The final, topped-up sample dict if one was written, otherwise ``None``.
        ``None`` is returned when there is no leftover, or when the chunk never
        filled a single block and so has nothing to top the leftover up with.
    """
    # Imported locally so the function is self-contained when run in a worker process.
    import shutil

    rows, index = files
    out_root = f'tokenized-4k/tokenized-{index}'
    # Start from a clean output directory; a missing directory is not an error.
    shutil.rmtree(out_root, ignore_errors=True)

    count = 0          # tokens currently buffered in `temp`
    temp = []          # buffered token-id sequences not yet written
    position_ids = []  # position ids, kept in lockstep with `temp`
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):
            prompt = tokenizer.apply_chat_template(row, tokenize=False)
            # The chat template already emits the special tokens as text.
            input_ids = tokenizer(prompt, add_special_tokens = False)['input_ids']
            temp.append(input_ids)
            # Positions restart at 0 for every conversation.
            position_ids.append(range(len(input_ids)))
            count += len(input_ids)
            # A long conversation can fill more than one block, hence `while`.
            while count >= block_size:
                block, temp = slice_and_balance(temp, block_size)
                block_position, position_ids = slice_and_balance(position_ids, block_size)
                count = count - block_size
                o = collator(block, block_position)
                last_block = block
                last_position_block = block_position
                out.write(o)

        # Nothing left over: topping up would re-emit the whole last block and write
        # a duplicate sample.
        if count == 0:
            return None
        # No full block was ever written, so there is nothing to top up with
        # (previously this crashed iterating `None`); the short remainder is dropped.
        if last_block is None:
            return None

        # Borrow the first `block_size - count` tokens of the last written block as a
        # prefix, then append the leftover so the final sample is exactly full-size.
        block, _ = slice_and_balance(last_block, block_size - count)
        block_position, _ = slice_and_balance(last_position_block, block_size - count)

        block.extend(temp)
        block_position.extend(position_ids)

        o = collator(block, block_position)
        if len(o['input_ids']) == block_size:
            out.write(o)
            return o

    return None

In [6]:
from multiprocess import Pool
import mp

# Split the corpus into pieces of 50000 rows and tokenize/pack them in 10 worker processes.
# NOTE(review): `mp.chunks` is a project-local helper; it must yield (rows, index) pairs
# because that is what `loop` unpacks — confirm against mp.py.
chunks = mp.chunks(combine, 50000)
pool = Pool(10)
try:
    # One result per chunk: the final sample dict `loop` returns, or None.
    pooled = pool.map(loop, chunks)
finally:
    # Always shut the workers down, even if a chunk raised, so no processes are leaked.
    pool.close()
    pool.join()

100%|██████████████████████████████████████████████████████████████████████████████████| 50000/50000 [01:12<00:00, 689.92it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 50000/50000 [01:32<00:00, 539.43it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 50000/50000 [01:58<00:00, 421.21it/s]
 60%|█████████████████████████████████████████████████▍                                | 30107/50000 [01:57<01:15, 261.88it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 50000/50000 [02:01<00:00, 411.33it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:24<00:00, 2038.30it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 50000/50000 [00:27<00:00, 1839.70it/s]
100%|██████████████████████████████████████████████████████████████████████████████████| 50000/50000 [02:58<00:

In [7]:
# Collect the per-chunk shard directories and order them by their numeric suffix,
# so that e.g. 'tokenized-10' sorts after 'tokenized-9' rather than after 'tokenized-1'.
folders = glob('tokenized-4k/tokenized-*')
folders.sort(key=lambda path: int(path.rsplit('-', 1)[-1]))
folders

['tokenized-4k/tokenized-0',
 'tokenized-4k/tokenized-1',
 'tokenized-4k/tokenized-2',
 'tokenized-4k/tokenized-3',
 'tokenized-4k/tokenized-4',
 'tokenized-4k/tokenized-5',
 'tokenized-4k/tokenized-6',
 'tokenized-4k/tokenized-7',
 'tokenized-4k/tokenized-8',
 'tokenized-4k/tokenized-9',
 'tokenized-4k/tokenized-10',
 'tokenized-4k/tokenized-11',
 'tokenized-4k/tokenized-12',
 'tokenized-4k/tokenized-13',
 'tokenized-4k/tokenized-14',
 'tokenized-4k/tokenized-15',
 'tokenized-4k/tokenized-16',
 'tokenized-4k/tokenized-17',
 'tokenized-4k/tokenized-18',
 'tokenized-4k/tokenized-19',
 'tokenized-4k/tokenized-20',
 'tokenized-4k/tokenized-21',
 'tokenized-4k/tokenized-22',
 'tokenized-4k/tokenized-23',
 'tokenized-4k/tokenized-24',
 'tokenized-4k/tokenized-25']

In [8]:
# Remove any previous merged output so the next cell writes 'packing-4k' from scratch.
!rm -rf packing-4k

In [9]:
# Merge every per-chunk shard directory, in order, into the single 'packing-4k' dataset.
skipped_folders = []
with MDSWriter(out='packing-4k', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            # Best-effort merge: one unreadable shard folder must not abort the run, but
            # say which folder failed so the data loss is visible. Samples copied from
            # this folder before the failure remain in the output.
            skipped_folders.append(f)
            print(f'skipping {f}: {e!r}')

if skipped_folders:
    print(f'{len(skipped_folders)} shard folder(s) were not fully merged: {skipped_folders}')

100%|████████████████████████████████████████████████████████████████████████████████| 11471/11471 [00:00<00:00, 17858.81it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 12223/12223 [00:00<00:00, 19068.81it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 20018/20018 [00:01<00:00, 16852.81it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 14503/14503 [00:00<00:00, 18389.60it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 15518/15518 [00:00<00:00, 17042.14it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 18024/18024 [00:00<00:00, 18934.74it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 29992/29992 [00:01<00:00, 17510.80it/s]
100%|████████████████████████████████████████████████████████████████████████████████| 19742/19742 [00:01<00:00

In [10]:
# Re-open the merged dataset and estimate its size in billions of tokens.
# 3072 is the default block_size of loop(); this assumes every stored sample holds
# exactly that many tokens — adjust if loop() was run with a different block_size.
dataset = LocalDataset('packing-4k')
(len(dataset) * 3072) / 1e9

1.108306944

In [11]:
# Spot-check: decode the third-from-last packed sample back to text. A sample may start
# and end mid-conversation because blocks are cut at a fixed token count.
tokenizer.decode(dataset[-3]['input_ids'])

' skrip Python yang mampu mengganti kata-kata dengan sinonim dan ungkapan idiomatik dengan sinonim setara dapat dikembangkan. Pengembangan skrip ini dapat membantu perusahaan start-up untuk mencerminkan komitmen mereka terhadap orisinalitas dan kreativitas dalam berkomunikasi melalui teks, serta untuk memantau dampak potensial penggantian sinonim pada makna dan nada keseluruhan teks.<|im_end|>\n<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nMenggunakan Pempamer Stanford, huraikan ayat "She ate the banana" menjadi pokok kebergantungan, pokok konstituensi dan pokok pelabelan peranan semantik. Selain itu, berikan tag Bahagian-dari-Petikan untuk setiap perkataan dalam ayat dan lemma untuk setiap kata kerja. Akhir sekali, buat visualisasi dari pohon penghuraian menggunakan Graphviz., sentiasa respond dalam bahasa indonesia<|im_end|>\n<|im_start|>assistant\nUntuk melakukan parse kalimat "She ate the banana" menggunakan pe

In [12]:
# Spot-check: decode the second-from-last packed sample back to text.
tokenizer.decode(dataset[-2]['input_ids'])

' looping melalui semua kategori, maka kita membuat kategori baru dengan mengambil dua huruf pertama dari nama barang dan menambahkannya ke "kategori". Barang tersebut kemudian ditambahkan ke daftar di kategori baru yang dibuat di "daftar_kategori".\n- Setelah looping selesai, hasil kategorisasi tersedia dalam variabel "daftar_kategori". Looping terakhir digunakan untuk mencetak hasil kategorisasi dalam format yang mudah dibaca (kategori diikuti oleh daftar barang di dalamnya).<|im_end|>\n<|im_start|>system\nYou are Qwen, created by Alibaba Cloud. You are a helpful assistant.<|im_end|>\n<|im_start|>user\nu always reply in indonesian\n\nApakah formula untuk mengira penurunan peratusan permintaan ruang pejabat disebabkan oleh kesan kerja jarak jauh, seperti yang dibincangkan dalam artikel ini: [Kesan Kerja Jauh ke atas Ruang Pejabat Tradisional](https://www.bbc.com/news/business-54050430)? Artikel tersebut menyebut bahawa pandemik COVID-19 telah mempercepatkan trend ke arah kerja jarak j