In [1]:
from transformers import AutoTokenizer

# Checkpoint whose tokenizer (and chat template) is used by the packing code further down.
MODEL_NAME = 'unsloth/Llama-3.2-3B-Instruct'
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

In [2]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json

class UInt32(Encoding):
    """MDS column encoding that stores a NumPy array as raw ``uint32`` bytes."""

    def encode(self, obj) -> bytes:
        """Serialize ``obj`` as a flat buffer of ``uint32`` values.

        The input is coerced to ``uint32`` before serialization so the bytes
        always match what ``decode`` reads back. An array that already has
        that dtype produces exactly the same bytes as a plain ``tobytes()``;
        any other dtype would otherwise be reinterpreted incorrectly on decode.
        """
        return np.asarray(obj, dtype=np.uint32).tobytes()

    def decode(self, data: bytes):
        """Rebuild the one-dimensional ``uint32`` array from the stored bytes."""
        return np.frombuffer(data, np.uint32)

# Register the encoding under the name referenced by the `columns` spec.
_encodings['uint32'] = UInt32

# Every column of a packed sample is stored with the custom 'uint32' encoding.
columns = dict.fromkeys(('input_ids', 'position_ids', 'attention_mask'), 'uint32')
# Digest algorithm names passed to MDSWriter through its `hashes` argument.
hashes = ('sha1', 'xxh64')

In [3]:
# Start from an empty `tokenized-4k` directory: remove any previous copy, then recreate it.
!rm -rf tokenized-4k
!mkdir tokenized-4k

In [4]:
# Load the SFT sample, one JSON document per line.
# The encoding is pinned to UTF-8 so parsing does not depend on the platform's
# default locale, and blank lines are skipped instead of raising a JSONDecodeError.
combine = []
with open('combined-malaysian-sft-50-sample-all.jsonl', encoding='utf-8') as fopen:
    combine.extend(json.loads(line) for line in fopen if line.strip())

len(combine)

1900

In [5]:
# Append the reasoning rows to the rows already held in `combine`.
# NOTE: this extends `combine` in place, so re-running this cell without first
# re-running the cell that creates `combine` adds the same rows again.
# UTF-8 is pinned and blank lines are skipped, for the same reasons as any JSONL read:
# locale-independent decoding and no JSONDecodeError on empty lines.
with open('combined-malaysian-reasoning.jsonl', encoding='utf-8') as fopen:
    combine.extend(json.loads(line) for line in fopen if line.strip())

len(combine)

41655

In [8]:
import gc

def collator(batch, batch_position_ids):
    """
    Flatten a group of sequences into a single sample of ``uint32`` arrays.

    Parameters
    ----------
    batch : list of sequences of token ids.
    batch_position_ids : list of sequences of position ids, indexed in step with ``batch``.

    Returns
    -------
    dict with three ``uint32`` arrays:
      * ``input_ids``      - all token ids concatenated in order,
      * ``position_ids``   - all position ids concatenated in order,
      * ``attention_mask`` - the length of every sequence in ``batch``
        (one entry per sequence, not one entry per token).
    """
    def as_uint32(values):
        return np.array(values).astype(np.uint32)

    flat_tokens = [token for sequence in batch for token in sequence]
    flat_positions = [
        position
        for idx in range(len(batch))
        for position in batch_position_ids[idx]
    ]
    lengths = [len(sequence) for sequence in batch]

    return {
        'input_ids': as_uint32(flat_tokens),
        'position_ids': as_uint32(flat_positions),
        'attention_mask': as_uint32(lengths),
    }

def slice_and_balance(nested_list, size):
    """
    Split a list of sequences at a total-length budget.

    Sequences are taken in order until ``size`` elements have been collected.
    The sequence that crosses the budget is cut in two: its head completes the
    first group and its tail opens the remainder. Everything after that goes
    to the remainder untouched.

    Returns
    -------
    (first, balance) : the sequences (or pieces) that fit within ``size`` and
    the sequences (or pieces) left over.
    """
    head, leftover = [], []
    room = size  # elements that may still be added to `head`

    for chunk in nested_list:
        if room <= 0:
            leftover.append(chunk)
            continue

        chunk_len = len(chunk)
        if chunk_len > room:
            head.append(chunk[:room])
            leftover.append(chunk[room:])
            room = 0
        else:
            head.append(chunk)
            room -= chunk_len

    return head, leftover

In [9]:
import time

def loop(files, block_size = 3072):
    """
    Tokenize chat rows and pack them into fixed-size samples written with MDSWriter.

    Parameters
    ----------
    files : tuple
        ``(rows, index)``. Every row is rendered with ``tokenizer.apply_chat_template``
        and tokenized; ``index`` selects the output folder
        ``tokenized-4k/tokenized-{index}``, which is emptied first.
    block_size : int
        Number of tokens in every written sample. Must be positive.

    Returns
    -------
    dict or None
        The final "tail" sample when one was written, otherwise ``None``.

    Raises
    ------
    ValueError
        If ``block_size`` is not positive (the packing loop could never terminate).

    Notes
    -----
    Tokens are accumulated across rows and flushed every time ``block_size`` tokens
    are available; a row that crosses a block boundary is split and its position ids
    simply continue in the next block. Tokens left over at the end are topped up with
    the leading tokens of the last written block so the tail is also exactly
    ``block_size`` long. When there is nothing left over, or no full block was ever
    written to borrow from, no tail sample is produced.
    """
    import shutil  # function-scope import: only needed to reset the output folder

    if block_size <= 0:
        raise ValueError(f'block_size must be positive, got {block_size}')

    rows, index = files
    out_root = f'tokenized-4k/tokenized-{index}'
    # Same effect as `rm -rf`, without interpolating the path into a shell command.
    shutil.rmtree(out_root, ignore_errors=True)
    count = 0  # tokens currently buffered in `temp`
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):
            prompt = tokenizer.apply_chat_template(row, tokenize=False)
            # The chat template already rendered the special tokens into the prompt text.
            input_ids = tokenizer(prompt, add_special_tokens = False)['input_ids']
            temp.append(input_ids)
            position_ids.append(range(len(input_ids)))
            count += len(input_ids)
            while count >= block_size:
                block, temp = slice_and_balance(temp, block_size)
                block_position, position_ids = slice_and_balance(position_ids, block_size)
                count = count - block_size
                out.write(collator(block, block_position))
                last_block = block
                last_position_block = block_position

        # Nothing left over: re-using the last block here would write it a second time.
        # No previous block: there is nothing to borrow from to fill the tail.
        if count == 0 or last_block is None:
            return None

        # Borrow just enough leading tokens from the last block to fill the tail.
        missing = block_size - count
        block, _ = slice_and_balance(last_block, missing)
        block_position, _ = slice_and_balance(last_position_block, missing)

        block.extend(temp)
        block_position.extend(position_ids)

        o = collator(block, block_position)
        if len(o['input_ids']) == block_size:
            out.write(o)
            return o
    return None

In [12]:
# Tokenize and pack every row of `combine` into shard folder 0 (tokenized-4k/tokenized-0);
# the displayed value is whatever `loop` returns.
loop((combine, 0))

100%|██████████████████████████████████████████████████████████████████████████████████| 41655/41655 [04:10<00:00, 166.33it/s]


{'input_ids': array([109067,  32173,  53804, ...,    268,     13, 128009], dtype=uint32),
 'position_ids': array([ 995,  996,  997, ..., 2004, 2005, 2006], dtype=uint32),
 'attention_mask': array([1752,  633,  687], dtype=uint32)}

In [13]:
def _shard_index(path):
    """Integer found after the last '-' of a shard folder path."""
    return int(path.rsplit('-', 1)[-1])

# Shard folders ordered numerically by their index (not lexicographically).
folders = sorted(glob('tokenized-4k/tokenized-*'), key=_shard_index)
folders

['tokenized-4k/tokenized-0']

In [14]:
# Remove any previous merged dataset so the merge below writes into a fresh folder.
!rm -rf packing-4k-reasoning

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [15]:
# Merge every per-shard folder into the single MDS dataset `packing-4k-reasoning`.
# The merge stays best-effort (a broken folder does not abort the others), but each
# failure is reported with its folder name and summarised at the end, because samples
# written before the error remain in the output.
failed_folders = []
with MDSWriter(out='packing-4k-reasoning', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            failed_folders.append(f)
            print(f'failed to merge {f}: {e!r}')

if failed_folders:
    print(f'WARNING: {len(failed_folders)} folder(s) were not merged completely: {failed_folders}')

100%|████████████████████████████████████████████████████████████████████████████████| 60163/60163 [00:03<00:00, 15877.00it/s]


In [16]:
dataset = LocalDataset('packing-4k-reasoning')
# Size of the merged dataset in billions of tokens.
# Assumes every sample holds 3072 tokens, i.e. the default `block_size` of `loop`.
tokens_per_sample = 3072
(len(dataset) * tokens_per_sample) / 1e9

0.184820736

In [17]:
# Spot-check: decode the third-from-last packed sample back to text.
tokenizer.decode(dataset[-3]['input_ids'])

'وسِكُمْ وَأَرْجُلَكُمْ إِلَى الْكَعْبَيْنِ\n\nMaksudnya: "Wahai orang-orang yang beriman, apabila kamu hendak mengerjakan sembahyang (padahal kamu berhadas kecil), maka (berwuduklah) iaitu basuhlah muka kamu, dan kedua belah tangan kamu meliputi siku, dan sapulah sebahagian dari kepala kamu, dan basuhlah kedua belah kaki kamu meliputi buku lali."\n\nSurah Al-Maidah (6)\n\nBerbalik kepada soalan di atas, kami nyatakan hukum bagi air liur ada dua. Sekiranya air liur tersebut dari mulut maka hukumnya suci. Manakala, jika ia keluar dari perut maka hukumnya adalah najis. Justeru itu, sebaiknya jika terdapat air liur basi, maka basuhlah bagi mengelakkan was-was.\n\nPerbahasan berkenaan air liur basi dan najis boleh juga di link bawah:\n\n#36 Status Air Liur Basi[1]\n\n#849 Menyentuh Najis Membatalkan Wuduk[2]\n\nOleh itu, kami nyatakan solat saudara sah kerana sebelum solat juga saudara telah membersihkan diri seperti menggosok gigi, berwuduk dan sebagainya.\n\nKami tegaskan di sini bahawa 

In [18]:
# Spot-check: decode the second-from-last packed sample back to text.
tokenizer.decode(dataset[-2]['input_ids'])

'ِّكِ دَيْنٌ أَكُنْتِ قَاضِيَةً؟ اقْضُوا اللَّهَ، فَاللَّهُ أَحَقُّ بِالْوَفَاءِ\n\nMaksudnya: "Seorang wanita daripada (kabilah) Juhainah datang kepada Nabi SAW lalu berkata: \'Sesungguhnya ibuku pernah bernazar untuk mengerjakan haji, namun dia belum mengerjakan haji sehinggalah dia wafat. Adakah aku boleh mengerjakan haji sebagai wakilnya?\' Baginda menjawab: \'Ya, kerjakanlah haji sebagai wakilnya. Apakah pandanganmu bahawa jika ibumu mempunyai hutang, adakah engkau akan melunaskannya? Tunaikanlah hak Allah, kerana sesungguhnya hak Allah lebih berhak untuk dipenuhi.\'"\n\nRiwayat al-Bukhari (1852)\n\nIbn Hajar al-Haitami menyebut, oleh kerana disamakan haji dengan hutang dan juga adanya tuntutan untuk melunaskannya, maka ia menunjukkan kewajipan mengerjakan haji bagi pihak si mati, dan perbelanjaannya diambil daripada harta si mati. Beliau menambah lagi, sekiranya si mati tidak meninggalkan harta pusaka, maka gugur kewajipan haji ke atasnya begitu juga tidak wajib seseorang waris m

In [19]:
from huggingface_hub import create_repo, delete_repo

# Recreate the dataset repo from scratch.
# WARNING: this deletes the existing repo (and everything in it) before creating it again.
repo_id = "huseinzol05/llama3.2-reasoning-multipack-3k"
try:
    delete_repo(repo_id=repo_id, repo_type="dataset")
except Exception as e:
    # Best effort: a failed delete (for example, the repo does not exist yet) must not
    # stop the cell. Only `Exception` is caught so KeyboardInterrupt / SystemExit still
    # propagate, and the reason is shown instead of being silently discarded.
    print(f'delete_repo skipped: {e!r}')
create_repo(repo_id, repo_type="dataset")

RepoUrl('https://huggingface.co/datasets/huseinzol05/llama3.2-reasoning-multipack-3k', endpoint='https://huggingface.co', repo_type='dataset', repo_id='huseinzol05/llama3.2-reasoning-multipack-3k')

In [20]:
from huggingface_hub import HfApi
# Push the merged MDS folder to the dataset repo on the Hugging Face Hub.
api = HfApi()

api.upload_folder(
    repo_id="huseinzol05/llama3.2-reasoning-multipack-3k",
    repo_type="dataset",
    folder_path="packing-4k-reasoning",
)

shard.00000.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00001.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Upload 23 LFS files:   0%|          | 0/23 [00:00<?, ?it/s]

shard.00003.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00002.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00004.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00005.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00006.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00007.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00008.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00009.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00010.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00011.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00012.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00013.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00014.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00015.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00016.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00017.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00018.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00019.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00020.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00021.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00022.mds:   0%|          | 0.00/3.74M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/huseinzol05/llama3.2-reasoning-multipack-3k/commit/87dc8a51f22437cf409d045415e618c162e798f3', commit_message='Upload folder using huggingface_hub', commit_description='', oid='87dc8a51f22437cf409d045415e618c162e798f3', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/huseinzol05/llama3.2-reasoning-multipack-3k', endpoint='https://huggingface.co', repo_type='dataset', repo_id='huseinzol05/llama3.2-reasoning-multipack-3k'), pr_revision=None, pr_num=None)