In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('unsloth/Meta-Llama-3.1-8B-Instruct')

In [2]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
}
hashes = 'sha1', 'xxh64'

In [3]:
!mkdir tokenized

mkdir: cannot create directory ‘tokenized’: File exists


In [4]:
combine = []
with open('combined-instructions-language.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        combine.append(l)

2032625it [00:14, 140510.96it/s]


In [5]:
with open('reasoning_instructions.json') as fopen:
    reasoning = json.load(fopen)
    
combine.extend(reasoning)

In [6]:
import gc

def collator(batch, batch_position_ids):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
    }

def slice_and_balance(nested_list, size):
    first = []
    balance = []
    current_size = 0

    for sublist in nested_list:
        if current_size < size:
            remaining_space = size - current_size
            if len(sublist) <= remaining_space:
                first.append(sublist)
                current_size += len(sublist)
            else:
                first.append(sublist[:remaining_space])
                balance.append(sublist[remaining_space:])
                current_size = size
        else:
            balance.append(sublist)
    
    return first, balance

In [7]:
block_size = 16384
count = 0
temp = []
position_ids = []
found = False
for row in tqdm(combine[:10000]):
    prompt = tokenizer.apply_chat_template(row, tokenize=False)
    outputs = tokenizer(prompt, add_special_tokens = False)
    temp.append(outputs['input_ids'])
    position_ids.append(range(len(outputs['input_ids'])))
    count += len(outputs['input_ids'])
    while count >= block_size:
        block, temp = slice_and_balance(temp, block_size)
        block_position, position_ids = slice_and_balance(position_ids, block_size)
        count = count - block_size
        found = True
        break
    if found:
        break

  0%|                                       | 17/10000 [00:00<00:23, 420.14it/s]


In [8]:
count

687

In [9]:
%%time

o = collator(block, block_position)

CPU times: user 818 µs, sys: 202 µs, total: 1.02 ms
Wall time: 988 µs


In [10]:
import time

def loop(files, block_size = 8192):
    rows, index = files
    out_root = f'tokenized/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    last_block, last_position_block = None, None
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):
            prompt = tokenizer.apply_chat_template(row, tokenize=False)
            outputs = tokenizer(prompt, add_special_tokens = False)
            temp.append(outputs['input_ids'])
            position_ids.append(range(len(outputs['input_ids'])))
            count += len(outputs['input_ids'])
            while count >= block_size:
                block, temp = slice_and_balance(temp, block_size)
                block_position, position_ids = slice_and_balance(position_ids, block_size)
                count = count - block_size
                o = collator(block, block_position)
                last_block = block
                last_position_block = block_position
                out.write(o)
                
        block, _ = slice_and_balance(last_block, block_size - count)
        block_position, _ = slice_and_balance(last_position_block, block_size - count)

        block.extend(temp)
        block_position.extend(position_ids)

        o = collator(block, block_position)
        if len(o['input_ids']) == block_size:
            out.write(o)
            return o

In [11]:
loop((combine[:10000], 0))

100%|████████████████████████████████████| 10000/10000 [00:13<00:00, 751.30it/s]


{'input_ids': array([  9188,   2308,    276, ...,    276,     13, 128009], dtype=uint32),
 'position_ids': array([244, 245, 246, ..., 902, 903, 904], dtype=uint32),
 'attention_mask': array([ 651,   73,  399,  838,  977,  889, 1046,  920,  804,  119,   87,
         484,  905], dtype=uint32)}

In [None]:
from multiprocess import Pool
import mp

chunks = mp.chunks(combine, 100000)
pool = Pool(15)
pooled = pool.map(loop, chunks)
pool.close()
pool.join()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [None]:
folders = sorted(glob('tokenized/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders

In [None]:
dataset = LocalDataset(local = 'tokenized/tokenized-0')
dataset[0]

In [None]:
!rm -rf packing

In [None]:
with MDSWriter(out='packing', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

In [None]:
dataset = LocalDataset('packing')

In [None]:
from huggingface_hub import create_repo, delete_repo

try:
    delete_repo(repo_id="mesolitica/malaysian-llama3.1-8k-language-multipack", repo_type="dataset")
except:
    pass
create_repo("mesolitica/malaysian-llama3.1-8k-language-multipack", repo_type="dataset", private = True)

In [None]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path="packing",
    repo_id="mesolitica/malaysian-llama3.1-8k-language-multipack",
    repo_type="dataset",
)