In [1]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('unsloth/Meta-Llama-3.1-8B-Instruct')

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [2]:
from streaming import MDSWriter
from streaming.base.format.mds.encodings import Encoding, _encodings
from streaming import LocalDataset
import streaming
import numpy as np
from tqdm import tqdm
from glob import glob
import os
import json

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
    'position_ids': 'uint32',
    'attention_mask': 'uint32',
}
hashes = 'sha1', 'xxh64'

In [3]:
!mkdir tokenized

mkdir: cannot create directory ‘tokenized’: File exists


In [4]:
combine = []
with open('combined-instructions-language.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        combine.append(l)

897800it [00:06, 135737.12it/s]


In [5]:
import gc

def collator(batch, batch_position_ids):
    input_ids = []
    position_ids = []
    masks = []
    for i in range(len(batch)):
        l = len(batch[i])
        input_ids.extend(batch[i])
        position_ids.extend(batch_position_ids[i])
        masks.append(l)
    
    return {
        'input_ids': np.array(input_ids).astype(np.uint32),
        'position_ids': np.array(position_ids).astype(np.uint32),
        'attention_mask': np.array(masks).astype(np.uint32),
    }

def slice_and_balance(lists, target_length):
    flat_list = [item for sublist in lists for item in sublist]
    main_flat = flat_list[:target_length]
    balance_flat = flat_list[target_length:]

    def to_sublists(flat):
        result, temp = [], []
        for item in flat:
            temp.append(item)
            if len(temp) == len(lists[len(result)]):
                result.append(temp)
                temp = []
        if temp:
            result.append(temp)
        return result

    main_part = to_sublists(main_flat)
    balance_part = to_sublists(balance_flat)

    return main_part, balance_part

In [6]:
block_size = 16384
count = 0
temp = []
position_ids = []
found = False
for row in tqdm(combine[:10000]):
    prompt = tokenizer.apply_chat_template(row, tokenize=False)
    outputs = tokenizer(prompt, add_special_tokens = False)
    temp.append(outputs['input_ids'])
    position_ids.append(range(len(outputs['input_ids'])))
    count += len(outputs['input_ids'])
    while count >= block_size:
        block, temp = slice_and_balance(temp, block_size)
        block_position, position_ids = slice_and_balance(position_ids, block_size)
        count = count - block_size
        found = True
        break
    if found:
        break

  2%|█▍                                                                                     | 165/10000 [00:00<00:03, 2512.83it/s]


In [7]:
%%time

o = collator(block, block_position)

CPU times: user 1.1 ms, sys: 0 ns, total: 1.1 ms
Wall time: 1.03 ms


In [8]:
import time

def loop(files, block_size = 20480):
    rows, index = files
    out_root = f'tokenized/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    count = 0
    temp = []
    position_ids = []
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for row in tqdm(rows):
            prompt = tokenizer.apply_chat_template(row, tokenize=False)
            outputs = tokenizer(prompt, add_special_tokens = False)
            temp.append(outputs['input_ids'])
            position_ids.append(range(len(outputs['input_ids'])))
            count += len(outputs['input_ids'])
            while count >= block_size:
                block, temp = slice_and_balance(temp, block_size)
                block_position, position_ids = slice_and_balance(position_ids, block_size)
                count = count - block_size
                o = collator(block, block_position)
                out.write(o)

In [9]:
import mp

mp.multiprocessing(combine, loop, cores = 10, returned = False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

In [10]:
folders = sorted(glob('tokenized/tokenized-*'), key = lambda x: int(x.split('-')[-1]))
folders

['tokenized/tokenized-0',
 'tokenized/tokenized-1',
 'tokenized/tokenized-2',
 'tokenized/tokenized-3',
 'tokenized/tokenized-4',
 'tokenized/tokenized-5',
 'tokenized/tokenized-6',
 'tokenized/tokenized-7',
 'tokenized/tokenized-8',
 'tokenized/tokenized-9',
 'tokenized/tokenized-10']

In [11]:
!rm -rf packing

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [12]:
with MDSWriter(out='packing', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

100%|█████████████████████████████████████████████████████████████████████████████████████████| 957/957 [00:00<00:00, 2542.51it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 1469/1469 [00:00<00:00, 2617.30it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 3308/3308 [00:01<00:00, 2104.15it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 5728/5728 [00:02<00:00, 2123.77it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████| 742/742 [00:00<00:00, 3180.91it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 1054/1054 [00:00<00:00, 1788.32it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████| 3397/3397 [00:01<00:00, 2342.60it/s]
100%|██████████████████████████████████████████████████████████████████████████████

index -1 is out of bounds for axis 0 with size 0


In [13]:
dataset[-1]['input_ids']

array([104823,    150,     97, ...,  39287,   8826,   2004], dtype=uint32)

In [24]:
dataset = LocalDataset('packing')

In [15]:
(len(dataset) * 20480) / 1e9

0.58191872

In [16]:
(len(dataset) * 20480) / 1e9

0.58191872

In [17]:
from huggingface_hub import create_repo, delete_repo

try:
    delete_repo(repo_id="mesolitica/malaysian-llama3.1-24k-language-multipack", repo_type="dataset")
except:
    pass
create_repo("mesolitica/malaysian-llama3.1-24k-language-multipack", repo_type="dataset", private = True)

RepoUrl('https://huggingface.co/datasets/mesolitica/malaysian-llama3.1-24k-language-multipack', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/malaysian-llama3.1-24k-language-multipack')

In [18]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path="packing",
    repo_id="mesolitica/malaysian-llama3.1-24k-language-multipack",
    repo_type="dataset",
)

shard.00002.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00004.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00000.mds:   0%|          | 0.00/67.0M [00:00<?, ?B/s]

shard.00001.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00003.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

Upload 70 LFS files:   0%|          | 0/70 [00:00<?, ?it/s]

shard.00005.mds:   0%|          | 0.00/67.0M [00:00<?, ?B/s]

shard.00006.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00007.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00008.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00009.mds:   0%|          | 0.00/67.0M [00:00<?, ?B/s]

shard.00010.mds:   0%|          | 0.00/67.0M [00:00<?, ?B/s]

shard.00011.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00012.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00013.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00014.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00015.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00016.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00017.mds:   0%|          | 0.00/67.0M [00:00<?, ?B/s]

shard.00018.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00019.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00020.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00021.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00022.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00023.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00024.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00025.mds:   0%|          | 0.00/67.0M [00:00<?, ?B/s]

shard.00026.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00027.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00028.mds:   0%|          | 0.00/67.0M [00:00<?, ?B/s]

shard.00029.mds:   0%|          | 0.00/67.0M [00:00<?, ?B/s]

shard.00030.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00031.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00032.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00033.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00034.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00035.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00036.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00037.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00038.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00039.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00040.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00041.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00042.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00043.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00044.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00045.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00046.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00047.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00048.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00049.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00050.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00051.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00052.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00053.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00054.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00055.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00056.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00057.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00058.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00059.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00060.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00061.mds:   0%|          | 0.00/67.0M [00:00<?, ?B/s]

shard.00062.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00063.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00064.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00065.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00066.mds:   0%|          | 0.00/67.1M [00:00<?, ?B/s]

shard.00067.mds:   0%|          | 0.00/67.0M [00:00<?, ?B/s]

shard.00068.mds:   0%|          | 0.00/67.0M [00:00<?, ?B/s]

shard.00069.mds:   0%|          | 0.00/33.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/malaysian-llama3.1-24k-language-multipack/commit/c4c258c3ff992aa1fde39fa1a2dd0a61b6cab788', commit_message='Upload folder using huggingface_hub', commit_description='', oid='c4c258c3ff992aa1fde39fa1a2dd0a61b6cab788', pr_url=None, pr_revision=None, pr_num=None)