In [1]:
# !pip3 install hf-transfer -U

In [2]:
import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

In [3]:
import huggingface_hub
import json
import os
from glob import glob
from tqdm import tqdm

In [4]:
from huggingface_hub import snapshot_download

folder = snapshot_download(repo_id="malaysia-ai/pretrain-text-dataset", repo_type = 'dataset')

In [5]:
files = glob(os.path.join(folder, '*'))
len(files)

313

In [6]:
from huggingface_hub import hf_hub_download

fineweb = hf_hub_download(
    repo_id="mesolitica/smollm-corpus-filter-malaysian-context", 
    filename="fineweb-edu-dedup.jsonl",
    repo_type = 'dataset'
)

In [7]:
social_media = [
    'iium-confession.jsonl',
    'b.cari.com.my.jsonl',
    'semisupervised-whisper-large-v2.jsonl',
    'lowyat.jsonl',
    'malay-tweets.jsonl',
    'c.cari.com.my.jsonl',
    'cn.cari.com.my.jsonl',
    'carigold.jsonl'
]
common_crawl = [
    'common-crawl.jsonl',
    'NLLB.jsonl',
]

rejected = social_media + common_crawl + [
    'pdfdrive.jsonl',
    'seehua.jsonl',
    'hardwarezone-sg.jsonl',
    'sinchew.com.my.jsonl',
    'orientaldaily.com.my.jsonl',
    'wikipedia-20230901.en.filtered.jsonl',
    'cc-100',
    'semisupervised-whisper-large-v2.jsonl',
    'c4-filtered',
    'the-pile',
    'c.cari.com.my',
    'sft-translation.jsonl',
    'README.md',
    'fineweb-edu-dedup-sample-5M.jsonl',
]

online_articles = [x for x in files if all([r not in x for r in rejected])]

In [8]:
selected = online_articles + [fineweb]

In [9]:
sizes = [(os.path.split(s)[1], os.path.getsize(s) / 1e6) for s in selected]

In [10]:
# !pip3 install transformers -U

In [11]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2-0.5B')

In [12]:
def partition(text, size = 500):
    splitted = text.split()
    return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]

rejected = [
    'markah untuk setiap satu',
    'soalan mesti dijawab dalam',
    '25 markah',
    '50 markah'
]

In [13]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py

In [27]:
from streaming import MDSWriter, LocalDataset
import numpy as np
from streaming.base.format.mds.encodings import Encoding, _encodings

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
}
compression = 'zstd'
hashes = 'sha1', 'xxh64'

In [15]:
tokenizer = AutoTokenizer.from_pretrained('Qwen/Qwen2-0.5B')
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False

In [16]:
block_size = 4096

def read_dataset(f, block_size = block_size):
    temp = []
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                l = json.loads(l)
            except:
                pass
            try:
                data = '<|endoftext|>' + l + '<|endoftext|>'

                data_lower = data.lower()
                if any([r in data_lower for r in rejected]):
                    continue

                partitioned = partition(data)
                for p in partitioned:
                    tokenized = tokenizer(p)['input_ids']
                    temp.extend(tokenized)
                    while len(temp) >= block_size:
                        block = temp[:block_size]
                        temp = temp[block_size:]
                        if len(block) == block_size:
                            yield np.array(block).astype(np.uint32)


            except Exception as e:
                pass

In [17]:
!mkdir qwen

mkdir: cannot create directory ‘qwen’: File exists


In [18]:
def loop(files):
    files, index = files
    out_root = f'qwen/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for f in files:
            for block in read_dataset(f = f):
                sample = {
                    'input_ids': block
                }
                out.write(sample)

In [19]:
# loop((files[:1], 0))

In [20]:
import mp

mp.multiprocessing(files, loop, cores = min(len(selected), 4), returned = False)

2877it [00:06, 431.78it/s]s]
947it [00:03, 298.76it/s]]s]
391it [00:01, 373.97it/s]/s]
23972it [00:37, 667.83it/s]s]
24411it [00:37, 643.76it/s]s]
3626it [00:04, 861.12it/s]/s]
46it [00:00, 653.32it/s]] /s]
11076it [00:11, 955.67it/s]s]
4577it [00:08, 509.38it/s]/s]
1160it [00:02, 576.20it/s]/s]
1359it [00:11, 113.34it/s]s]]
16443it [00:19, 837.02it/s]s]
51691it [00:47, 1098.92it/s]]
281536it [01:43, 2709.08it/s]
65it [00:00, 179.09it/s]/s]s]
16120it [02:31, 106.52it/s]s]
28it [00:00, 12000.87it/s]
11314it [00:03, 3335.75it/s]]
7363it [00:08, 819.66it/s]s]s]
33730it [00:24, 1402.89it/s]s]
24482it [00:26, 938.82it/s]/s]
13145it [00:15, 823.77it/s]/s]
3578it [00:03, 1068.23it/s]/s]
5747it [00:05, 1101.26it/s]/s]
523it [00:00, 757.36it/s]it/s]
644it [00:01, 611.61it/s]]t/s]
3002884it [04:26, 12008.84it/s]
3019527it [04:28, 11709.76it/s]
233it [00:00, 588.77it/s]it/s]]
3072159it [04:32, 11470.13it/s]Token indices sequence length is longer than the specified maximum sequence length for this

In [22]:
!du -hs qwen

158G	qwen


In [37]:
dataset = LocalDataset('qwen/tokenized-0')
len(dataset)

1906747

In [39]:
folders = glob('qwen/*')

['qwen/tokenized-2',
 'qwen/tokenized-0',
 'qwen/tokenized-1',
 'qwen/tokenized-3',
 'qwen/tokenized-4']

In [40]:
folders = glob('qwen/*')

with MDSWriter(out='combine-all', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

100%|██████████| 3287073/3287073 [14:48<00:00, 3700.47it/s]
100%|██████████| 1906747/1906747 [08:39<00:00, 3670.58it/s]
100%|██████████| 1524703/1524703 [06:52<00:00, 3695.14it/s]
100%|██████████| 3618487/3618487 [17:31<00:00, 3440.80it/s]
100%|██████████| 13/13 [00:00<00:00, 2515.03it/s]


In [41]:
!du -hs combine-all

158G	combine-all


In [48]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path="combine-all",
    repo_id="huseinzol05/qwen2-4096-2024-08-31",
    repo_type="dataset",
)