In [1]:
# !pip3 install hf-transfer -U

In [2]:
import os

os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

In [3]:
import huggingface_hub
import json
import os
from glob import glob
from tqdm import tqdm

In [4]:
from huggingface_hub import snapshot_download

folder = snapshot_download(repo_id="malaysia-ai/pretrain-text-dataset", repo_type = 'dataset')

In [5]:
files = glob(os.path.join(folder, '*'))
len(files)

313

In [6]:
from huggingface_hub import hf_hub_download

fineweb = hf_hub_download(
    repo_id="mesolitica/smollm-corpus-filter-malaysian-context", 
    filename="fineweb-edu-dedup.jsonl",
    repo_type = 'dataset'
)

In [7]:
social_media = [
    'iium-confession.jsonl',
    'b.cari.com.my.jsonl',
    'semisupervised-whisper-large-v2.jsonl',
    'lowyat.jsonl',
    'malay-tweets.jsonl',
    'c.cari.com.my.jsonl',
    'cn.cari.com.my.jsonl',
    'carigold.jsonl'
]
common_crawl = [
    'common-crawl.jsonl',
    'NLLB.jsonl',
]

rejected = social_media + common_crawl + [
    'pdfdrive.jsonl',
    'seehua.jsonl',
    'hardwarezone-sg.jsonl',
    'sinchew.com.my.jsonl',
    'orientaldaily.com.my.jsonl',
    'wikipedia-20230901.en.filtered.jsonl',
    'cc-100',
    'semisupervised-whisper-large-v2.jsonl',
    'c4-filtered',
    'the-pile',
    'c.cari.com.my',
    'sft-translation.jsonl',
    'README.md',
    'fineweb-edu-dedup-sample-5M.jsonl',
]

online_articles = [x for x in files if all([r not in x for r in rejected])]

In [8]:
selected = online_articles + [fineweb]

In [9]:
sizes = [(os.path.split(s)[1], os.path.getsize(s) / 1e6) for s in selected]

In [10]:
# !pip3 install transformers -U

In [15]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('HuggingFaceTB/SmolLM-360M')

In [12]:
def partition(text, size = 500):
    splitted = text.split()
    return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]

rejected = [
    'markah untuk setiap satu',
    'soalan mesti dijawab dalam',
    '25 markah',
    '50 markah'
]

In [13]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py

In [14]:
from streaming import MDSWriter, LocalDataset
import numpy as np
from streaming.base.format.mds.encodings import Encoding, _encodings

class UInt32(Encoding):
    def encode(self, obj) -> bytes:
        return obj.tobytes()

    def decode(self, data: bytes):
        return np.frombuffer(data, np.uint32)

_encodings['uint32'] = UInt32

columns = {
    'input_ids': 'uint32',
}
compression = 'zstd'
hashes = 'sha1', 'xxh64'

In [16]:
tokenizer = AutoTokenizer.from_pretrained('HuggingFaceTB/SmolLM-360M')
tokenizer.add_bos_token = False
tokenizer.add_eos_token = False

In [19]:
block_size = 4096

def read_dataset(f, block_size = block_size):
    temp = []
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                l = json.loads(l)
            except:
                pass
            try:
                data = '<|endoftext|>' + l + '<|endoftext|>'

                data_lower = data.lower()
                if any([r in data_lower for r in rejected]):
                    continue

                partitioned = partition(data)
                for p in partitioned:
                    tokenized = tokenizer(p)['input_ids']
                    temp.extend(tokenized)
                    while len(temp) >= block_size:
                        block = temp[:block_size]
                        temp = temp[block_size:]
                        if len(block) == block_size:
                            yield np.array(block).astype(np.uint32)


            except Exception as e:
                pass

In [20]:
!mkdir smollm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [21]:
def loop(files):
    files, index = files
    out_root = f'smollm/tokenized-{index}'
    os.system(f'rm -rf {out_root}')
    with MDSWriter(out=out_root, columns=columns, compression=None, hashes=hashes) as out:
        for f in files:
            for block in read_dataset(f = f):
                sample = {
                    'input_ids': block
                }
                out.write(sample)

In [22]:
# loop((files[:1], 0))

In [23]:
import mp

mp.multiprocessing(files, loop, cores = min(len(selected), 4), returned = False)

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
69it [00:00, 667.63it/s]s]huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism t

In [24]:
!du -hs smollm

188G	smollm


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [31]:
folders = glob('smollm/*')
folders

['smollm/tokenized-2',
 'smollm/tokenized-0',
 'smollm/tokenized-1',
 'smollm/tokenized-3',
 'smollm/tokenized-4']

In [32]:
folders = glob('smollm/*')

with MDSWriter(out='combine-smollm', columns=columns, compression=None, hashes=hashes) as out:
    for f in folders:
        try:
            dataset = LocalDataset(local=f)
            for i in tqdm(range(len(dataset))):
                out.write(dataset[i])
        except Exception as e:
            print(e)
            pass

100%|██████████| 3675929/3675929 [15:25<00:00, 3973.04it/s]
100%|██████████| 2125123/2125123 [08:51<00:00, 3999.32it/s]
100%|██████████| 1915066/1915066 [08:02<00:00, 3972.20it/s]
100%|██████████| 4535496/4535496 [19:01<00:00, 3971.74it/s] 
100%|██████████| 16/16 [00:00<00:00, 4428.17it/s]


In [34]:
!du -hs combine-smollm

188G	combine-smollm


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [35]:
!rm -rf smollm

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [36]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         100G   21G   80G  21% /
tmpfs            64M     0   64M   0% /dev
tmpfs           7.7G     0  7.7G   0% /sys/fs/cgroup
/dev/nvme1n1    688G  345G  343G  51% /home/ubuntu
/dev/nvme0n1p1  100G   21G   80G  21% /etc/hosts
shm              64M  4.0K   64M   1% /dev/shm
tmpfs            15G   12K   15G   1% /run/secrets/kubernetes.io/serviceaccount
tmpfs           7.7G     0  7.7G   0% /proc/acpi
tmpfs           7.7G     0  7.7G   0% /sys/firmware


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [43]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_folder(
    folder_path="combine-smollm",
    repo_id="huseinzol05/smollm-4096-2024-08-31",
    repo_type="dataset",
)