In [1]:
# !pip3 install transformers msgspec

In [2]:
from glob import glob
from tqdm import tqdm
import json
import os
import numpy as np
from transformers import AutoTokenizer
from tqdm import tqdm
import msgspec
import json
import re

# HTTP status lines in "<code> <reason phrase>" form; appended to the
# content reject markers in the next cell.
http_errors = [
    # 4xx
    "400 Bad Request",
    "401 Unauthorized",
    "402 Payment Required",
    "403 Forbidden",
    "404 Not Found",
    "405 Method Not Allowed",
    "406 Not Acceptable",
    "407 Proxy Authentication Required",
    "408 Request Timeout",
    "409 Conflict",
    "410 Gone",
    "411 Length Required",
    "412 Precondition Failed",
    "413 Payload Too Large",
    "414 URI Too Long",
    "415 Unsupported Media Type",
    "416 Range Not Satisfiable",
    "417 Expectation Failed",
    "418 I'm a teapot",
    "421 Misdirected Request",
    "422 Unprocessable Entity",
    "423 Locked",
    "424 Failed Dependency",
    "425 Too Early",
    "426 Upgrade Required",
    "428 Precondition Required",
    "429 Too Many Requests",
    "431 Request Header Fields Too Large",
    "451 Unavailable For Legal Reasons",
    # 5xx
    "500 Internal Server Error",
    "501 Not Implemented",
    "502 Bad Gateway",
    "503 Service Unavailable",
    "504 Gateway Timeout",
    "505 HTTP Version Not Supported",
    "506 Variant Also Negotiates",
    "507 Insufficient Storage",
    "508 Loop Detected",
    "510 Not Extended",
    "511 Network Authentication Required",
]

In [3]:
# Substrings that `reject` looks for in a document's text: a handful of
# boilerplate phrases followed by every entry of `http_errors`.
# NOTE: a later cell rebinds `rejected` to a list of file-name fragments.
rejected = [
    'Internal Server Error',
    '__NOEDITSECTION__',
    'enter your username and password',
    'forgotten your password',
    'cookies enabled',
    'enable JavaScript in your browser.',
    'The page cannot be displayed',
    'site or edit the error_page',
    'Request unsuccessful',
    *http_errors,
]

def replace_multiple(input_string, pattern =r"\s{6,}", replace = '   '):
    """Substitute every match of `pattern` in `input_string` with `replace`.

    With the default arguments, each run of six or more whitespace
    characters is collapsed to three spaces.
    """
    matcher = re.compile(pattern)
    return matcher.sub(replace, input_string)

def replace(string):
    """Normalise a text string in three steps.

    1. Swap each literal 'â€¦' sequence for a single '.'
       (presumably a mis-decoded ellipsis character — TODO confirm).
    2. Collapse long whitespace runs using the defaults of `replace_multiple`.
    3. Shorten any run of six or more dots to '...'.
    """
    dotted = string.replace('â€¦', '.')
    spaced = replace_multiple(dotted)
    return replace_multiple(spaced, pattern = r"\.{6,}", replace = '...')

def reject(string, patterns = None):
    """Report whether `string` contains any reject marker.

    Args:
        string: text to inspect.
        patterns: optional iterable of substrings to search for. When left
            as None, the module-level `rejected` list is looked up at call
            time (the original behaviour).

    Returns:
        True if at least one marker occurs in `string`, otherwise False.
    """
    # `rejected` is rebound further down the notebook to a list of file-name
    # fragments, so callers that need the content markers after that point
    # should pass them explicitly via `patterns`.
    if patterns is None:
        patterns = rejected
    # Generator form lets any() stop at the first hit instead of testing
    # every marker up front.
    return any(marker in string for marker in patterns)

In [4]:
def partition(text, size = 500):
    """Break `text` into chunks of at most `size` whitespace-separated words.

    The text is split on whitespace and the words of each chunk are joined
    back with single spaces. Returns the list of chunk strings (empty when
    `text` has no words).
    """
    words = text.split()
    chunks = []
    for start in range(0, len(words), size):
        chunks.append(' '.join(words[start: start + size]))
    return chunks

In [5]:
# Collect every .jsonl file sitting directly in the dataset directory; the
# trailing expression displays how many were found.
files = glob('/home/ubuntu/dedup-text-dataset/*.jsonl')
len(files)

274

In [6]:
# File names (relative to the dataset directory) bucketed by source group.
# Every group except `social_media` is unioned into `combine` and removed
# from the candidate list; `social_media` and `common_crawl` are also reused
# when the file-name reject list is built.
wiki = [
    'wikipedia-2023-10-01.jsonl',
    'wikipedia-jawi.jsonl',
    'wikipedia-20230901.en.filtered.jsonl',
]
language_related = [
    'dictionary.jsonl',
    'dewanbahasa-jdbp.jsonl',
    'dialect.jsonl',
    'kamusbm.jsonl',
    'wiktionary-bahasa.jsonl',
]
gov_related = [
    'hansard.jsonl',
    'lom.agc.gov.my.jsonl',
    'parlimen-gov.jsonl',
    'data.gov.my.jsonl',
    'mufti_wilayah_articles.jsonl',
    'e-khutbah.jsonl',
    # NOTE(review): this entry used to appear twice in the list; the repeat
    # was dropped. If a different file was intended, add it here.
    'mufti_negeri_sem_artikel.jsonl',
    'mufti_perlis_artikel.jsonl',
    'gov.my.jsonl',
    'edu.my.jsonl',
]
research_papers = [
    'academia-edu.jsonl',
    'eprints.jsonl',
]
social_media = [
    'iium-confession.jsonl',
    'b.cari.com.my.jsonl',
    'semisupervised-whisper-large-v2.jsonl',
    'lowyat.jsonl',
    'malay-tweets.jsonl',
    'c.cari.com.my.jsonl',
    'cn.cari.com.my.jsonl',
    'carigold.jsonl',
    'cc-100.jsonl',
    'salary-sg.jsonl',
]
common_crawl = [
    'common-crawl.jsonl',
    'NLLB.jsonl',
]
buku_teks = [
    'buku-teks.jsonl',
    'bumigemilang.com.jsonl',
    'tcer.my.jsonl',
    'mysoalan.com.jsonl',
]

In [7]:
# Union of every curated source group except `social_media`.
combine = set().union(
    wiki,
    language_related,
    gov_related,
    research_papers,
    common_crawl,
    buku_teks,
)

In [8]:
# Turn the bare file names into full paths so they are comparable with the
# entries of `files`.
combine = {
    os.path.join('/home/ubuntu/dedup-text-dataset', name)
    for name in combine
}

In [9]:
# Everything found on disk that is not in one of the curated groups, in
# sorted order.
online_articles = sorted(set(files).difference(combine))
len(online_articles)

248

In [10]:
# File-name fragments to drop from `online_articles` (matched as substrings
# of the path in a later cell).
# NOTE: this rebinds `rejected`, the same global that `reject()` reads.
rejected = [
    *social_media,
    *common_crawl,
    'pdfdrive.jsonl',
    'hardwarezone-sg.jsonl',
    'sinchew.com.my.jsonl',
    'orientaldaily.com.my.jsonl',
    'cc-100',
    'news.jsonl',
]

In [11]:
# Drop any path that mentions one of these three sources.
online_articles = [
    path
    for path in online_articles
    if not any(marker in path for marker in ('c4-filtered', 'the-pile', 'c.cari.com.my'))
]

In [12]:
# Keep only the paths that contain none of the `rejected` fragments.
online_articles = [
    path
    for path in online_articles
    if not any(fragment in path for fragment in rejected)
]
len(online_articles)

231

In [19]:
# Number of token ids per packed block; also the default value of
# `read_dataset`'s `block_size` parameter.
block_size = 2048

def partition(text, size = 500):
    """Split `text` on whitespace and regroup the words into chunks.

    Each chunk holds at most `size` words joined by single spaces; the list
    of chunks is returned. (Same definition as the earlier `partition` cell,
    repeated here alongside the functions that use it.)
    """
    tokens = text.split()
    starts = range(0, len(tokens), size)
    pieces = []
    for begin in starts:
        pieces.append(' '.join(tokens[begin: begin + size]))
    return pieces

def read_dataset(tokenizer, train_file, block_size = block_size):
    """Tokenize a JSONL file and write it back out as fixed-length text blocks.

    Every line of `train_file` is JSON-decoded, cut into word chunks with
    `partition`, and tokenized; the resulting ids are accumulated in a running
    buffer. Whenever the buffer holds at least `block_size` ids, the first
    `block_size` of them are decoded back to text and written as one
    JSON-encoded line to `<train_file>.tokenized`. Ids left in the buffer
    after the last line (fewer than `block_size`) are not written.

    Args:
        tokenizer: callable returning a mapping with an 'input_ids' entry, and
            providing `decode` for a list of ids.
        train_file: path of the input JSONL file.
        block_size: number of token ids per emitted block.

    Returns:
        None. Returns before opening anything when `train_file` exists and is
        larger than 1 MiB.
    """
    # NOTE(review): this guard looks at the *input* file's size, so inputs
    # over 1 MiB are never processed. Confirm this is intended and not a
    # check meant for the '.tokenized' output.
    if os.path.exists(train_file):
        size_mib = os.stat(train_file).st_size / 1024 ** 2
        if size_mib > 1:
            return

    pending_ids = []

    # The output is opened (and truncated) before the input is opened.
    with open(f'{train_file}.tokenized', 'w') as writer, open(train_file) as reader:
        for raw_line in tqdm(reader):
            document = msgspec.json.decode(raw_line)
            for chunk in partition(document):
                pending_ids.extend(tokenizer(chunk)['input_ids'])
                # Drain as many full blocks as the buffer currently holds.
                while len(pending_ids) >= block_size:
                    block, pending_ids = pending_ids[:block_size], pending_ids[block_size:]
                    if len(block) != block_size:
                        continue
                    decoded = tokenizer.decode(block)
                    writer.write(f'{json.dumps(decoded)}\n')
                    writer.flush()
                        
def loop(files):
    """Worker entry point: tokenize every file in one work item.

    Args:
        files: a two-element sequence whose first element is the list of file
            paths to process; the second element is ignored.
    """
    paths, _ = files
    tokenizer = AutoTokenizer.from_pretrained('mistralai/Mixtral-8x7B-Instruct-v0.1')
    # Disable automatic BOS/EOS insertion on the tokenizer.
    tokenizer.add_bos_token = False
    tokenizer.add_eos_token = False
    for path in paths:
        read_dataset(tokenizer, path)

In [20]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py

In [21]:
import mp

# Hand `online_articles` to `loop` through the mp helper, using 30 cores and
# not collecting return values.
mp.multiprocessing(
    online_articles,
    loop,
    cores = 30,
    returned = False,
)

4it [00:00, 39.75it/s]/s]
4it [00:00, 39.72it/s]s]
68it [00:00, 420.27it/s]s]
40it [00:00, 184.06it/s]]]
56it [00:00, 169.91it/s]/s]
46it [00:00, 146.59it/s]
172it [00:00, 503.91it/s]
1362it [00:00, 4315.99it/s]
43it [00:00, 108.65it/s]
70it [00:00, 199.58it/s]s]

70it [00:00, 176.62it/s]
91it [00:00, 218.49it/s]
48it [00:00, 126.30it/s]]
12it [00:00, 25.34it/s]]/s]
42it [00:00, 205.58it/s]
58it [00:00, 98.06it/s]s]s]
24it [00:00, 95.78it/s]]
15it [00:00, 43.56it/s]s]s]
140it [00:00, 248.67it/s]s]
84it [00:00, 91.67it/s]s]s]
168it [00:01, 148.28it/s]s]
297it [00:01, 233.21it/s]s]
118it [00:01, 81.74it/s]]
233it [00:01, 147.74it/s]s]
276it [00:01, 173.21it/s]
19it [00:00, 188.81it/s]]s]
216it [00:01, 129.91it/s]
54it [00:01, 33.67it/s]
473it [00:01, 368.28it/s]]]
8593it [00:01, 4837.27it/s]
48it [00:00, 174.96it/s]
65it [00:01, 44.77it/s]
151it [00:01, 79.93it/s]
37it [00:00, 107.27it/s]/s]
87it [00:00, 153.12it/s]
1578it [00:00, 2885.13it/s]
74it [00:01, 37.65it/s]s]
153it [00:00, 153.

In [33]:
# Clone the target Hugging Face dataset repository, then copy every
# '.tokenized' file from the dataset directory into it.
# NOTE(review): both paths are relative, so this assumes the working directory
# is the one that contains dedup-text-dataset/ — TODO confirm.
!git clone https://huggingface.co/datasets/malaysia-ai/online-articles-partition
!cp dedup-text-dataset/*.tokenized online-articles-partition