In [1]:
import json
import os
from glob import glob
from tqdm import tqdm

In [2]:
files = glob('/home/ubuntu/dedup-text-dataset/*.jsonl')
len(files)

274

In [12]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained('TinyLlama/TinyLlama-1.1B-intermediate-step-955k-token-2T')

Downloading tokenizer_config.json:   0%|          | 0.00/776 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [3]:
def partition(text, size = 500):
    splitted = text.split()
    return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]

In [4]:
wiki = [
    'wikipedia-2023-10-01.jsonl',
    'wikipedia-jawi.jsonl',
]
language_related = [
    'dictionary.jsonl',
    'dewanbahasa-jdbp.jsonl',
    'dialect.jsonl',
    'kamusbm.jsonl',
]
gov_related = [
    'hansard.jsonl',
    'lom.agc.gov.my.jsonl',
    'parlimen-gov.jsonl',
    'data.gov.my.jsonl',
    'mufti_wilayah_articles.jsonl',
    'e-khutbah.jsonl',
    'mufti_negeri_sem_artikel.jsonl',
    'mufti_perlis_artikel.jsonl',
    'mufti_negeri_sem_artikel.jsonl',
    'gov.my.jsonl',
    'edu.my.jsonl',
]
research_papers = [
    'academia-edu.jsonl',
    'eprints.jsonl',
]
social_media = [
    'iium-confession.jsonl',
    'b.cari.com.my.jsonl',
    'semisupervised-whisper-large-v2.jsonl',
    'lowyat.jsonl',
    'malay-tweets.jsonl',
    'c.cari.com.my.jsonl',
    'cn.cari.com.my.jsonl',
    'carigold.jsonl'
]
common_crawl = [
    'common-crawl.jsonl',
    'NLLB.jsonl',
]
buku_teks = [
    'buku-teks.jsonl',
    'bumigemilang.com.jsonl',
    'tcer.my.jsonl',
    'mysoalan.com.jsonl'
]

In [5]:
combine = set(wiki) | set(language_related) | set(gov_related) | set(research_papers) | set(common_crawl) | set(buku_teks)

In [6]:
combine = {os.path.join('/home/ubuntu/dedup-text-dataset', f) for f in combine}

In [7]:
online_articles = sorted(list(set(files) - combine))
len(online_articles)

250

In [8]:
rejected = social_media + common_crawl + [
    'pdfdrive.jsonl',
    'seehua.jsonl',
    'hardwarezone-sg.jsonl',
    'sinchew.com.my.jsonl',
    'orientaldaily.com.my.jsonl'
]

In [9]:
online_articles = [x for x in online_articles if 'c4-filtered' not in x and 'the-pile' not in x and 'c.cari.com.my' not in x]

In [10]:
online_articles = [x for x in online_articles if all([r not in x for r in rejected])]
len(online_articles)

235

In [11]:
a = open('combine-lm.jsonl', 'w')

In [14]:
for f in wiki:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

438316it [00:11, 38763.25it/s]
722837it [00:24, 29056.52it/s]


In [15]:
for f in language_related:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

54712it [00:01, 35170.51it/s]
4577it [00:00, 10573.12it/s]
66it [00:00, 38581.75it/s]
34192it [00:00, 66805.22it/s]


In [16]:
for f in buku_teks:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass


183it [00:05, 35.59it/s] 
16120it [00:10, 1570.56it/s]
3625it [00:00, 4788.09it/s]
900it [00:00, 4059.16it/s]


In [17]:
for f in gov_related:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

140932it [00:18, 7506.65it/s] 
1359it [00:00, 2502.77it/s]
1687it [00:17, 94.42it/s] 
10889it [00:33, 327.95it/s]
1712it [00:00, 4488.22it/s]
809it [00:00, 2420.69it/s]
112it [00:00, 18020.64it/s]
144it [00:00, 15066.72it/s]
112it [00:00, 19968.63it/s]
30055it [00:18, 1630.20it/s]
21590it [00:18, 1149.06it/s]


In [18]:
rejected = [
    'markah untuk setiap satu',
    'soalan mesti dijawab dalam',
    '25 markah',
    '50 markah'
]

In [19]:
for f in research_papers:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                    
                data_lower = data.lower()
                if any([r in data_lower for r in rejected]):
                    continue
                    
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

787it [00:11, 69.17it/s] 
189419it [03:21, 937.88it/s] 


In [20]:
for f in online_articles:
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

14954it [00:00, 26596.62it/s]
752it [00:00, 5173.44it/s]
172it [00:00, 21097.86it/s]
20763it [00:01, 15136.84it/s]
640it [00:00, 10217.16it/s]
74it [00:00, 3622.57it/s]
10604it [00:05, 2083.53it/s]
1733it [00:00, 8832.22it/s]
33730it [00:01, 27926.00it/s]
144it [00:00, 7263.91it/s]
359it [00:00, 7618.37it/s]
87it [00:00, 11080.54it/s]
47it [00:00, 6861.07it/s]
1263it [00:00, 5838.91it/s]
9672it [00:00, 12799.40it/s]
5400it [00:03, 1482.92it/s]
4425it [00:00, 19778.11it/s]
269it [00:00, 9281.27it/s]
3645it [00:00, 10471.55it/s]
1589it [00:00, 7973.39it/s]
385it [00:00, 9406.52it/s]
17004it [00:09, 1785.65it/s]
176it [00:00, 6979.60it/s]
233it [00:00, 10064.81it/s]
48it [00:00, 8489.42it/s]
1427it [00:00, 8115.08it/s]
341it [00:00, 4144.17it/s]
1546it [00:00, 8975.51it/s]
46it [00:00, 12641.72it/s]
140it [00:00, 16995.24it/s]
11678it [00:04, 2602.81it/s]
12326it [00:04, 2558.34it/s]
24482it [00:07, 3158.51it/s] 
947it [00:00, 5592.69it/s]
39678991it [09:06, 72652.82it/s]
512it [00:00, 10