In [1]:
import json
import os
from glob import glob
from tqdm import tqdm

In [2]:
files = glob('/home/ubuntu/dedup-text-dataset/*.jsonl')
len(files)

250

In [3]:
def partition(text, size = 500):
    splitted = text.split()
    return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]

In [4]:
wiki = [
    'wikipedia-2023-10-01.jsonl',
    'wikipedia-jawi.jsonl',
]
language_related = [
    'dictionary.jsonl',
    'dewanbahasa-jdbp.jsonl',
    'dialect.jsonl',
    'kamusbm.jsonl',
]
gov_related = [
    'hansard.jsonl',
    'lom.agc.gov.my.jsonl',
    'parlimen-gov.jsonl',
    'data.gov.my.jsonl',
    'mufti_wilayah_articles.jsonl',
    'e-khutbah.jsonl',
    'mufti_negeri_sem_artikel.jsonl',
    'mufti_perlis_artikel.jsonl',
    'mufti_negeri_sem_artikel.jsonl',
    'gov.my.jsonl',
    'edu.my.jsonl',
]
research_papers = [
    'academia-edu.jsonl',
    'eprints.jsonl',
]
social_media = [
    'iium-confession.jsonl',
    'b.cari.com.my.jsonl',
    'semisupervised-whisper-large-v2.jsonl',
    'lowyat.jsonl',
    'malay-tweets.jsonl'
]
common_crawl = [
    'common-crawl.jsonl',
    'NLLB.jsonl',
]
buku_teks = [
    'buku-teks.jsonl',
    'bumigemilang.com.jsonl',
    'tcer.my.jsonl',
    'mysoalan.com.jsonl'
]

In [5]:
combine = set(wiki) | set(language_related) | set(gov_related) | set(research_papers) | set(common_crawl) | set(buku_teks)

In [6]:
combine = {os.path.join('/home/ubuntu/dedup-text-dataset', f) for f in combine}

In [9]:
online_articles = sorted(list(set(files) - combine))
len(online_articles)

226

In [10]:
a = open('combine-lm.jsonl', 'w')

In [11]:
for f in wiki:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

438316it [00:03, 111245.29it/s]
722837it [00:36, 19698.75it/s]


In [12]:
for f in language_related:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

54712it [00:01, 32659.31it/s]
4577it [00:00, 6534.38it/s]
66it [00:00, 6777.76it/s]
34192it [00:00, 47483.95it/s]


In [13]:
for f in buku_teks:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass


183it [00:01, 156.23it/s]
16120it [00:24, 667.16it/s] 
3625it [00:01, 2594.03it/s]
900it [00:00, 1924.33it/s]


In [14]:
for f in gov_related:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

140932it [00:17, 8244.93it/s] 
1359it [00:01, 1245.81it/s]
1687it [00:27, 61.99it/s] 
10889it [00:45, 240.46it/s]
1712it [00:00, 2135.28it/s]
809it [00:01, 787.46it/s] 
112it [00:00, 5375.28it/s]
144it [00:00, 7236.58it/s]
112it [00:00, 16045.98it/s]
30055it [01:26, 349.32it/s] 
21590it [00:24, 889.70it/s] 


In [15]:
rejected = [
    'markah untuk setiap satu',
    'soalan mesti dijawab dalam',
    '25 markah',
    '50 markah'
]

In [16]:
for f in research_papers:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                    
                data_lower = data.lower()
                if any([r in data_lower for r in rejected]):
                    continue
                    
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

787it [00:13, 58.82it/s] 
189419it [04:05, 772.71it/s] 


In [21]:
google_translate = glob('../google-translate-*/*.requested')
for f in google_translate:
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l)['r']['result'] + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

99962it [00:04, 23719.43it/s]
99972it [00:04, 22830.10it/s]
99959it [00:04, 22308.09it/s]
99967it [00:05, 18634.38it/s]
99968it [00:04, 21707.92it/s]
99966it [00:04, 22896.39it/s]
99968it [00:04, 23066.00it/s]
99971it [00:04, 23476.34it/s]
99968it [00:05, 19613.10it/s]
99966it [00:04, 23849.58it/s]
99971it [00:04, 24017.88it/s]
99962it [00:04, 23989.13it/s]
99958it [00:04, 24042.06it/s]
99968it [00:03, 25382.96it/s]
99972it [00:03, 25880.45it/s]
99967it [00:05, 18903.54it/s]
99959it [00:02, 36181.10it/s]
99960it [00:02, 34426.30it/s]
99963it [00:02, 35347.83it/s]
99966it [00:02, 33672.03it/s]
99972it [00:02, 35816.77it/s]
99965it [00:02, 35511.42it/s]
99968it [00:02, 38051.73it/s]
99966it [00:03, 29586.09it/s]
99965it [00:03, 32357.42it/s]
99968it [00:02, 33773.58it/s]
99969it [00:02, 33699.62it/s]
99981it [00:02, 33466.10it/s]
99974it [00:02, 34037.28it/s]
99967it [00:02, 36187.58it/s]
99970it [00:02, 36296.37it/s]
99972it [00:02, 37056.53it/s]
99969it [00:02, 35155.05it/s]


In [18]:
for f in online_articles:
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

14954it [00:01, 10894.68it/s]
172it [00:00, 18637.98it/s]
20763it [00:02, 8385.71it/s]
640it [00:00, 5550.44it/s]
74it [00:00, 1529.45it/s]
1733it [00:00, 6323.22it/s]
33730it [00:01, 17780.16it/s]
87it [00:00, 4190.35it/s]
47it [00:00, 3255.80it/s]
1263it [00:00, 3803.55it/s]
9672it [00:00, 11083.29it/s]
5400it [00:00, 8948.00it/s] 
4307116it [02:47, 25767.21it/s]
4425it [00:00, 13679.06it/s]
269it [00:00, 3357.79it/s]
3645it [00:00, 5728.68it/s]
1589it [00:00, 3652.49it/s]
385it [00:00, 8533.79it/s]
17004it [00:03, 5327.82it/s]
176it [00:00, 3251.67it/s]
233it [00:00, 4424.91it/s]
1427it [00:00, 3931.15it/s]
341it [00:00, 1927.18it/s]
1546it [00:00, 2611.89it/s]
46it [00:00, 3129.72it/s]
140it [00:00, 6238.61it/s]
11678it [00:01, 7692.74it/s]
14354834it [08:03, 29665.45it/s]
15257673it [06:04, 41888.58it/s]
12326it [00:01, 7624.93it/s]
24482it [00:03, 7180.34it/s]
947it [00:00, 3337.98it/s]
39678991it [13:31, 48890.93it/s]
62863it [02:33, 410.56it/s]
512it [00:00, 5580.47it/s]
3506it

In [32]:
madlad_ms = glob('../madlad-400-ms/*')
for f in madlad_ms:
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l)['text'] + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

5000000it [10:53, 7655.58it/s] 
4081851it [12:36, 5392.48it/s]
5000000it [15:07, 5509.55it/s]
