In [1]:
import json
import os
from glob import glob
from tqdm import tqdm

In [2]:
files = glob('/home/ubuntu/dedup-text-dataset/*.jsonl')
len(files)

243

In [3]:
def partition(text, size = 500):
    splitted = text.split()
    return [' '.join(splitted[i: i + size]) for i in range(0, len(splitted), size)]

In [4]:
wiki = [
    'wikipedia-2023-10-01.jsonl',
    'wikipedia-jawi.jsonl',
]
language_related = [
    'dictionary.jsonl',
    'dewanbahasa-jdbp.jsonl',
    'dialect.jsonl',
    'kamusbm.jsonl',
]
gov_related = [
    'hansard.jsonl',
    'lom.agc.gov.my.jsonl',
    'parlimen-gov.jsonl',
    'data.gov.my.jsonl',
    'mufti_wilayah_articles.jsonl',
    'e-khutbah.jsonl',
    'mufti_negeri_sem_artikel.jsonl',
    'mufti_perlis_artikel.jsonl',
    'mufti_negeri_sem_artikel.jsonl',
    'gov.my.jsonl',
    'edu.my.jsonl',
]
research_papers = [
    'academia-edu.jsonl',
    'eprints.jsonl',
]
social_media = [
    'iium-confession.jsonl',
    'b.cari.com.my.jsonl',
    'semisupervised-whisper-large-v2.jsonl',
    'lowyat.jsonl',
    'malay-tweets.jsonl'
]
common_crawl = [
    'common-crawl.jsonl',
    'NLLB.jsonl',
]
buku_teks = [
    'buku-teks.jsonl',
    'bumigemilang.com.jsonl',
    'tcer.my.jsonl',
    'mysoalan.com.jsonl'
]

In [5]:
combine = set(wiki) | set(language_related) | set(gov_related) | set(research_papers) | set(common_crawl) | set(buku_teks)

In [6]:
combine = {os.path.join('/home/ubuntu/dedup-text-dataset', f) for f in combine}

In [7]:
online_articles = sorted(list(set(files) - combine))
len(online_articles)

219

In [8]:
a = open('combine-lm.jsonl', 'w')

In [9]:
for f in wiki:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

438316it [00:01, 242214.18it/s]
722837it [00:23, 30409.45it/s]


In [10]:
for f in language_related:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

54712it [00:01, 47221.46it/s]
4577it [00:00, 11497.84it/s]
66it [00:00, 50877.42it/s]
34192it [00:00, 71939.05it/s]


In [11]:
for f in buku_teks:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

183it [00:00, 272.17it/s]
16120it [00:06, 2584.59it/s]
3625it [00:00, 4959.24it/s]
900it [00:00, 4491.31it/s]


In [12]:
for f in gov_related:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

140932it [00:11, 11979.35it/s]
1359it [00:00, 2768.80it/s]
1687it [00:09, 178.78it/s]
10889it [00:12, 899.68it/s]
1712it [00:00, 5081.95it/s]
809it [00:00, 2623.37it/s]
112it [00:00, 17942.86it/s]
144it [00:00, 14951.10it/s]
112it [00:00, 18241.06it/s]
30055it [00:13, 2179.53it/s]
21590it [00:10, 1969.79it/s]


In [13]:
rejected = [
    'markah untuk setiap satu',
    'soalan mesti dijawab dalam',
    '25 markah',
    '50 markah'
]

In [14]:
for f in research_papers:
    f = os.path.join('/home/ubuntu/dedup-text-dataset', f)
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                    
                data_lower = data.lower()
                if any([r in data_lower for r in rejected]):
                    continue
                    
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

787it [00:04, 162.83it/s]
189419it [02:05, 1507.11it/s]


In [15]:
google_translate = glob('google-translate-*/*.requested')
for f in google_translate:
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l)['r']['result'] + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

99962it [00:02, 43938.53it/s]
99972it [00:02, 44775.88it/s]
99959it [00:02, 43885.63it/s]
99967it [00:02, 44148.51it/s]
99968it [00:02, 43195.93it/s]
99966it [00:02, 41638.38it/s]
99968it [00:02, 43391.15it/s]
99971it [00:02, 44769.23it/s]
99968it [00:02, 39893.82it/s]
99966it [00:02, 43645.40it/s]
99971it [00:02, 44648.95it/s]
99962it [00:02, 34770.63it/s]
99958it [00:03, 31859.67it/s]
99968it [00:02, 46226.30it/s]
99972it [00:02, 45353.37it/s]
99967it [00:02, 45223.32it/s]
99959it [00:01, 63727.12it/s]
99960it [00:01, 63502.28it/s]
99963it [00:01, 62840.67it/s]
99966it [00:01, 62764.90it/s]
99972it [00:01, 64068.62it/s]
99965it [00:01, 63937.62it/s]
99968it [00:01, 60914.43it/s]
99966it [00:01, 62707.05it/s]
99965it [00:01, 63163.17it/s]
99968it [00:01, 62583.65it/s]
99969it [00:01, 54085.98it/s]
99981it [00:01, 62499.44it/s]
99974it [00:01, 63360.97it/s]
99967it [00:01, 59449.06it/s]
99970it [00:02, 40010.29it/s]
99972it [00:01, 59275.16it/s]
99969it [00:01, 65039.13it/s]


In [16]:
for f in online_articles:
    with open(f) as fopen:
        for l in tqdm(fopen):
            try:
                data = '<s>' + json.loads(l) + '</s>'
                partitioned = partition(data)
                for p in partitioned:
                    data = {
                        'text': p,
                    }
                    a.write(f'{json.dumps(data)}\n')
                    a.flush()
            except:
                pass

14954it [00:00, 22492.22it/s]
20763it [00:01, 14932.98it/s]
640it [00:00, 10236.29it/s]
74it [00:00, 3843.41it/s]
1733it [00:00, 11571.59it/s]
33730it [00:01, 24776.45it/s]
87it [00:00, 11716.69it/s]
47it [00:00, 9134.95it/s]
1263it [00:00, 6502.62it/s]
9672it [00:00, 13227.83it/s]
5400it [00:00, 15399.32it/s]
4307116it [01:24, 51216.28it/s]
4425it [00:00, 18768.99it/s]
269it [00:00, 10215.10it/s]
3645it [00:00, 11097.71it/s]
1589it [00:00, 8320.77it/s]
385it [00:00, 9429.64it/s]
17004it [00:02, 7203.14it/s]
176it [00:00, 6025.66it/s]
233it [00:00, 9475.30it/s]
1427it [00:00, 7472.67it/s]
341it [00:00, 4139.14it/s]
1546it [00:00, 8472.54it/s]
46it [00:00, 12869.40it/s]
140it [00:00, 15839.94it/s]
11678it [00:02, 4499.72it/s]
14354834it [04:19, 55248.35it/s]
15257673it [04:20, 58658.94it/s]
12326it [00:01, 11831.63it/s]
24482it [00:01, 13598.82it/s]
947it [00:04, 215.04it/s] 
39678991it [08:50, 74759.31it/s]
62863it [01:01, 1018.32it/s]
512it [00:00, 10542.49it/s]
3506it [00:00, 8793.02

In [1]:
from transformers import AutoTokenizer

In [2]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/llama-600m-hf-32768-fpf')

Downloading (…)okenizer_config.json:   0%|          | 0.00/725 [00:00<?, ?B/s]

Downloading tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/414 [00:00<?, ?B/s]

In [4]:
tokenizer.decode([13, 13,13])

'\n\n\n'