In [1]:
import glob
import json
import random
from typing import Dict, List

from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd

from streaming import MDSWriter, LocalDataset

### 1.0 Get num. of rows in News dataset

It's too big so we may just want to get a sample from the News dataset.

In [4]:
folders = sorted(glob.glob('reformatted_dataset/reformatted-*.jsonl'))
folders

['reformatted_dataset/reformatted-common-crawl-qa.jsonl',
 'reformatted_dataset/reformatted-facebook.jsonl',
 'reformatted_dataset/reformatted-hansard-qa.jsonl',
 'reformatted_dataset/reformatted-iium-confession.jsonl',
 'reformatted_dataset/reformatted-mining-b-cari-com-my.jsonl',
 'reformatted_dataset/reformatted-mining-summarization.jsonl',
 'reformatted_dataset/reformatted-news.jsonl',
 'reformatted_dataset/reformatted-twitter.jsonl',
 'reformatted_dataset/reformatted-wikipedia-qa.jsonl']

In [7]:
for no, f in tqdm(enumerate(folders)):
    line_count = 0
    with open(f) as fopen:
        for line in fopen:
            line = json.loads(line)
            if (line == {} or line['query'] == None or line['text'] == None or line['label'] == None 
                or line['query'] == "" or line['text'] == "" or line['label'] not in [0, 1]):
                continue
            line_count += 1
    
    print(f"Num. rows in {f.split('/')[-1]}: {line_count}")

0it [00:00, ?it/s]

Num. rows in reformatted-common-crawl-qa.jsonl: 418978
Num. rows in reformatted-facebook.jsonl: 1155825
Num. rows in reformatted-hansard-qa.jsonl: 254226
Num. rows in reformatted-iium-confession.jsonl: 2070294
Num. rows in reformatted-mining-b-cari-com-my.jsonl: 1360266
Num. rows in reformatted-mining-summarization.jsonl: 2739484
Num. rows in reformatted-news.jsonl: 16892990
Num. rows in reformatted-twitter.jsonl: 3754477
Num. rows in reformatted-wikipedia-qa.jsonl: 658042


As of now, 28 million rows takes 1000 hours. Hence, we'll need to resample everything accordingly. If we aim to train max 10 days, that's around 24 x 10, 240 hours. So around 7 million~ will do hopefully.

To achieve this, lets just take a maximum of 800k rows from each of the 9 datasets. That said, we **won't** need to resample from the following datasets (due to being smaller than 800k):
- `reformatted-common-crawl-qa.jsonl`
- `reformatted-hansard-qa.jsonl`
- `reformatted-wikipedia-qa.jsonl`

In [8]:
lc_common_crawl = 418978
lc_facebook = 1155825
lc_hansard = 254226
lc_iium = 2070294
lc_bcari = 1360266
lc_summarization = 2739484
lc_news = 16892990
lc_twitter = 3754477
lc_wikipedia = 658042

### Convert `.jsonl` data to mosaic/streaming format

In [9]:
folders = sorted(glob.glob('reformatted_dataset/reformatted-*.jsonl'))
folders

['reformatted_dataset/reformatted-common-crawl-qa.jsonl',
 'reformatted_dataset/reformatted-facebook.jsonl',
 'reformatted_dataset/reformatted-hansard-qa.jsonl',
 'reformatted_dataset/reformatted-iium-confession.jsonl',
 'reformatted_dataset/reformatted-mining-b-cari-com-my.jsonl',
 'reformatted_dataset/reformatted-mining-summarization.jsonl',
 'reformatted_dataset/reformatted-news.jsonl',
 'reformatted_dataset/reformatted-twitter.jsonl',
 'reformatted_dataset/reformatted-wikipedia-qa.jsonl']

In [10]:
# Source: https://docs.mosaicml.com/projects/streaming/en/latest/fundamentals/dataset_conversion_guide.html

columns = {
    'query': 'str',
    'text': 'str',
    'label': 'int8',
}

compression = 'zstd'
hashes = 'sha1', 'xxh64'

In [11]:
with MDSWriter(out='mosaic-dataset-resampled-7m', columns=columns, compression=None, hashes=hashes) as out:

    random.seed(42)

    for no, f in enumerate(folders):
        print(f"{no} {f.split('/')[-1]}")
        empty_rows = 0
        if f in [
            'reformatted_dataset/reformatted-common-crawl-qa.jsonl', 
            'reformatted_dataset/reformatted-hansard-qa.jsonl', 
            'reformatted_dataset/reformatted-wikipedia-qa.jsonl',
        ]:
            print('small')
            with open(f) as fopen:
                for data in tqdm(fopen):
                    data = json.loads(data)
                    if (data == {} or data['query'] == None or data['text'] == None or data['label'] == None 
                        or data['query'] == "" or data['text'] == "" or data['label'] not in [0, 1]):
                        empty_rows += 1
                        continue
                    out.write(data)
        else:
            print('too big')
            with open(f) as fopen:
                # Read all lines from the file
                lines = fopen.readlines()

                # Randomly sample 'sample_size' number of lines
                sample_size = 800000
                random_sample = random.sample(lines, sample_size)

                for data in tqdm(random_sample):
                    data = json.loads(data)
                    if (data == {} or data['query'] == None or data['text'] == None or data['label'] == None 
                        or data['query'] == "" or data['text'] == "" or data['label'] not in [0, 1]):
                        empty_rows += 1
                        continue
                    out.write(data)

        print("No. of empty rows: ", empty_rows)
        print("")

0 reformatted-common-crawl-qa.jsonl
small


0it [00:00, ?it/s]

No. of empty rows:  0

1 reformatted-facebook.jsonl
too big


  0%|          | 0/800000 [00:00<?, ?it/s]

No. of empty rows:  0

2 reformatted-hansard-qa.jsonl
small


0it [00:00, ?it/s]

No. of empty rows:  2

3 reformatted-iium-confession.jsonl
too big


  0%|          | 0/800000 [00:00<?, ?it/s]

No. of empty rows:  0

4 reformatted-mining-b-cari-com-my.jsonl
too big


  0%|          | 0/800000 [00:00<?, ?it/s]

No. of empty rows:  1595

5 reformatted-mining-summarization.jsonl
too big


  0%|          | 0/800000 [00:00<?, ?it/s]

No. of empty rows:  15610

6 reformatted-news.jsonl
too big


  0%|          | 0/800000 [00:00<?, ?it/s]

No. of empty rows:  0

7 reformatted-twitter.jsonl
too big


  0%|          | 0/800000 [00:09<?, ?it/s]

No. of empty rows:  0

8 reformatted-wikipedia-qa.jsonl
small


0it [00:00, ?it/s]

No. of empty rows:  0



In [12]:
# testing
test_mosaic_dataset = LocalDataset(local="mosaic-dataset-resampled-7m")
test_mosaic_dataset[2]

{'label': 1,
 'query': 'Bagaimana cara membeli produk Camay?',
 'text': 'Skip to content Home Product Info Menu Toggle Camay Repellent Repel Guard Plus+ Camay Foliar Fertilizer About Us Contact Us My account My Agent Menu Toggle Agents Listed Register Agent 0 Main Menu Home Product Info Menu Toggle Camay Repellent Repel Guard Plus+ Camay Foliar Fertilizer About Us Contact Us My account My Agent Menu Toggle Agents Listed Register Agent My Agent Nak beli produk Camay? Jom klik pada senarai ejen-ejen yang berdaftar dengan kami. Klik pada negeri pilihan anda,dan pilih ejen-ejen kegemaran anda mengikut kawasan pilihan anda. + Johor (35) Nama Lokasi Bandar Penawar (Klik nama untuk WhatsApp, klik lokasi untuk Google Map) LAILI BANDAR PENAWAR Bandar Penawar, Kota Tinggi (Klik nama untuk WhatsApp, klik lokasi untuk Google Map) NOR ALYDA BANDAR PENAWAR Batu Pahat (Klik nama untuk WhatsApp, klik lokasi untuk Google Map) KAMILAH BATU PAHAT NURAINI RENGIT NOR IZYANTI BATU PAHAT ZANITA BATU PAHAT Ba

In [13]:
test_mosaic_dataset.size

6114041