In [1]:
import glob
import json
import random
from typing import Dict, List

from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

from streaming import MDSWriter, LocalDataset

import warnings
warnings.filterwarnings("ignore")

### 1.0 Get num. of rows in News dataset

It's too big so we may just want to get a sample from the News dataset.

In [4]:
folders = sorted(glob.glob('reformatted_dataset/reformatted-*.jsonl'))
folders

['reformatted_dataset/reformatted-common-crawl-qa.jsonl',
 'reformatted_dataset/reformatted-facebook.jsonl',
 'reformatted_dataset/reformatted-hansard-qa.jsonl',
 'reformatted_dataset/reformatted-iium-confession.jsonl',
 'reformatted_dataset/reformatted-mining-b-cari-com-my.jsonl',
 'reformatted_dataset/reformatted-mining-summarization.jsonl',
 'reformatted_dataset/reformatted-news.jsonl',
 'reformatted_dataset/reformatted-twitter.jsonl',
 'reformatted_dataset/reformatted-wikipedia-qa.jsonl']

In [7]:
for no, f in tqdm(enumerate(folders)):
    line_count = 0
    with open(f) as fopen:
        for line in fopen:
            line = json.loads(line)
            if (line == {} or line['query'] == None or line['text'] == None or line['label'] == None 
                or line['query'] == "" or line['text'] == "" or line['label'] not in [0, 1]):
                continue
            line_count += 1
    
    print(f"Num. rows in {f.split('/')[-1]}: {line_count}")

0it [00:00, ?it/s]

Num. rows in reformatted-common-crawl-qa.jsonl: 418978
Num. rows in reformatted-facebook.jsonl: 1155825
Num. rows in reformatted-hansard-qa.jsonl: 254226
Num. rows in reformatted-iium-confession.jsonl: 2070294
Num. rows in reformatted-mining-b-cari-com-my.jsonl: 1360266
Num. rows in reformatted-mining-summarization.jsonl: 2739484
Num. rows in reformatted-news.jsonl: 16892990
Num. rows in reformatted-twitter.jsonl: 3754477
Num. rows in reformatted-wikipedia-qa.jsonl: 658042


As of now, 28 million rows takes 1000 hours. Hence, we'll need to resample everything accordingly. If we aim to train max 10 days, that's around 24 x 10, 240 hours. So around 7 million~ will do hopefully.

To achieve this, lets just take a maximum of 800k rows from each of the 9 datasets. That said, we **won't** need to resample from the following datasets (due to being smaller than 800k):
- `reformatted-common-crawl-qa.jsonl`
- `reformatted-hansard-qa.jsonl`
- `reformatted-wikipedia-qa.jsonl`

In [15]:
lc_common_crawl = 418978*0.10
lc_facebook = 1155825*0.10
lc_hansard = 254226*0.10
lc_iium = 2070294*0.10
lc_bcari = 1360266*0.10
lc_summarization = 2739484*0.10
lc_news = 16892990*0.10
lc_twitter = 3754477*0.10
lc_wikipedia = 658042*0.10

In [16]:
# size of dataset if sample 10% from each one
list_ds = [
    lc_common_crawl,
    lc_facebook,
    lc_hansard, 
    lc_iium,
    lc_bcari,
    lc_summarization,
    lc_news, 
    lc_twitter,
    lc_wikipedia
]

print(sum(list_ds))

2930458.2


### 2.0 Train-test-split & Stratified sampling

In [2]:
folders = sorted(glob.glob('reformatted_dataset/reformatted-*.jsonl'))
folders

['reformatted_dataset/reformatted-common-crawl-qa.jsonl',
 'reformatted_dataset/reformatted-facebook.jsonl',
 'reformatted_dataset/reformatted-hansard-qa.jsonl',
 'reformatted_dataset/reformatted-iium-confession.jsonl',
 'reformatted_dataset/reformatted-mining-b-cari-com-my.jsonl',
 'reformatted_dataset/reformatted-mining-summarization.jsonl',
 'reformatted_dataset/reformatted-news.jsonl',
 'reformatted_dataset/reformatted-twitter.jsonl',
 'reformatted_dataset/reformatted-wikipedia-qa.jsonl']

In [3]:
def tt_split_stratified(df_list: List, name_of_file: str) -> None:
    """
    Function to perform a stratified train-test-split and output
    the value counts for the "label" column.
    
    ### Arguments
    - `df_list`: A list of processed dictionaries with the keys 'text', 'query' and 'label'.
    - `name_of_file`: The name of the file.

    ### Returns
    Doesn't return anything.
    """
    print(name_of_file)
    df = pd.DataFrame(df_list)
    
    # train-test-split & stratified sampling
    X_train, X_test, y_train, y_test = train_test_split(
                    df[['query', 'text']],
                    df['label'],
                    random_state=42,
                    test_size=0.05, # small since Husein perlukan small je
                    stratify=df["label"],
                )

    X_train['label'] = y_train
    X_train['split'] = 'train'

    X_test['label'] = y_test
    X_test['split'] = 'test'

    X_train.to_json(f"train/train-{name_of_file.split('/')[1].split('.')[0]}.jsonl", orient='records', lines=True)
    X_test.to_json(f"test/test-{name_of_file.split('/')[1].split('.')[0]}.jsonl", orient='records', lines=True)

    print(df.label.value_counts())
    print("")

In [4]:
for no, f in enumerate(folders):
    print(f"{no} {f.split('/')[-1]}")
    empty_rows = 0
    if f in [
        'reformatted_dataset/reformatted-common-crawl-qa.jsonl', 
        'reformatted_dataset/reformatted-hansard-qa.jsonl', 
        'reformatted_dataset/reformatted-wikipedia-qa.jsonl',
    ]:
        print('small')
        data_list = []
        with open(f) as fopen:
            for data in tqdm(fopen):
                data = json.loads(data)
                if (data == {} or data['query'] == None or data['text'] == None or data['label'] == None 
                    or data['query'] == "" or data['text'] == "" or data['label'] not in [0, 1]):
                    empty_rows += 1
                    continue
                data_list.append(data)

        tt_split_stratified(df_list=data_list, name_of_file=f)

    else:
        print('too big')
        data_list = []
        with open(f) as fopen:
            # Read all lines from the file
            lines = fopen.readlines()

            # Randomly sample 'sample_size' number of lines due to the following datasets being super big
            sample_size = 900000
            random_sample = random.sample(lines, sample_size)

            for data in tqdm(random_sample):
                data = json.loads(data)
                if (data == {} or data['query'] == None or data['text'] == None or data['label'] == None 
                    or data['query'] == "" or data['text'] == "" or data['label'] not in [0, 1]):
                    empty_rows += 1
                    continue
                data_list.append(data)

        tt_split_stratified(df_list=data_list, name_of_file=f)

0 reformatted-common-crawl-qa.jsonl
small


0it [00:00, ?it/s]

reformatted_dataset/reformatted-common-crawl-qa.jsonl
label
1    209489
0    209489
Name: count, dtype: int64

1 reformatted-facebook.jsonl
too big


  0%|          | 0/900000 [00:00<?, ?it/s]

reformatted_dataset/reformatted-facebook.jsonl
label
0    484100
1    415900
Name: count, dtype: int64

2 reformatted-hansard-qa.jsonl
small


0it [00:00, ?it/s]

reformatted_dataset/reformatted-hansard-qa.jsonl
label
1    127113
0    127113
Name: count, dtype: int64

3 reformatted-iium-confession.jsonl
too big


  0%|          | 0/900000 [00:00<?, ?it/s]

reformatted_dataset/reformatted-iium-confession.jsonl
label
1    585142
0    314858
Name: count, dtype: int64

4 reformatted-mining-b-cari-com-my.jsonl
too big


  0%|          | 0/900000 [00:00<?, ?it/s]

reformatted_dataset/reformatted-mining-b-cari-com-my.jsonl
label
1    485346
0    412845
Name: count, dtype: int64

5 reformatted-mining-summarization.jsonl
too big


  0%|          | 0/900000 [00:00<?, ?it/s]

reformatted_dataset/reformatted-mining-summarization.jsonl
label
0    732259
1    150183
Name: count, dtype: int64

6 reformatted-news.jsonl
too big


  0%|          | 0/900000 [00:00<?, ?it/s]

reformatted_dataset/reformatted-news.jsonl
label
0    748917
1    151083
Name: count, dtype: int64

7 reformatted-twitter.jsonl
too big


  0%|          | 0/900000 [00:08<?, ?it/s]

reformatted_dataset/reformatted-twitter.jsonl
label
1    519796
0    380204
Name: count, dtype: int64

8 reformatted-wikipedia-qa.jsonl
small


0it [00:00, ?it/s]

reformatted_dataset/reformatted-wikipedia-qa.jsonl
label
1    329021
0    329021
Name: count, dtype: int64



### 3.0 Convert train split `.jsonl` data to mosaic/streaming format

In [5]:
folders = sorted(glob.glob('train/train-reformatted-*.jsonl'))
folders

['train/train-reformatted-common-crawl-qa.jsonl',
 'train/train-reformatted-facebook.jsonl',
 'train/train-reformatted-hansard-qa.jsonl',
 'train/train-reformatted-iium-confession.jsonl',
 'train/train-reformatted-mining-b-cari-com-my.jsonl',
 'train/train-reformatted-mining-summarization.jsonl',
 'train/train-reformatted-news.jsonl',
 'train/train-reformatted-twitter.jsonl',
 'train/train-reformatted-wikipedia-qa.jsonl']

In [6]:
# Source: https://docs.mosaicml.com/projects/streaming/en/latest/fundamentals/dataset_conversion_guide.html

columns = {
    'query': 'str',
    'text': 'str',
    'label': 'int8',
}

compression = 'zstd'
hashes = 'sha1', 'xxh64'

In [7]:
with MDSWriter(out='mosaic-dataset-resampled-7m-train-split', columns=columns, compression=None, hashes=hashes) as out:
    for no, f in enumerate(folders):
        print(f"{no}. {f.split('/')[-1]}")
        count = 0
        with open(f) as fopen:
            for data in tqdm(fopen):
                data = json.loads(data)
                out.write(data)
                count += 1

        print(f"Count: {count}")
        print("")


0. train-reformatted-common-crawl-qa.jsonl


0it [00:00, ?it/s]

Count: 398029

1. train-reformatted-facebook.jsonl


0it [00:00, ?it/s]

Count: 855000

2. train-reformatted-hansard-qa.jsonl


0it [00:00, ?it/s]

Count: 241514

3. train-reformatted-iium-confession.jsonl


0it [00:00, ?it/s]

Count: 855000

4. train-reformatted-mining-b-cari-com-my.jsonl


0it [00:00, ?it/s]

Count: 853281

5. train-reformatted-mining-summarization.jsonl


0it [00:00, ?it/s]

Count: 838319

6. train-reformatted-news.jsonl


0it [00:00, ?it/s]

Count: 855000

7. train-reformatted-twitter.jsonl


0it [00:00, ?it/s]

Count: 855000

8. train-reformatted-wikipedia-qa.jsonl


0it [00:00, ?it/s]

Count: 625139



In [10]:
# testing
test_mosaic_dataset = LocalDataset(local="mosaic-dataset-resampled-7m-train-split")

print(test_mosaic_dataset[153434])

{'label': 1, 'query': 'Apakah rancangan kerajaan berkaitan dengan AWAS/AES?', 'text': 'Contact us 1300 30 4227 | English Bahasa Malaysia 中文 Sell Sort Search Home Buy Cars for sale Used Cars New Cars Recon Cars Kereta Murah Hot Deals CARSOME Certified Certified Pre-owned BMW Premium Selection Sell Sell Your Car Create Ad FREE How to sell your car New Cars New Car Deals NEW New Car Price List Finance Car Loan Car Insurance Car Insurance Comparison Extended Warranty News All News Auto News Insights Reviews Buying Guide Electric Vehicle (EV) Photos Videos Events Event Services Hire us! Become a Partner Upcoming Events Car of The Year 2022 Past Events View All Past Events Sign In Login Login as Dealer / Agent or Register Register Register as Dealer / Agent Sell your car EN EN English BM Bahasa Malaysia CN 中文 Home Buy Cars for sale Used Cars New Cars Recon Cars Kereta Murah Hot Deals CARSOME Certified Certified Pre-owned BMW Premium Selection Sell Sell Your Car Create Ad FREE How to sell you

In [9]:
# num. rows in the dataset
test_mosaic_dataset.size

6376282