In [28]:
import glob
import json
from typing import Dict, List

from tqdm.notebook import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

import numpy as np
import pandas as pd

from streaming import MDSWriter, LocalDataset

### 1.0 Reformat to "str left is query, str right is neg / pos, label 0 / 1"

**NOTE:** 1.1 - 1.6 are from the [title-context-pair](https://huggingface.co/datasets/mesolitica/title-context-pair) dataset, while 1.71 - 1.73 are from [chat-gpt-open-qa](https://huggingface.co/datasets/mesolitica/title-context-pair). Both datasets are from [Mesolitica](https://huggingface.co/mesolitica).

In [2]:
def unpack_dict(data: Dict) -> List:
    """
    General version of the function to reformat .jsonl data 
    into `query`, `text`, `label` format.
    """

    final_data_list = []

    if data.get('pos', None):
        for sub_pos in data['pos']:
            pos_row_dict = {}

            pos_row_dict['query'] = data['query']
            pos_row_dict['text'] = sub_pos
            pos_row_dict['label'] = 1
        
            final_data_list.append(pos_row_dict)

    if data.get('negs', None):
        for sub_neg in data['negs']:
            neg_row_dict = {}

            neg_row_dict['query'] = data['query']
            neg_row_dict['text'] = sub_neg
            neg_row_dict['label'] = 0
        
            final_data_list.append(neg_row_dict)

    return final_data_list

#### 1.1 b-cari-com-my

In [1]:
# !wget https://huggingface.co/datasets/mesolitica/title-context-pair/resolve/main/mining-b.cari.com.my.jsonl?download=true

In [3]:
data_list = []
with open('mining-b-cari-com-my.jsonl') as fopen:
    for data in tqdm(fopen):
        data_list.append(json.loads(data))

0it [00:00, ?it/s]

In [4]:
data_list[0]

{'negs': ['gigih sungguh yip sampai tepek gambo bagi contoh',
  'Psl laptop ko slo tu mungkin sbb byk startup  program kot ,gune ccleaner blh tuk disable',
  'sedih pakai umobile..blk kg dpt E je..bukak fb pun tak lepas  skrg dah pakai onexox',
  'tq @reeny sudi tag ida..  silalah sampaikan apa2 ucapan pd yg istimewa..',
  'konpem..kalu ader yg masuk..dier pun kene buat promo cam  ko jugak kan..? 4k seminggu konon..'],
 'pos': ['Saya tak tahu, mungkin awak patut tanya TT. Adakah BIL, SIL, dan TT tinggal bersama-sama dengan MIL?',
  "I don't know, maybe you should ask TT. Are BIL, SIL, and TT living together with MIL?"],
 'query': 'erk nak tanya TT la. BIL, SIL and TT dok sekali ker dgn MIL?\nidadzira Post at 12-1-2010 10:55 AM  Tu satu hal. aku pun pening. BIL = Bila, MIL = ? Bila jg kot..'}

In [5]:
# testing
unpack_dict(data_list[0])

[{'query': 'erk nak tanya TT la. BIL, SIL and TT dok sekali ker dgn MIL?\nidadzira Post at 12-1-2010 10:55 AM  Tu satu hal. aku pun pening. BIL = Bila, MIL = ? Bila jg kot..',
  'text': 'Saya tak tahu, mungkin awak patut tanya TT. Adakah BIL, SIL, dan TT tinggal bersama-sama dengan MIL?',
  'label': 1},
 {'query': 'erk nak tanya TT la. BIL, SIL and TT dok sekali ker dgn MIL?\nidadzira Post at 12-1-2010 10:55 AM  Tu satu hal. aku pun pening. BIL = Bila, MIL = ? Bila jg kot..',
  'text': "I don't know, maybe you should ask TT. Are BIL, SIL, and TT living together with MIL?",
  'label': 1},
 {'query': 'erk nak tanya TT la. BIL, SIL and TT dok sekali ker dgn MIL?\nidadzira Post at 12-1-2010 10:55 AM  Tu satu hal. aku pun pening. BIL = Bila, MIL = ? Bila jg kot..',
  'text': 'gigih sungguh yip sampai tepek gambo bagi contoh',
  'label': 0},
 {'query': 'erk nak tanya TT la. BIL, SIL and TT dok sekali ker dgn MIL?\nidadzira Post at 12-1-2010 10:55 AM  Tu satu hal. aku pun pening. BIL = Bila, 

In [6]:
# Save file
max_worker = 10
result_list = []

with ThreadPoolExecutor(max_workers=max_worker) as executor:
    for _ in tqdm(range(0, len(data_list), max_worker)):
        futures = {executor.submit(unpack_dict, t): t for t in data_list[_: _ + max_worker]}

        for future in as_completed(futures):
            result = future.result()
            if result:
                result_list.extend(result)

  0%|          | 0/12552 [00:00<?, ?it/s]

In [9]:
dataset_name = "reformatted_dataset/reformatted-mining-b-cari-com-my.jsonl"

# with open(dataset_name, 'a') as final:
#     json.dump(result_list, final)
#     final.write('\n')

with open(dataset_name, 'a') as final:
    for item in tqdm(result_list):
        json.dump(item, final)
        final.write('\n')

  0%|          | 0/1362998 [00:00<?, ?it/s]

In [2]:
test_bcari = pd.read_json('reformatted_dataset/reformatted-mining-b-cari-com-my.jsonl', lines=True)

test_bcari.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1362998 entries, 0 to 1362997
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   query   1362998 non-null  object
 1   text    1362998 non-null  object
 2   label   1362998 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 31.2+ MB


#### 1.2 facebook

In [12]:
# !cd original_dataset && wget https://huggingface.co/datasets/mesolitica/title-context-pair/resolve/main/mining-facebook.jsonl

In [15]:
data_list = []
with open('original_dataset/mining-facebook.jsonl') as fopen:
    for data in tqdm(fopen):
        data_list.append(json.loads(data))

0it [00:00, ?it/s]

In [16]:
# Save file
max_worker = 10
result_list = []

with ThreadPoolExecutor(max_workers=max_worker) as executor:
    for _ in tqdm(range(0, len(data_list), max_worker)):
        futures = {executor.submit(unpack_dict, t): t for t in data_list[_: _ + max_worker]}

        for future in as_completed(futures):
            result = future.result()
            if result:
                result_list.extend(result)

  0%|          | 0/12463 [00:00<?, ?it/s]

In [17]:
dataset_name = "reformatted_dataset/reformatted-facebook.jsonl"

with open(dataset_name, 'a') as final:
    for item in tqdm(result_list):
        json.dump(item, final)
        final.write('\n')

  0%|          | 0/1155825 [00:00<?, ?it/s]

In [3]:
test_facebook = pd.read_json('reformatted_dataset/reformatted-facebook.jsonl', lines=True)

test_facebook.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1155825 entries, 0 to 1155824
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   query   1155825 non-null  object
 1   text    1155825 non-null  object
 2   label   1155825 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 26.5+ MB


#### 1.3 iium-confession

In [20]:
# !cd original_dataset && wget https://huggingface.co/datasets/mesolitica/title-context-pair/resolve/main/mining-iium-confession.jsonl

In [21]:
data_list = []
with open('original_dataset/mining-iium-confession.jsonl') as fopen:
    for data in tqdm(fopen):
        data_list.append(json.loads(data))

0it [00:00, ?it/s]

In [22]:
# Save file
max_worker = 10
result_list = []

with ThreadPoolExecutor(max_workers=max_worker) as executor:
    for _ in tqdm(range(0, len(data_list), max_worker)):
        futures = {executor.submit(unpack_dict, t): t for t in data_list[_: _ + max_worker]}

        for future in as_completed(futures):
            result = future.result()
            if result:
                result_list.extend(result)

  0%|          | 0/15083 [00:00<?, ?it/s]

In [24]:
dataset_name = "reformatted_dataset/reformatted-iium-confession.jsonl"

with open(dataset_name, 'a') as final:
    for item in tqdm(result_list):
        json.dump(item, final)
        final.write('\n')

  0%|          | 0/2070294 [00:00<?, ?it/s]

In [6]:
test_iium = pd.read_json('reformatted_dataset/reformatted-iium-confession.jsonl', lines=True)

test_iium.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2070294 entries, 0 to 2070293
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   query   2070294 non-null  object
 1   text    2070294 non-null  object
 2   label   2070294 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 47.4+ MB


#### 1.4 twitter

In [26]:
# !cd original_dataset && wget https://huggingface.co/datasets/mesolitica/title-context-pair/resolve/main/mining-twitter.jsonl

In [27]:
data_list = []
with open('original_dataset/mining-twitter.jsonl') as fopen:
    for data in tqdm(fopen):
        data_list.append(json.loads(data))

0it [00:00, ?it/s]

In [28]:
# Save file
max_worker = 10
result_list = []

with ThreadPoolExecutor(max_workers=max_worker) as executor:
    for _ in tqdm(range(0, len(data_list), max_worker)):
        futures = {executor.submit(unpack_dict, t): t for t in data_list[_: _ + max_worker]}

        for future in as_completed(futures):
            result = future.result()
            if result:
                result_list.extend(result)

  0%|          | 0/31661 [00:00<?, ?it/s]

In [30]:
dataset_name = "reformatted_dataset/reformatted-twitter.jsonl"

with open(dataset_name, 'a') as final:
    for item in tqdm(result_list):
        json.dump(item, final)
        final.write('\n')

  0%|          | 0/3754478 [00:00<?, ?it/s]

In [7]:
test_twitter = pd.read_json('reformatted_dataset/reformatted-twitter.jsonl', lines=True)

test_twitter.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3754478 entries, 0 to 3754477
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   query   3754478 non-null  object
 1   text    3754478 non-null  object
 2   label   3754478 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 85.9+ MB


#### 1.5 news

In [39]:
# !cd original_dataset && wget https://huggingface.co/datasets/mesolitica/title-context-pair/resolve/main/mining-news.jsonl

Will not apply HSTS. The HSTS database must be a regular and non-world-writable file.
ERROR: could not open HSTS store at '/home/ubuntu/.wget-hsts'. HSTS will be disabled.
--2024-02-19 12:24:41--  https://huggingface.co/datasets/mesolitica/title-context-pair/resolve/main/mining-news.jsonl
Resolving huggingface.co (huggingface.co)... 18.238.49.10, 18.238.49.70, 18.238.49.112, ...
Connecting to huggingface.co (huggingface.co)|18.238.49.10|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://cdn-lfs-us-1.huggingface.co/repos/bd/b3/bdb361c75adf0da2f355354377af47fe74b09db79b788061463ceb4a7c8a25e8/9b4b41eea845da335efd28614987a4a799e05dedb395052e02ee456210168a62?response-content-disposition=attachment%3B+filename*%3DUTF-8%27%27mining-news.jsonl%3B+filename%3D%22mining-news.jsonl%22%3B&Expires=1708604682&Policy=eyJTdGF0ZW1lbnQiOlt7IkNvbmRpdGlvbiI6eyJEYXRlTGVzc1RoYW4iOnsiQVdTOkVwb2NoVGltZSI6MTcwODYwNDY4Mn19LCJSZXNvdXJjZSI6Imh0dHBzOi8vY2RuLWxmcy11cy0xLmh1Z2dpbmd

In [40]:
data_list = []
with open('original_dataset/mining-news.jsonl') as fopen:
    for data in tqdm(fopen):
        data_list.append(json.loads(data))

0it [00:00, ?it/s]

In [44]:
data_list[0].keys()

dict_keys(['title', 'body', 'negs'])

In [46]:
data_list[1]

{'title': 'KUCHING: Pasukan Gerakan Am (PGA) Briged Sarawak merampas pelbagai jenis rokok seludup dianggarkan bernilai RM2,104,506.40 dalam serbuan Ops Libas ke sebuah rumah di Tabuan Laru, hari ini.',
 'body': 'Komandernya SAC Mancha Ata berkata, kejayaan mencatatkan rampasan rokok terbesar tahun ini adalah hasil maklumat dan risikan sehingga membawa kepada serbuan ke atas premis yang dijadikan stor penyimpanan rokok belum lulus cukai Kastam sekitar pada jam 9.30 pagi.\xa0\n“Hasil pemeriksaan lanjut, pihak kami menemui sebanyak 9,500 karton (1,900,000 batang) rokok putih dan sejumlah 4,217 karton (694,520 batang) rokok kretek yang disimpan di dalam rumah serta dalam muatan sebuah van yang diletakkan di hadapan rumah berkenaan.\nMenurutnya juga, tangkapan dan rampasan diserahkan kepada pihak Ibu Pejabat Daerah Kuching untuk tindakan dan siasatan lanjut di bawah Seksyen 135 (1) (d) Akta Kastam 1967.\nTambah beliau juga, sepanjang pengoperasian yang dilaksanakan sejak awal tahun sehingga

In [50]:
def unpack_news(data: Dict) -> List:

    final_data_list = []

    pos_row_dict = {}

    pos_row_dict['query'] = data['title']
    pos_row_dict['text'] = data['body']
    pos_row_dict['label'] = 1

    final_data_list.append(pos_row_dict)

    if data.get('negs', None):
        for sub_neg in data['negs']:
            neg_row_dict = {}

            neg_row_dict['query'] = data['title']
            neg_row_dict['text'] = sub_neg
            neg_row_dict['label'] = 0
        
            final_data_list.append(neg_row_dict)

    return final_data_list

In [51]:
unpack_news(data_list[1])

[{'query': 'KUCHING: Pasukan Gerakan Am (PGA) Briged Sarawak merampas pelbagai jenis rokok seludup dianggarkan bernilai RM2,104,506.40 dalam serbuan Ops Libas ke sebuah rumah di Tabuan Laru, hari ini.',
  'text': 'Komandernya SAC Mancha Ata berkata, kejayaan mencatatkan rampasan rokok terbesar tahun ini adalah hasil maklumat dan risikan sehingga membawa kepada serbuan ke atas premis yang dijadikan stor penyimpanan rokok belum lulus cukai Kastam sekitar pada jam 9.30 pagi.\xa0\n“Hasil pemeriksaan lanjut, pihak kami menemui sebanyak 9,500 karton (1,900,000 batang) rokok putih dan sejumlah 4,217 karton (694,520 batang) rokok kretek yang disimpan di dalam rumah serta dalam muatan sebuah van yang diletakkan di hadapan rumah berkenaan.\nMenurutnya juga, tangkapan dan rampasan diserahkan kepada pihak Ibu Pejabat Daerah Kuching untuk tindakan dan siasatan lanjut di bawah Seksyen 135 (1) (d) Akta Kastam 1967.\nTambah beliau juga, sepanjang pengoperasian yang dilaksanakan sejak awal tahun sehing

In [52]:
# Save file
max_worker = 20

with ThreadPoolExecutor(max_workers=max_worker) as executor:
    for _ in tqdm(range(0, len(data_list), max_worker)):
        futures = {executor.submit(unpack_news, t): t for t in data_list[_: _ + max_worker]}

        for future in as_completed(futures):
            result = future.result()
            if result:
                for r in result:
                    with open('reformatted_dataset/reformatted-news.jsonl', 'a') as final:
                            json.dump(r, final)
                            final.write('\n')

  0%|          | 0/141900 [00:00<?, ?it/s]

#### 1.6 summarization

In [33]:
# !cd original_dataset && wget https://huggingface.co/datasets/mesolitica/title-context-pair/resolve/main/mining-summarization.jsonl

In [56]:
data_list = []
with open('original_dataset/mining-summarization.jsonl') as fopen:
    for data in tqdm(fopen):
        data_list.append(json.loads(data))

0it [00:00, ?it/s]

In [57]:
data_list[0].keys()

dict_keys(['text', 'summary', 'summary_ms', 'negs'])

In [58]:
data_list[0]

{'text': 'doktor.jika saya ada melakukan seks oral buat pertama kali tidak menggunakan kondom.adakah kemungkinan saya dijangkiti penyakit gonorhea dan chlamydia?\n\nHai dan selamat petang untuk anda. Terima kasih atas soalan anda. Jika anda baru melakukan seks oral buat pertama kali, kebarangkalian untuk anda terkena jangkitan penyakit kelamin seperti gonorrhoea dan chlamydia adalah sedikit.\nHal ini kerana hanya pasangan yang mempunyai jangkitan bakteria tersebut sahaja yang boleh menyebabkan anda terkena jangkitan kuman tersebut jika melakukan hubungan intim sama ada secara oral, vaginal atau melalui dubur.\nLangkah-langkah untuk mencegah penyakit kelamin adalah sangat penting untuk kita amalkan dalam kehidupan seharian terutamanya hal yang berkaitan dengan alat kemaluan kita. Pertama sekali, setia dengan pasangan anda. Pesanan ini sangat penting kerana mempunyai pasangan yang satu dapat mengurangkan risiko untuk dijangkiti bakteria-bakteria yang akan menyebabkan penyakit kelamin. Ji

Per Husein, we need to format the data such that it looks like the following (the top row is the dictionary key for the transformed dataset):

| QUERY   | TEXT | LABEL        |
|---------|-----|--------------|
| summary | text | positive / 1 |
| summary_ms | text | positive / 1 |
| summary | negs | negative / 0 |
| summary_ms | negs | negative / 0 |

In [55]:
# we need to transform it to this format:
# EXAMPLE from news
unpack_news(data_list[0])

[{'query': 'Education Ministry urges NGOs to help children without documents',
  'text': ' JOHOR BARU, April 13 — The Education Ministry has called on all non-governmental organisations (NGOs) to help children without identification documents to register with the National Registration Department (NRD) so they can enrol at government schools.\n Its minister, Maszlee Malik said the efforts could be implemented by the NGOs, in collaboration with the National Registration Department (JPN), District Education Office (PPD) and Social Welfare Department (SWD).\n “We find that some of our citizens are without\xa0documents for some reasons which is ultimately preventing them from attending school, so\xa0we hope the NGOs to work with JPN, SWD, PPD to register these children because in the end we want them to be educated.\n “This is in line with the ministry’s motto, education for all,” said Maszlee in a press conference after officiating the Edulife programme organised by Johor Empowerment of In

In [None]:
def process_entry(entry):
    data = []
    
    text = entry["text"]
    summary = entry["summary"]
    summary_ms = entry["summary_ms"]
    negs = entry["negs"]
    
    
    if summary is not None and text is not None:
        first_row = {
            "query": summary,
            "text": text,
            "label": 1
        }
        data.append(first_row)

        
    if summary_ms is not None and text is not None:
        second_row = {
            "query": summary_ms,
            "text": text,
            "label": 1
        }
        data.append(second_row)
    
    
    for neg in negs:
        if summary is not None and neg is not None:
            third_row = {
                "query": summary,
                "text": neg,
                "label": 0
            }
            data.append(third_row)
        
        
        if summary_ms is not None and neg is not None:
            fourth_row = {
                "query": summary_ms,
                "text": neg,
                "label": 0
            }
            data.append(fourth_row)

        
    return data

In [1]:
# Save file
max_worker = 20

with ThreadPoolExecutor(max_workers=max_worker) as executor:
    for _ in tqdm(range(0, len(data_list), max_worker)):
        futures = {executor.submit(unpack_summarization, t): t for t in data_list[_: _ + max_worker]}

        for future in as_completed(futures):
            result = future.result()
            if result:
                for r in result:
                    with open('reformatted_dataset/reformatted-mining-summarization.jsonl', 'a') as final:
                            json.dump(r, final)
                            final.write('\n')

In [8]:
test_summarization = pd.read_json('reformatted_dataset/reformatted-mining-summarization.jsonl', lines=True)

test_summarization.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2793537 entries, 0 to 2793536
Data columns (total 3 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   query   2793537 non-null  object
 1   text    2793537 non-null  object
 2   label   2793537 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 63.9+ MB


#### 1.7 Load datasets from `chatgpt-malaysian-open-qa`

##### 1.7.0 Exploratory Analysis

In [10]:
# !cd original_dataset && wget https://huggingface.co/datasets/mesolitica/chatgpt-malaysian-open-qa/resolve/main/common-crawl-qa.jsonl
# !cd original_dataset && wget https://huggingface.co/datasets/mesolitica/chatgpt-malaysian-open-qa/resolve/main/hansard-qa.jsonl
# !cd original_dataset && wget https://huggingface.co/datasets/mesolitica/chatgpt-malaysian-open-qa/resolve/main/wikipedia-qa.jsonl

In [2]:
data_common_crawl = []
with open('original_dataset/common-crawl-qa.jsonl') as fopen:
    for data in tqdm(fopen):
        data_common_crawl.append(json.loads(data))

print("Length of Common Crawl QA: ", len(data_common_crawl))
data_common_crawl[0].keys()

0it [00:00, ?it/s]

Length of Common Crawl QA:  69829


dict_keys(['paragraph', 'qa'])

In [3]:
data_common_crawl[3]

{'paragraph': 'Home About Us Products Project Project List Project Photo Policy Video Contact Us Home About Us Products Project Project List Project Photo Policy Video Contact Us Products Search Manual Products Outdoor Roller Blinds Roller Blinds Roller Blinds System Smooth Curtain tracks Zebra Blinds View All Motorized Products Dooya Roller Blinds Motor View All Camoor Blinds Sdn Bhd 38, Jalan Mivo 1, Taman Perindustrian Desa Aman, 52200 Kuala Lumpur, Malaysia. +6019-229 8138 +603-2178 4888 enquiry@camoor.my +60192298138 Give us a call to find out more about the blinds that we offer. For information on becoming part of Camoor Blinds dealer network, PLEASE CONTACT US. About Us Camoor Blinds Sdn Bhd is a window blinds manufacturer and our business covers the entire Malaysia and Singapore. Our main office is located in Kuala Lumpur, Malaysia. Our business model is B2B (business to business). People centric approach has been always our core value. Because in Camoor Blinds , we believe hav

In [4]:
data_hansard = []
with open('original_dataset/hansard-qa.jsonl') as fopen:
    for data in tqdm(fopen):
        data_hansard.append(json.loads(data))

print("Length of Hansard QA: ", len(data_hansard))
data_hansard[0].keys()

0it [00:00, ?it/s]

Length of Hansard QA:  42368


dict_keys(['paragraph', 'qa'])

In [5]:
data_hansard[0]

{'paragraph': {'original': '\n22                                                            DR.30.10.2018                                                                                                                          \n\n \ndan sebagainya. Cukai-cukai yang didapati daripada minuman keras, alkohol dan sebagainya \nmasuk kepada kerajaan, kerajaan bayar gaji kita balik, itu pun haram pada kita.  [Disampuk]  \n\nTimbalan Yang di-Pertua [Dato’ Mohd Rashid Hasnon]: Yang Berhormat Padang \nSerai, sila ajukan soalan. \n\nTuan Karupaiya Mutusami [Padang Serai]: Saya bertanya soalan pada pihak \nkementerian, bolehkah samsu dihapuskan terus, tidak mahu ada samsu dalam negara kita. \n[Dewan tepuk] Terima kasih. \n\nTuan Che Alias bin Hamid [Kemaman]: Sokong, sokong. \n\n■1150 \n\nTuan Chong Chieng Jen: Itu satu isu atau soalan yang begitu serius dan effect-\nnya yang sangat luas. Saya rasa ini perlu dibawa ke dalam Jemaah Menteri untuk \ndipertimbangkan dan diputuskan. Sekian, terima kas

In [6]:
print(data_hansard[0]['paragraph']['original'])


22                                                            DR.30.10.2018                                                                                                                          

 
dan sebagainya. Cukai-cukai yang didapati daripada minuman keras, alkohol dan sebagainya 
masuk kepada kerajaan, kerajaan bayar gaji kita balik, itu pun haram pada kita.  [Disampuk]  

Timbalan Yang di-Pertua [Dato’ Mohd Rashid Hasnon]: Yang Berhormat Padang 
Serai, sila ajukan soalan. 

Tuan Karupaiya Mutusami [Padang Serai]: Saya bertanya soalan pada pihak 
kementerian, bolehkah samsu dihapuskan terus, tidak mahu ada samsu dalam negara kita. 
[Dewan tepuk] Terima kasih. 

Tuan Che Alias bin Hamid [Kemaman]: Sokong, sokong. 

■1150 

Tuan Chong Chieng Jen: Itu satu isu atau soalan yang begitu serius dan effect-
nya yang sangat luas. Saya rasa ini perlu dibawa ke dalam Jemaah Menteri untuk 
dipertimbangkan dan diputuskan. Sekian, terima kasih. 

 

9. Dr. Hasan bin Bahrom [Tampin] minta 

In [7]:
data_wikipedia = []
with open('original_dataset/wikipedia-qa.jsonl') as fopen:
    for data in tqdm(fopen):
        data_wikipedia.append(json.loads(data))

print("Length of Hansard QA: ", len(data_wikipedia))
data_wikipedia[0].keys()

0it [00:00, ?it/s]

Length of Hansard QA:  44923


dict_keys(['paragraph', 'url', 'qa'])

In [8]:
data_wikipedia[0]

{'paragraph': 'The Legend of Korra ialah sebuah siri animasi televisyen Amerika Syarikat yang telah ditayangkan di rangkaian televisyen Nickelodeon sejak 2012. Siri ni dicipta oleh Bryan Konietzko dan Michael Dante DiMartino sebagai sebuah susulan daripada "", yang telah ditayangkan di Nickelodeon pada tahun 2005 sehingga tahun 2008. Beberapa orang yang berkencimpung dalam penciptaan "Avatar: The Last Airbender", termasuk seorang pereka, Joaquim Dos Santos dan dua orang komposer, Jeremy Zuckerman dan Benjamin Wynn, kembali semula untuk mencipta "The Legend of Korra". Siri ini berlatar belakangkan dunia fiksyen dimana seseorang boleh memanipulasi, atau "bend", bagi elemen air, bumi, api, atau udara. Hanya seorang sahaja, "Avatar", boleh memanipulasi kesemua 4 elemen, dan bertanggungjawab dalam mengekalkan keseimbangan dunia. Siri ini mengisahkan Avatar Korra, penggati Aang daripada siri sebelum ini, sambil menghadapi pergolakan politik dan roh semangat dalam dunia pemodenan. Siri ini ya

In [33]:
def unpack_chatgptqa(
        row: Dict,
        data_neg: List,
        hansard_neg: bool = False,
        hansard_pos: bool = False,
    ) -> Dict:
    """
    Function to reformat .jsonl data from https://huggingface.co/datasets/mesolitica/chatgpt-malaysian-open-qa
    into `query`, `text`, `label` format.

    For positive pairs, we take from the 'paragraph'/'paragraph''original' keys as "text".
    However, for negative pairs, we take from the 'paragraph'/'paragraph''original' keys from the other dataset.

    For example, if we're processing the rows from Hansard, we'd use 'paragraph'/'paragraph''original' keys
    from Wikipedia. Which dataset to use is done via the `data_neg` parameter.

    We do this mainly to ensure that the negative pair is unrelated as possible. Future versions of this function
    can perhaps incorporate a word error rate calculation e.g., https://thepythoncode.com/article/calculate-word-error-rate-in-python

    ### Arguments
    - `row`: Original row from `.jsonl` dataset. It is of the dictionary data type.
    - `data_neg`: A list of the dataset we'll use to source a negative pair from.
    - `hansard_neg`: Whether the Hansard QA dataset is used to source the negative pair. Default is False. 
    This parameter is included due to having a different format than the rest of the datasets (Common Crawl QA & Wikipedia QA)
    - `hansard_pos`: Whether we are preprocessing the Hansard QA dataset or not. Default is False.
    This parameter is included due to having a different format than the rest of the datasets (Common Crawl QA & Wikipedia QA)
    """

    final_data_list = []

    for pos in row['qa']['qa']:
        random_number = np.random.randint(low=1, high=len(data_neg))

        pos_row_dict = {}
        neg_row_dict = {}

        pos_row_dict['query'] = pos['question']
        if hansard_pos == False:
            pos_row_dict['text'] = row['paragraph']
        else:
            pos_row_dict['text'] = row['paragraph']['original']
        pos_row_dict['label'] = 1

        neg_row_dict['query'] = pos_row_dict['query']
        if hansard_neg == False:
            # we infer the negative text by taking a random one from either commmon crawl of wikipedia
            neg_row_dict['text'] = data_neg[random_number]['paragraph']
        else:
            # we infer the negative text by taking a random one from hansard
            neg_row_dict['text'] = data_neg[random_number]['paragraph']['original']
        neg_row_dict['label'] = 0
    
        final_data_list.append(pos_row_dict)
        final_data_list.append(neg_row_dict)
    
    return final_data_list

In [34]:
unpack_chatgptqa(data_common_crawl[0], data_wikipedia, False, False)

[{'query': 'Siapakah yang berada di Jengka Jerantut?',
  'text': 'Google Map) MAZLAN MOKHTAR JENGKA Jerantut (Klik nama untuk WhatsApp, klik lokasi untuk Google Map) MOHD NAJID JERANTUT Kuantan (Klik nama untuk WhatsApp, klik lokasi untuk Google Map) FAZIRA JALAN TELUK SISEK FAZIRAH ZUKI LORONG CHENGAL LEMPONG NORAZMA KUANTAN AZIZAH KUANTAN NANAD AZRI TAMAN BUKIT RANGIN NURULHANA SUNGAI ISAP ROHANI MUHAMAD JALAN AIR PUTIH (1 - 5) CIK TA JALAN INDUSTRI TANAH PUTIH BARU SAADIAH KAMPUNG BALOK SHARIFAH KUANTAN SITI KHADIJAH KAMPUNG TIRAM SITI NORHAMIZAH TAMAN SERI MAHKOTA AMAN SITI SHARINA KUANTAN ZALIFAH AHMAD BANDAR INDERA MAHKOTA Maran (Klik nama untuk WhatsApp, klik lokasi untuk Google Map) MOHD FADELY BUKIT TAJAU Muadzam Shah (Klik nama untuk WhatsApp, klik lokasi untuk Google Map) HASAZALI PAHANG Pekan (Klik nama untuk WhatsApp, klik lokasi untuk Google Map) FASEEHA KAHAR PEKAN Raub (Klik nama untuk WhatsApp, klik lokasi untuk Google Map) AZNAH TAMAN KEMAJUAN NOR SOLEHAH RAUB NUR SHA

##### 1.7.0.5 Generating negative pairs

Our plan below will be to use the right dataset as the source of the negative pair for the left dataset. This ought to ensure
very low relevance and hard negativity.

| Dataset to be formatted | Source dataset for negative pair |
| ------------------------|----------------------------------|  
| Wikipedia               |                          Hansard |
| Common Crawl            |                        Wikipedia |
| Hansard | Common Crawl |

**NOTE:** The row for the negative pair (in left dataset) will be randomly selected from the right dataset.

##### 1.7.1 wikipedia

In [37]:
# Save file
max_worker = 20

with ThreadPoolExecutor(max_workers=max_worker) as executor:
    for _ in tqdm(range(0, len(data_wikipedia), max_worker)):
        futures = {executor.submit(unpack_chatgptqa, t, data_hansard, True, False): t for t in data_wikipedia[_: _ + max_worker]}

        for future in as_completed(futures):
            result = future.result()
            if result:
                for r in result:
                    with open('reformatted_dataset/reformatted-wikipedia-qa.jsonl', 'a') as final:
                            json.dump(r, final)
                            final.write('\n')

  0%|          | 0/2247 [00:00<?, ?it/s]

In [41]:
test_wikipedia = pd.read_json('reformatted_dataset/reformatted-wikipedia-qa.jsonl', lines=True)

test_wikipedia.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 658042 entries, 0 to 658041
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   query   658042 non-null  object
 1   text    658042 non-null  object
 2   label   658042 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 15.1+ MB


##### 1.7.2 common crawl

In [38]:
# Save file
max_worker = 20

with ThreadPoolExecutor(max_workers=max_worker) as executor:
    for _ in tqdm(range(0, len(data_common_crawl), max_worker)):
        futures = {executor.submit(unpack_chatgptqa, t, data_wikipedia, False, False): t for t in data_common_crawl[_: _ + max_worker]}

        for future in as_completed(futures):
            result = future.result()
            if result:
                for r in result:
                    with open('reformatted_dataset/reformatted-common-crawl-qa.jsonl', 'a') as final:
                            json.dump(r, final)
                            final.write('\n')

  0%|          | 0/3492 [00:00<?, ?it/s]

In [43]:
test_common_crawl = pd.read_json('reformatted_dataset/reformatted-common-crawl-qa.jsonl', lines=True)

test_common_crawl.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 418978 entries, 0 to 418977
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   query   418978 non-null  object
 1   text    418978 non-null  object
 2   label   418978 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 9.6+ MB


In [44]:
test_common_crawl.head(4)

Unnamed: 0,query,text,label
0,Apakah produk yang ditawarkan oleh Camay?,Skip to content Home Product Info Menu Toggle ...,1
1,Apakah produk yang ditawarkan oleh Camay?,Tan Sri Dato' Seri Panglima Acryl Sani bin Haj...,0
2,Bagaimana cara membeli produk Camay?,Skip to content Home Product Info Menu Toggle ...,1
3,Bagaimana cara membeli produk Camay?,INPENS merupakan singkatan kepada Industri Pen...,0


##### 1.7.3 hansard

In [39]:
# Save file
max_worker = 20

with ThreadPoolExecutor(max_workers=max_worker) as executor:
    for _ in tqdm(range(0, len(data_hansard), max_worker)):
        futures = {executor.submit(unpack_chatgptqa, t, data_common_crawl, False, True): t for t in data_hansard[_: _ + max_worker]}

        for future in as_completed(futures):
            result = future.result()
            if result:
                for r in result:
                    with open('reformatted_dataset/reformatted-hansard-qa.jsonl', 'a') as final:
                            json.dump(r, final)
                            final.write('\n')

  0%|          | 0/2119 [00:00<?, ?it/s]

In [45]:
test_hansard = pd.read_json('reformatted_dataset/reformatted-hansard-qa.jsonl', lines=True)

test_hansard.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 254228 entries, 0 to 254227
Data columns (total 3 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   query   254228 non-null  object
 1   text    254228 non-null  object
 2   label   254228 non-null  int64 
dtypes: int64(1), object(2)
memory usage: 5.8+ MB


In [46]:
test_hansard.head(4)

Unnamed: 0,query,text,label
0,Apakah inisiatif dan tindakan agresif kementer...,\n12 ...,1
1,Apakah inisiatif dan tindakan agresif kementer...,Home Search Browse Browse By Body Type Sedan H...,0
2,Apakah senarai produk-produk minuman yang tela...,\n12 ...,1
3,Apakah senarai produk-produk minuman yang tela...,Courses Institution Essential Guide Scholarshi...,0


### 2.0 Convert to Mosaic

In [2]:
# Source: https://docs.mosaicml.com/projects/streaming/en/latest/fundamentals/dataset_conversion_guide.html

columns = {
    'query': 'str',
    'text': 'str',
    'label': 'int8',
}

compression = 'zstd'
hashes = 'sha1', 'xxh64'

We should use a linux command next time!

```
!cat file1.jsonl file2.jsonl file3.jsonl > combined.jsonl
```

In [8]:
folders = sorted(glob.glob('reformatted_dataset/reformatted-*.jsonl'))
folders

['reformatted_dataset/reformatted-common-crawl-qa.jsonl',
 'reformatted_dataset/reformatted-facebook.jsonl',
 'reformatted_dataset/reformatted-hansard-qa.jsonl',
 'reformatted_dataset/reformatted-iium-confession.jsonl',
 'reformatted_dataset/reformatted-mining-b-cari-com-my.jsonl',
 'reformatted_dataset/reformatted-mining-summarization.jsonl',
 'reformatted_dataset/reformatted-news.jsonl',
 'reformatted_dataset/reformatted-twitter.jsonl',
 'reformatted_dataset/reformatted-wikipedia-qa.jsonl']

In [26]:
with MDSWriter(out='mosaic-dataset-all', columns=columns, compression=None, hashes=hashes) as out:
    for no, f in enumerate(folders):
        print(no)
        empty_rows = 0
        with open(f) as fopen:
            for data in tqdm(fopen):
                data = json.loads(data)
                if (data == {} or data['query'] == None or data['text'] == None or data['label'] == None or data['query'] == "" or data['text'] == "" or data['label'] not in [0, 1]):
                    empty_rows += 1
                    continue
                out.write(data)
            
        print("No. of empty rows: ", empty_rows)
        print("")

0


0it [00:00, ?it/s]

No. of empty rows:  0

1


0it [00:00, ?it/s]

No. of empty rows:  0

2


0it [00:00, ?it/s]

No. of empty rows:  2

3


0it [00:00, ?it/s]

No. of empty rows:  0

4


0it [00:00, ?it/s]

No. of empty rows:  2732

5


0it [00:00, ?it/s]

No. of empty rows:  54053

6


0it [00:00, ?it/s]

No. of empty rows:  0

7


0it [00:00, ?it/s]

No. of empty rows:  1

8


0it [00:00, ?it/s]

No. of empty rows:  0



In [30]:
# testing
test_mosaic_dataset = LocalDataset(local="mosaic-dataset-all")
test_mosaic_dataset[2]

{'label': 1,
 'query': 'Bagaimana cara membeli produk Camay?',
 'text': 'Skip to content Home Product Info Menu Toggle Camay Repellent Repel Guard Plus+ Camay Foliar Fertilizer About Us Contact Us My account My Agent Menu Toggle Agents Listed Register Agent 0 Main Menu Home Product Info Menu Toggle Camay Repellent Repel Guard Plus+ Camay Foliar Fertilizer About Us Contact Us My account My Agent Menu Toggle Agents Listed Register Agent My Agent Nak beli produk Camay? Jom klik pada senarai ejen-ejen yang berdaftar dengan kami. Klik pada negeri pilihan anda,dan pilih ejen-ejen kegemaran anda mengikut kawasan pilihan anda. + Johor (35) Nama Lokasi Bandar Penawar (Klik nama untuk WhatsApp, klik lokasi untuk Google Map) LAILI BANDAR PENAWAR Bandar Penawar, Kota Tinggi (Klik nama untuk WhatsApp, klik lokasi untuk Google Map) NOR ALYDA BANDAR PENAWAR Batu Pahat (Klik nama untuk WhatsApp, klik lokasi untuk Google Map) KAMILAH BATU PAHAT NURAINI RENGIT NOR IZYANTI BATU PAHAT ZANITA BATU PAHAT Ba

In [32]:
# bapak dia!
test_mosaic_dataset.size

29304582

In [33]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         124G   37G   88G  30% /
tmpfs            64M     0   64M   0% /dev
/dev/sdc        984G  467G  517G  48% /home/ubuntu
/dev/sdd        9.8G  131M  9.7G   2% /dev/shm
/dev/root       124G   37G   88G  30% /etc/hosts
tmpfs           205G   12K  205G   1% /run/secrets/kubernetes.io/serviceaccount
tmpfs           109G   12K  109G   1% /proc/driver/nvidia
tmpfs            44G   20M   44G   1% /run/nvidia-persistenced/socket
tmpfs           109G     0  109G   0% /proc/acpi
tmpfs           109G     0  109G   0% /proc/scsi
tmpfs           109G     0  109G   0% /sys/firmware
