### Mining dataset Using bge-large-en

In [1]:
from glob import glob
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
import orjson as json

### Load json file containing embedding & texts

In [2]:
data = []


with open('bge-embedding/bge-large-en-embedding/b.cari.com.my.jsonl') as fopen:
    for x in tqdm(fopen):
        try:
            data.append(json.loads(x))

        except:
            pass

631723it [01:44, 6050.41it/s] 


In [3]:
len(data)

631723

**Dataset Format**

- v: contains the embedding vectors with 1024 dimension generated from baai/bge-large-en
- text:
    - 'left' : original text
    - 'en': translated english text
    - 'ms' : translated malay text

In [4]:
data[1]

{'text': {'left': 'Apabila berbicara mengenai kahwin, jgn pernah terburu-buru. Fikir dulu masak-masak. Dan banyakkan berbincang dgn pasangan.',
  'en': 'When it comes to marriage, never rush. Think it through carefully. And have plenty of discussions with your partner.',
  'ms': 'When talking about marriage, never be in a hurry. Think it over carefully. And have many discussions with your partner.'},
 'v': [-0.07691702246665955,
  -0.20879215002059937,
  -0.17237505316734314,
  0.06482794880867004,
  -0.8752002120018005,
  -0.5508242845535278,
  -0.09948401153087616,
  0.2595939338207245,
  0.23982754349708557,
  -0.012539883144199848,
  0.7432616949081421,
  -0.45767995715141296,
  0.1600823551416397,
  -0.589044451713562,
  -0.3139093518257141,
  0.17132817208766937,
  -0.620173990726471,
  -1.181017279624939,
  -0.39189186692237854,
  0.2889060974121094,
  0.20220234990119934,
  0.13703162968158722,
  -1.1252038478851318,
  -0.27330130338668823,
  -0.270694375038147,
  1.62221860885

### Scipy KDTree for Computing Distance & Fast Nearest Neighbor Lookup

In [5]:
vectors, texts = [], []

for d in data:
    vectors.append(d['v'])
    texts.append(d['text'])

In [6]:
from scipy.spatial import KDTree
import numpy as np

In [7]:
concat = np.array(vectors)
concat.shape

(631723, 1024)

In [8]:
%%time

kd_tree = KDTree(concat, leafsize = 40)

CPU times: user 10.7 s, sys: 155 ms, total: 10.9 s
Wall time: 10.9 s


### Generating Train Dataset

we will generate three row for each index text:

{'query': text for one key , 'neg': negative pair for each text index for key ['left','en','ms'], 'pos': positive pair text and query text for text in key ['left','en','ms'] except the key used in the query]}


- Positive text pair threshold: text distance <= 10
- Negative text pair threshold: text distance > 15

In [9]:
keys = list(texts[0].keys())

In [10]:
texts[0].keys()

dict_keys(['left', 'en', 'ms'])

In [11]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/5aa5257608b61e8fcc828e99fbd070d5ca7358e3/mp.py
import mp

def dedup(strings):
    unique_neg = []
    elements = set()

    for n in strings:
        x_lower = n.lower()
        if x_lower not in elements:
            elements.add(x_lower)
            unique_neg.append(n)
    return unique_neg

dedup(['a', 'a'])

['a']

In [16]:
import time

lower_bound = 8
upper_bound = 15

def loop(data):
    data, index = data
    with open(f'./b.cari.com.my/b.cari.com.my-train-dataset-{index}.jsonl', 'w') as fopen:
        for x in tqdm(data):
            dist, ind = kd_tree.query(concat[x], k=len(concat), workers = 1)

            for key in keys:

                query = texts[x][key]

                pos_indices = [k for k in ind[dist <= lower_bound]]
                neg_indices = [k for k in ind[dist > upper_bound]]

                if len(neg_indices) > 5:
                    neg_indices = random.sample(neg_indices,5)
                if len(pos_indices) > 5:
                    rand_pos = random.sample(pos_indices,4)
                    rand_pos.append(x)
                    pos_indices = rand_pos


                pos = [value for i in pos_indices for value in texts[i].values() if value != query ]

                neg = [value for i in neg_indices for value in texts[i].values()]
            
                d = {'query':query,'pos':dedup(pos),'neg':dedup(neg)}
                fopen.write(f'{json.dumps(d).decode()}\n')
                fopen.flush()

In [15]:
loop((range(10), 0))

100%|██████████| 10/10 [00:11<00:00,  1.19s/it]


In [None]:
import mp

mp.multiprocessing(range(len(data)), loop, cores = 15, returned = False)

  6%|▌         | 2359/42114 [1:00:59<18:27:56,  1.67s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

  7%|▋         | 2819/42114 [1:14:23<16:17:47,  1.49s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 28%|██▊       | 11649/42114 [4:58:20<13:12:39,  1.56s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`

In [1]:
import orjson as json
import glob

files = []
for file in glob.glob("./b.cari.com.my/*.jsonl"):
    print(file)
    files.append(file)

./b.cari.com.my/b.cari.com.my-train-dataset-13.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-7.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-4.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-2.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-9.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-3.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-10.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-15.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-0.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-5.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-11.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-14.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-12.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-1.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-6.jsonl
./b.cari.com.my/b.cari.com.my-train-dataset-8.jsonl


In [4]:
import json

In [5]:
output_file = "b.cari.com.my-train-dataset-bge.jsonl"

# Open the output file for writing
with open(output_file, "w") as combined_file:
    # Iterate through each input JSONL file
    for jsonl_file in files:
        # Open the input JSONL file for reading
        with open(jsonl_file, "r") as infile:
            # Iterate through each line in the input file
            for line in infile:
                # Parse the JSON data from the line
                data = json.loads(line)
                # Write the JSON data to the output file
                json.dump(data, combined_file)
                combined_file.write('\n')

In [None]:
!mv b.cari.com.my-train-dataset-bge.jsonl ../

In [2]:
import json

with open('hansard/hansard-train-dataset-29.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        break

In [4]:
l['query']

'\nDR. 28.2.2023 6 \n\n \nTimbalan Perdana Menteri tetapi semasa Yang Amat Berhormat Pekan menjadi Perdana \n\nMenteri. Menteri yang bertanggungjawab dan saya bekerjasama pada masa itu. \n\n Saya hendak rakamkan bahawa ada dua orang Ahli Parlimen sekarang yang dahulu \n\nbelum menjadi Ahli Parlimen tetapi terlibat sama dalam proses itu, Yang Berhormat \n\nSetiawangsa dan Yang Berhormat Bangi. Juga, bekas Ahli Parlimen Puan Nurul Izzah Anwar \n\ndan Encik Tony Pua dan Presiden Bar Council pada masa itu, sekarang Dato’ Lee Chin Wei. \n\nSoalannya ialah dalam manifesto Pakatan Harapan, AUKU hendak dimansuhkan tetapi \n\nsahabat saya Yang Berhormat Menteri KPT menjawab sedikit berbeza. Ini low-hanging \n\nfruitlah yang senang, yang seharusnya dipinda dengan segera sebab kerja-kerja \n\nmenggantikan AUKU yang sekarang ini pun Yang Berhormat Menteri faham perkara itu pun \n\nsudah dimulakan pada zaman beliau dahulu lagi. \n\n Tuan Yang di-Pertua:  Yang Berhormat Indera Mahkota, soalannya ial