### Mining dataset Using bge-large-en

In [1]:
# !pip3 install orjson

In [2]:
from glob import glob
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
import orjson as json

### Load json file containing embedding & texts

In [3]:
# Parse the JSONL dump line by line; each parsed record is appended to `data`.
data = []
n_skipped = 0  # number of lines that could not be decoded as JSON

with open('bge-embedding/bge-large-en-embedding/twitter.jsonl') as fopen:
    for x in tqdm(fopen):
        try:
            data.append(json.loads(x))
        except ValueError:
            # Malformed line: skip it (best effort) but keep count instead of
            # hiding it. JSON decode errors from both orjson and the stdlib
            # json module are ValueError subclasses, so unrelated failures
            # (KeyboardInterrupt, MemoryError, ...) are no longer swallowed.
            n_skipped += 1

if n_skipped:
    print(f'skipped {n_skipped} malformed lines')

691012it [01:20, 8548.87it/s] 


In [4]:
len(data)

691012

**Dataset Format**

- `v`: the 1024-dimensional embedding vector generated by BAAI/bge-large-en
- `text`: the tweet in three versions:
    - `left`: original text
    - `en`: English translation
    - `ms`: Malay translation

In [5]:
data[1]

{'text': {'left': 'I thought alor setar nya da cukup sedih.. kelantan nya peh ... Kedah n kelantan nya MB mana?',
  'en': 'I thought Alor Setar was already quite sad.. Kelantan is worse... Who is the MB for Kedah and Kelantan?',
  'ms': 'Saya fikir Alor Setar sudah cukup sedih.. Kelantan lagi teruk... Siapa MB untuk Kedah dan Kelantan?'},
 'v': [0.13888011872768402,
  -0.47501182556152344,
  0.40516266226768494,
  0.16995717585086823,
  -0.6713352203369141,
  -1.0514363050460815,
  0.10906267166137695,
  0.34273001551628113,
  0.152670755982399,
  -0.2184249609708786,
  0.42360803484916687,
  -0.836317241191864,
  -0.4082200825214386,
  -0.32670679688453674,
  -0.6629812717437744,
  -0.3168671429157257,
  -0.37673184275627136,
  0.008131159469485283,
  -0.31306037306785583,
  0.4005916118621826,
  0.9667572379112244,
  -0.17604903876781464,
  -0.9339383244514465,
  -0.38919058442115784,
  -0.27883780002593994,
  0.4602336883544922,
  0.27048981189727783,
  0.42183181643486023,
  1.1284

### Scipy KDTree for Computing Distance & Fast Nearest Neighbor Lookup

In [6]:
# Split the records into two parallel lists: embeddings and their text dicts.
vectors = [record['v'] for record in data]
texts = [record['text'] for record in data]

In [7]:
from scipy.spatial import KDTree
import numpy as np

In [8]:
# Stack the per-record embedding lists into a single 2-D array (one row per record).
concat = np.array(vectors)
# Displayed as the cell output to confirm (n_records, embedding_dim).
concat.shape

(691012, 1024)

In [9]:
%%time

# Build a k-d tree over all embeddings; it is queried per row when mining pairs.
# NOTE(review): SciPy's KDTree documentation warns that in high dimensions
# (these vectors have 1024) queries are not expected to be much faster than
# brute force — confirm this index is actually worth building.
kd_tree = KDTree(concat, leafsize = 40)

CPU times: user 8.35 s, sys: 80.3 ms, total: 8.43 s
Wall time: 8.43 s


### Generating Train Dataset

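We will generate three rows for each text index, one per key in `['left', 'en', 'ms']`:

`{'query': ..., 'pos': [...], 'neg': [...]}`

- `query`: the text stored under one key of the current index.
- `pos`: the positive pairs — the texts (all keys) of the positive neighbours, plus the current index's own texts under the keys not used for the query; any text equal to the query is excluded.
- `neg`: the negative pairs — the texts (all keys `['left', 'en', 'ms']`) of the negative neighbours.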


- Positive text pair threshold: embedding distance <= 8 (`lower_bound` in the code below)
- Negative text pair threshold: embedding distance > 15 (`upper_bound` in the code below)

In [10]:
# Text keys to iterate over when building queries, taken from the first record.
# NOTE(review): assumes every record in `texts` has the same keys as texts[0] — confirm.
keys = list(texts[0].keys())

In [11]:
texts[0].keys()

dict_keys(['left', 'en', 'ms'])

In [12]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/5aa5257608b61e8fcc828e99fbd070d5ca7358e3/mp.py
import mp

def dedup(strings):
    """
    Remove case-insensitive duplicates from `strings`.

    The first occurrence of each string (compared via `str.lower()`) is kept
    with its original casing, and the input order is preserved.

    Returns a new list; the input is not modified.
    """
    seen_lowered = set()
    kept = []

    for candidate in strings:
        lowered = candidate.lower()
        if lowered in seen_lowered:
            continue
        seen_lowered.add(lowered)
        kept.append(candidate)
    return kept

dedup(['a', 'a'])

['a']

In [13]:
import time

lower_bound = 8   # neighbours with distance <= lower_bound are treated as positives
upper_bound = 15  # neighbours with distance > upper_bound are treated as negatives

def loop(data):
    """
    Mine positive/negative text pairs for a shard of rows and write them as JSONL.

    Parameters
    ----------
    data : tuple
        `(indices, index)`: `indices` is an iterable of row positions into
        `concat` / `texts`, and `index` is the shard id used in the output file
        name `twitter-train-dataset-{index}.jsonl`.

    For every row and every key in `keys`, one line
    `{'query': ..., 'pos': [...], 'neg': [...]}` is written. At most 5 rows are
    used as negatives, and at most 5 as positives (4 sampled neighbours plus the
    row itself); texts are de-duplicated with `dedup`.

    Returns None; the result is the file written to disk.
    """
    data, index = data
    # Explicit UTF-8: the serialized JSON contains raw non-ASCII characters, so
    # relying on the platform's default encoding could fail or corrupt text.
    with open(f'twitter-train-dataset-{index}.jsonl', 'w', encoding='utf-8') as fopen:
        for x in tqdm(data):
            # k=len(concat) returns every row, ordered by distance from row x.
            dist, ind = kd_tree.query(concat[x], k=len(concat), workers = 1)

            # The candidate pools depend only on x, not on the key, so build
            # them once per row instead of once per key.
            pos_pool = ind[dist <= lower_bound].tolist()
            neg_pool = ind[dist > upper_bound].tolist()
            # Row x is always appended explicitly below; excluding it here
            # prevents it from being drawn twice and shrinking the sample.
            other_pos = [k for k in pos_pool if k != x]

            for key in keys:

                query = texts[x][key]

                # A fresh random sample is drawn for every key.
                if len(neg_pool) > 5:
                    neg_indices = random.sample(neg_pool, 5)
                else:
                    neg_indices = neg_pool
                if len(pos_pool) > 5:
                    pos_indices = random.sample(other_pos, 4)
                    pos_indices.append(x)
                else:
                    pos_indices = pos_pool

                # Row x's own texts under the other keys count as positives;
                # only strings identical to the query are dropped.
                pos = [value for i in pos_indices for value in texts[i].values() if value != query]

                neg = [value for i in neg_indices for value in texts[i].values()]

                d = {'query':query,'pos':dedup(pos),'neg':dedup(neg)}
                # json is orjson here: dumps() returns bytes, hence .decode().
                fopen.write(f'{json.dumps(d).decode()}\n')
                # Flush per line so partial results survive an interrupted run.
                fopen.flush()

In [14]:
# Smoke test: run the mining loop on the first 10 rows as shard 0
# (writes twitter-train-dataset-0.jsonl).
loop((range(10), 0))

100%|██████████| 10/10 [00:09<00:00,  1.04it/s]


In [15]:
import mp

# Run `loop` over every row index using the `mp` helper with 30 cores.
# NOTE(review): assumes mp.multiprocessing splits the range into per-core chunks
# and calls `loop` with a (chunk, index) tuple, so each worker writes its own
# twitter-train-dataset-{index}.jsonl — confirm against mp.py.
mp.multiprocessing(range(len(data)), loop, cores = 30, returned = False)

  5%|▌         | 1235/23033 [1:04:06<14:40:37,  2.42s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 20%|██        | 4648/23033 [4:13:59<15:07:06,  2.96s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 22%|██▏       | 5062/23033 [4:31:04<17:00:06,  3.41s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`-