### Mining dataset Using bge-large-en

In [1]:
from glob import glob
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
import orjson as json
import pickle
import os

class Pointer:
    def __init__(self, filename):
        self.filename = filename
        self.index = 0

    def _save(self):
        with open(self.filename, 'wb') as fopen:
            pickle.dump(self.index, fopen)

    def increment(self):
        self.index += 1
        self._save()

    def load(self):
        if not os.path.exists(self.filename):
            return
        with open(self.filename, 'rb') as fopen:
            self.index = pickle.load(fopen)

### Load json file containing embedding & texts

In [2]:
data = []

with open('bge-embedding/bge-large-en-embedding/manglish.jsonl') as fopen:
    for x in tqdm(fopen):
        try:
            data.append(json.loads(x))

        except:
            pass

1036338it [03:09, 5470.74it/s] 


In [3]:
len(data)

1036338

**Dataset Format**

- v: contains the embedding vectors with 1024 dimension generated from baai/bge-large-en
- text:
    - 'left' : original text
    - 'en': translated english text
    - 'ms' : translated malay text

In [4]:
data[1]

{'text': {'left': 'You are welcome to sniff her washed undies and show us a pic of it as proof of how un-intimate it is.',
  'en': 'You are welcome to smell her laundered underwear and provide us with a picture as evidence of how non-intimate it is.',
  'ms': 'Anda dialu-alukan untuk menghidu seluar dalam yang dicuci dan menunjukkan gambar sebagai bukti bahawa ia tidak intim.'},
 'v': [0.06512866169214249,
  -0.19405974447727203,
  0.2718108594417572,
  -0.022462619468569756,
  -0.6758020520210266,
  -0.616482138633728,
  -0.9126758575439453,
  -0.08668608963489532,
  -0.24268391728401184,
  -0.06757412850856781,
  0.735045850276947,
  -0.2121828943490982,
  0.3912808299064636,
  -0.09131388366222382,
  -1.1493905782699585,
  0.09065014868974686,
  -0.18948860466480255,
  -0.529503345489502,
  0.2737756073474884,
  -0.10617012530565262,
  0.6341321468353271,
  0.3843379020690918,
  -1.3839341402053833,
  0.19346211850643158,
  -0.10952679067850113,
  0.7198636531829834,
  0.71870177984

### Scipy KDTree for Computing Distance & Fast Nearest Neighbor Lookup

In [5]:
vectors, texts = [], []

for d in data:
    vectors.append(d['v'])
    texts.append(d['text'])

In [6]:
from scipy.spatial import KDTree
import numpy as np

In [7]:
concat = np.array(vectors)
concat.shape

(1036338, 1024)

In [8]:
%%time

kd_tree = KDTree(concat, leafsize = 40)

CPU times: user 23.2 s, sys: 29.8 s, total: 53 s
Wall time: 53.8 s


### Generating Train Dataset

we will generate three row for each index text:

{'query': text for one key , 'neg': negative pair for each text index for key ['left','en','ms'], 'pos': positive pair text and query text for text in key ['left','en','ms'] except the key used in the query]}


- Positive text pair threshold: text distance <= 10
- Negative text pair threshold: text distance > 15

In [9]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/5aa5257608b61e8fcc828e99fbd070d5ca7358e3/mp.py
import mp

def dedup(strings):
    unique_neg = []
    elements = set()

    for n in strings:
        x_lower = n.lower()
        if x_lower not in elements:
            elements.add(x_lower)
            unique_neg.append(n)
    return unique_neg

dedup(['a', 'a'])

['a']

In [10]:
!mkdir manglish

mkdir: cannot create directory ‘manglish’: File exists


In [11]:
keys = list(texts[0].keys())

In [12]:
import time

lower_bound = 10
upper_bound = 15

def loop(data):
    data, index = data
    filename = f'./manglish/manglish-train-dataset-{index}.jsonl'
    pointer = Pointer(f'{filename}.pickle')
    pointer.load()
    with open(filename, 'w') as fopen:
        for i, x in tqdm(enumerate(data)):
            if i > pointer.index:
                dist, ind = kd_tree.query(concat[x], k=len(concat), workers = 1)

                for key in keys:

                    query = texts[x][key]

                    pos_indices = [k for k in ind[dist <= lower_bound]]
                    neg_indices = [k for k in ind[dist > upper_bound]]

                    if len(neg_indices) > 5:
                        neg_indices = random.sample(neg_indices,5)
                    if len(pos_indices) > 5:
                        rand_pos = random.sample(pos_indices,4)
                        rand_pos.append(x)
                        pos_indices = rand_pos


                    pos = [value for i in pos_indices for value in texts[i].values() if value != query ]

                    neg = [value for i in neg_indices for value in texts[i].values()]

                    d = {'query':query,'pos':dedup(pos),'neg':dedup(neg)}
                    fopen.write(f'{json.dumps(d).decode()}\n')
                    fopen.flush()
                    pointer.index = i
                    pointer._save()

In [13]:
!rm manglish/*.*

In [14]:
# loop((range(10), 0))

In [15]:
import mp

mp.multiprocessing(range(len(data)), loop, cores = 15, returned = False)

5858it [4:25:15,  2.31s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

12852it [9:15:16,  2.23s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

13225it [9:27:20,  2.26s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=10