### Mining a Dataset Using bge-large-en

In [1]:
# !pip3 install orjson

In [2]:
import os
import pickle
from glob import glob
# import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import random
import orjson as json
import numpy as np
from scipy.spatial import KDTree
import seaborn as sns

In [3]:
class Pointer:
    """Tiny on-disk checkpoint holding a single integer index.

    The index is kept in memory as ``self.index`` and persisted to
    ``self.filename`` as a pickle so that a long-running job can resume
    after a restart.

    Parameters
    ----------
    filename : str
        Path of the pickle file used to persist the index.
    """

    def __init__(self, filename):
        self.filename = filename
        # Starts at 0 until `load()` finds a previously saved value.
        self.index = 0

    def _save(self):
        """Persist the current index to ``self.filename``.

        The value is written to a sibling temporary file first and then
        moved into place with ``os.replace`` so that an interruption during
        the write cannot leave a truncated, unreadable checkpoint behind.
        """
        tmp_name = f'{self.filename}.tmp'
        with open(tmp_name, 'wb') as fopen:
            pickle.dump(self.index, fopen)
        os.replace(tmp_name, self.filename)

    def increment(self):
        """Advance the index by one and persist it immediately."""
        self.index += 1
        self._save()

    def load(self):
        """Restore the index from disk; a missing file leaves it unchanged."""
        if not os.path.exists(self.filename):
            return
        # NOTE: pickle is only safe here because the file is produced by
        # `_save` itself; never point this at an untrusted file.
        with open(self.filename, 'rb') as fopen:
            self.index = pickle.load(fopen)

### Load the JSONL file containing embeddings and texts

In [4]:
data = []
skipped = 0  # number of lines that could not be parsed as JSON

# Read explicitly as UTF-8: the texts contain non-ASCII characters, and the
# platform default encoding is not guaranteed to be UTF-8.
with open('bge-embedding/bge-large-en-embedding/c.cari.com.my.jsonl', encoding='utf-8') as fopen:
    for x in tqdm(fopen):
        try:
            data.append(json.loads(x))
        except ValueError:
            # orjson.JSONDecodeError is a ValueError subclass. Malformed lines
            # are still skipped (best effort), but counted instead of being
            # silently discarded, and unrelated errors such as
            # KeyboardInterrupt are no longer swallowed.
            skipped += 1

737977it [01:34, 7836.06it/s] 


In [5]:
len(data)

737977

In [6]:
data[0]

{'text': {'left': '就简单一条路, 上tokun山咯..哈哈哈..\ntamiya 发表于 24-5-2012 07:50 PM  要人命咩？',
  'en': 'Just a simple road, go up to Mount Tokun.. hahaha.. tamiya posted on 24-5-2012 07:50 PM Is it dangerous?',
  'ms': 'Hanya jalan yang mudah, naik ke Gunung Tokun.. hahaha.. tamiya menghantar pada 24-5-2012 07:50 PM Adakah ia berbahaya?'},
 'v': [-0.21345102787017822,
  0.696921706199646,
  -0.0007770132506266236,
  0.26271912455558777,
  -0.9448431730270386,
  -0.8142252564430237,
  0.1264476329088211,
  0.7851533889770508,
  0.7293431758880615,
  0.3283812403678894,
  0.9870699048042297,
  0.09791141748428345,
  0.4052395820617676,
  -0.8279939889907837,
  -0.4150835871696472,
  0.26475656032562256,
  -0.23079133033752441,
  -0.543842613697052,
  -0.13258950412273407,
  0.47374197840690613,
  0.24937523901462555,
  -0.4776596426963806,
  -1.389888048171997,
  -0.6096712946891785,
  -0.4612103998661041,
  0.0845121219754219,
  0.897865355014801,
  0.12646779417991638,
  0.9993556141853333,
  1.32

**Dataset Format**

- `v`: the 1024-dimensional embedding vector generated with BAAI/bge-large-en
- `text`:
    - `'left'`: original text
    - `'en'`: English translation
    - `'ms'`: Malay translation

In [7]:
data[1]

{'text': {'left': '本帖最后由 levis69 于 26-5-2012 10:33 PM 编辑  保持的很好阿，lz几岁了。。。两个是lz的孩子吗\n文雯 发表于 4-3-2012 03:37 PM  谢你的支持。是的，不年轻了',
  'en': "This post was last edited by levis69 on 26-5-2012 at 10:33 PM. You're doing great, how old are you, lz? Are those two your children?",
  'ms': 'Pos ini diedit terakhir oleh levis69 pada 26-5-2012 jam 10:33 malam. Anda berjaga dengan baik, berapa umur anda, lz? Adakah kedua-duanya anak anda?'},
 'v': [0.3147364854812622,
  0.049950920045375824,
  -0.17954352498054504,
  0.9019356966018677,
  -0.7601484060287476,
  -0.7046667337417603,
  -0.05164099112153053,
  0.6316915154457092,
  0.21544410288333893,
  0.22399461269378662,
  -0.33199557662010193,
  0.07661987841129303,
  0.059526268392801285,
  -0.27398717403411865,
  -0.4417124390602112,
  0.23922640085220337,
  -0.24657659232616425,
  -0.07777870446443558,
  -0.2128477543592453,
  0.4317103922367096,
  0.5941421389579773,
  0.4977310597896576,
  -0.9081755876541138,
  0.33235716819763184,
  -0.133705

### Scipy KDTree for Computing Distance & Fast Nearest Neighbor Lookup

In [8]:
# Split every loaded record into two parallel lists: its embedding and its
# dict of texts. Position i in both lists refers to the same record.
vectors = [record['v'] for record in data]
texts = [record['text'] for record in data]

In [9]:
from scipy.spatial import KDTree
import numpy as np

In [10]:
# Stack the per-record embedding lists into a single 2-D array
# (one row per record, one column per embedding dimension).
concat = np.array(vectors)
concat.shape

(737977, 1024)

In [11]:
%%time

# Build the k-d tree over all embeddings; it is queried later to obtain the
# distance from one embedding to every other embedding.
# NOTE(review): the embeddings are high-dimensional, where k-d trees usually
# prune poorly — confirm this is faster than a brute-force distance pass.
kd_tree = KDTree(concat, leafsize = 40)

CPU times: user 11.9 s, sys: 186 ms, total: 12.1 s
Wall time: 12.1 s


In [12]:
%%time

# dist, ind = kd_tree.query(concat[:1000], k=len(concat), workers = 1000)

CPU times: user 5 µs, sys: 1e+03 ns, total: 6 µs
Wall time: 10.7 µs


In [13]:
# sns.distplot(dist)

In [14]:
# dist[dist > 17.6].shape

In [15]:
# dist[dist < 10.5].shape

### Generating Train Dataset

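We will generate up to three rows for each text index, one per key (`'left'`, `'en'`, `'ms'`); a row is skipped when it has no positive or no negative texts: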

`{'query': <the text under one key>, 'pos': <texts under all keys ['left', 'en', 'ms'] of the positive (nearby) indices, excluding the query text itself>, 'neg': <texts under all keys ['left', 'en', 'ms'] of the negative (distant) indices>}`


- Positive text pair threshold: embedding distance <= 10.5 (`lower_bound`)
- Negative text pair threshold: embedding distance > 17.6 (`upper_bound`)

In [16]:
# The text keys shared by every record, taken from the first record.
keys = [*texts[0]]

In [17]:
keys

['left', 'en', 'ms']

In [18]:
texts[0].keys()

dict_keys(['left', 'en', 'ms'])

In [19]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/5aa5257608b61e8fcc828e99fbd070d5ca7358e3/mp.py
import mp

def dedup(strings):
    """Return `strings` without case-insensitive duplicates.

    Two strings count as duplicates when their lower-cased forms are equal.
    The first occurrence is kept with its original casing, and the relative
    order of the kept strings is preserved.
    """
    first_seen = {}
    for candidate in strings:
        # setdefault only stores the candidate when its lower-cased form is new.
        first_seen.setdefault(candidate.lower(), candidate)
    return list(first_seen.values())

dedup(['a', 'a'])

['a']

In [23]:
import time

lower_bound = 10.5
upper_bound = 17.6

def loop(data):
    """Mine (query, pos, neg) training rows for one chunk of record indices.

    Parameters
    ----------
    data : tuple
        ``(indices, index)`` where ``indices`` is an iterable of row positions
        into the module-level ``concat`` / ``texts`` and ``index`` identifies
        the chunk; it is used to name the output and checkpoint files.

    For every row position ``x`` the distance from ``concat[x]`` to every
    embedding is obtained from ``kd_tree``. Rows with distance
    ``<= lower_bound`` are positive candidates and rows with distance
    ``> upper_bound`` are negative candidates. For each key in ``keys`` one
    JSON line ``{'query', 'pos', 'neg'}`` is appended to
    ``c.cari/c.cari-train-dataset-{index}.jsonl``, unless the positive or the
    negative list is empty after de-duplication.

    Progress is checkpointed in ``<output file>.pickle`` so an interrupted
    run resumes after the last completed item. Returns ``None``.
    """
    import os

    data, index = data
    filename = f'c.cari/c.cari-train-dataset-{index}.jsonl'
    # Appending fails if the directory is missing, so make sure it exists.
    os.makedirs(os.path.dirname(filename), exist_ok=True)

    pointer = Pointer(f'{filename}.pickle')
    # A fresh Pointer starts at index 0, which is indistinguishable from
    # "item 0 already done". Use the checkpoint file's existence to tell the
    # two apart, otherwise the first item of every chunk is never processed.
    has_checkpoint = os.path.exists(pointer.filename)
    pointer.load()
    start = pointer.index + 1 if has_checkpoint else 0

    # The context manager guarantees the output file is closed, also on errors.
    with open(filename, 'a') as fopen:
        for i, x in tqdm(enumerate(data)):
            if i < start:
                continue

            # Distances from this embedding to every embedding in the tree.
            dist, ind = kd_tree.query(concat[x], k=len(concat), workers = 1)

            # The candidate pools do not depend on the key: compute them once.
            pos_pool = ind[dist <= lower_bound].tolist()
            neg_pool = ind[dist > upper_bound].tolist()

            for key in keys:
                query = texts[x][key]

                neg_indices = neg_pool
                if len(neg_indices) > 5:
                    neg_indices = random.sample(neg_indices, 5)

                pos_indices = pos_pool
                if len(pos_indices) > 5:
                    # Keep the row itself so its other-language texts are
                    # always among the positives.
                    pos_indices = random.sample(pos_indices, 4) + [x]

                pos = dedup([
                    value
                    for j in pos_indices
                    for value in texts[j].values()
                    if value != query
                ])
                if not pos:
                    continue

                neg = dedup([
                    value
                    for j in neg_indices
                    for value in texts[j].values()
                ])
                if not neg:
                    continue

                d = {'query': query, 'pos': pos, 'neg': neg}
                fopen.write(f'{json.dumps(d).decode()}\n')

            # Flush the rows of this item before advancing the checkpoint, and
            # advance it even when no row was written so that a resumed run
            # does not repeat the expensive query for this item.
            fopen.flush()
            pointer.index = i
            pointer._save()

In [24]:
!rm c.cari/*

In [25]:
loop((range(10), 0))

10it [00:12,  1.23s/it]


In [None]:
import mp

# NOTE(review): presumably `mp.multiprocessing` splits the index range into
# chunks and calls `loop((chunk, chunk_id))` in up to `cores` worker
# processes — confirm against mp.py. Each worker drives its own tqdm bar,
# which likely explains the "IOPub message rate exceeded" warnings below.
mp.multiprocessing(range(len(data)), loop, cores = 50, returned = False)

6138it [6:27:21,  4.24s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

11798it [12:19:31,  3.51s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

12070it [12:38:12,  4.10s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=

In [None]:
# Peek at the checkpoint written by one worker to see how far it has got.
file_path = 'c.cari/c.cari-train-dataset-40.jsonl.pickle'

try:
    with open(file_path, 'rb') as file:
        test = pickle.load(file)
        # Header and value on separate lines, emitted with one print call.
        print("Contents of the pickle file:", test, sep="\n")
except Exception as e:
    print(f"An error occurred: {e}")