### Mining dataset Using bge-large-en

In [1]:
from glob import glob
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import json
import random

### Load the JSONL file containing embeddings & texts

In [None]:
# Parse the JSONL dump one record per line. Lines that are not valid JSON
# (e.g. blank or truncated lines) are skipped on purpose, but they are counted
# so that silent data loss is visible. Only JSON decoding errors are tolerated;
# anything else (I/O errors, KeyboardInterrupt, ...) propagates.
data = []
n_malformed = 0

with open('bge-embedding/bge-large-en-embedding/facebook.jsonl') as fopen:
    for line in tqdm(fopen):
        try:
            data.append(json.loads(line))
        except json.JSONDecodeError:
            n_malformed += 1

if n_malformed:
    print(f'skipped {n_malformed} malformed line(s)')

In [None]:
# Number of records that were parsed from the JSONL file.
len(data)

**Dataset Format**

- v: the 1024-dimensional embedding vector generated by BAAI/bge-large-en
- text:
    - 'left': original text
    - 'en': translated English text
    - 'ms': translated Malay text

In [None]:
# Inspect a single record (keys 'v' and 'text' are the ones used below).
# NOTE(review): this displays the whole embedding list stored under 'v'.
data[1]

### SciPy KDTree for Computing Distances & Fast Nearest-Neighbor Lookup

In [None]:
# Split every record into its embedding and its text dict. Both lists keep the
# order of `data`, so position i in `vectors` and `texts` is the same record.
vectors = [record['v'] for record in data]
texts = [record['text'] for record in data]

In [None]:
from scipy.spatial import KDTree
import numpy as np

In [None]:
# Stack the per-record embedding lists into a single array, one row per record
# (assumes all vectors have the same length — TODO confirm).
concat = np.array(vectors)
# Display the array shape as a sanity check.
concat.shape

In [None]:
%%time

# Build the KD-tree index over all embedding rows. `leafsize` is the number of
# points at which SciPy switches to brute-force search inside a leaf.
# NOTE(review): KD-trees are known to lose their advantage over brute force in
# high dimensions; with embeddings this wide, confirm the index actually helps.
kd_tree = KDTree(concat, leafsize = 40)

In [None]:
# Peek at the first embedding row as a sanity check.
concat[0]

In [None]:
def convert_int64(obj):
    """
    ``default`` hook for ``json.dump`` / ``json.dumps`` that converts NumPy
    values into plain Python objects.

    Handles any NumPy integer (not only ``np.int64``), NumPy floats, NumPy
    booleans and NumPy arrays.

    Parameters
    ----------
    obj : object
        The value the ``json`` encoder could not serialise on its own.

    Returns
    -------
    int, float, bool or list
        The JSON-serialisable equivalent of ``obj``.

    Raises
    ------
    TypeError
        If ``obj`` is not a supported NumPy type (this is what the ``json``
        module expects from a ``default`` hook).
    """
    if isinstance(obj, np.integer):
        return int(obj)
    if isinstance(obj, np.floating):
        return float(obj)
    if isinstance(obj, np.bool_):
        return bool(obj)
    if isinstance(obj, np.ndarray):
        return obj.tolist()
    raise TypeError(f'Object of type {type(obj).__name__} is not JSON serializable')

### Generating Train Dataset

We generate one row per text key (`'left'`, `'en'`, `'ms'`), i.e. three rows for each record:

`{'query': <the record's text under one key>, 'pos': <texts (all keys) of the sampled positive neighbours, including the record's own other texts, excluding any text identical to the query>, 'neg': <texts (all keys) of the sampled negative neighbours, de-duplicated case-insensitively>}`


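- Positive text pair threshold: embedding distance <= 10
- Negative text pair threshold: embedding distance > 15

Distances are the Euclidean distances between embeddings returned by the KD-tree query; neighbours whose distance falls between the two thresholds are not used.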

In [None]:
# Number of rows whose neighbor lookups are submitted to the thread pool per batch.
batch_size = 10_000
# Text keys, taken from the first record
# (assumes every record has the same keys — TODO confirm).
keys = list(texts[0].keys())

In [None]:
import time

In [None]:
def compute_neighbors(x, pos_threshold=10, neg_threshold=15, max_pairs=5):
    """
    Build the training rows for the record at index ``x``.

    The embedding of record ``x`` is queried against the whole KD-tree. Records
    whose distance is ``<= pos_threshold`` are positive candidates, records
    whose distance is ``> neg_threshold`` are negative candidates. One row is
    produced per key in ``keys``; positives and negatives are re-sampled for
    every key.

    Parameters
    ----------
    x : int
        Row index into ``concat`` / ``texts``.
    pos_threshold : float, default 10
        Maximum distance for a record to count as a positive.
    neg_threshold : float, default 15
        Distance a record must exceed to count as a negative.
    max_pairs : int, default 5
        Maximum number of positive records and of negative records used per
        row. Must be >= 1.

    Returns
    -------
    list of dict
        One ``{'query': str, 'pos': list, 'neg': list}`` dict per key.
        ``pos`` holds every text of the chosen positive records that differs
        from the query; ``neg`` holds the texts of the chosen negative records
        with case-insensitive duplicates removed.
    """
    # k=len(concat) asks for every point, so dist/ind cover the whole dataset.
    # NOTE(review): `workers` parallelises over query points; with a single
    # query point it presumably brings no speed-up — confirm against SciPy docs.
    dist, ind = kd_tree.query(concat[x], k=len(concat), workers = 100)
    # SciPy squeezes the result to scalars when k == 1 (single-row dataset);
    # restore 1-D arrays so the boolean masks below keep working.
    dist, ind = np.atleast_1d(dist), np.atleast_1d(ind)

    # The candidate pools do not depend on the key, so compute them once.
    pos_pool = ind[dist <= pos_threshold].tolist()
    neg_pool = ind[dist > neg_threshold].tolist()
    # Positive candidates other than the record itself. Sampling from this pool
    # and then appending x guarantees x is present exactly once (sampling from
    # the full pool could pick x and then add it a second time).
    other_pos = [i for i in pos_pool if i != x]

    result = []

    for key in keys:
        query = texts[x][key]

        neg_indices = neg_pool
        if len(neg_indices) > max_pairs:
            neg_indices = random.sample(neg_indices, max_pairs)

        pos_indices = pos_pool
        if len(pos_indices) > max_pairs:
            pos_indices = random.sample(other_pos, max_pairs - 1)
            pos_indices.append(x)

        # Every text (all keys) of the positive records, minus the query itself.
        pos = [value for i in pos_indices for value in texts[i].values() if value != query]

        # Every text of the negative records, keeping the first occurrence of
        # each text when compared case-insensitively.
        unique_neg = []
        seen_lower = set()
        for i in neg_indices:
            for value in texts[i].values():
                lowered = value.lower()
                if lowered not in seen_lower:
                    seen_lower.add(lowered)
                    unique_neg.append(value)

        result.append({'query':query,'pos':pos,'neg':unique_neg})

    return result



In [None]:
# Compute the training rows batch by batch and append them to the output file
# as JSON lines. The file is opened once for the whole run instead of once per
# row, and flushed after every batch so finished batches reach disk.
# NOTE: the file is opened in append mode, so re-running this cell adds the
# rows again to an existing 'facebook-train-dataset.jsonl'.
with open('facebook-train-dataset.jsonl', 'a') as f:
    for l in range(0, len(concat), batch_size):
        batch_end = min(l + batch_size, len(concat))

        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = [executor.submit(compute_neighbors, idx) for idx in range(l, batch_end)]

            # Consume in submission order so output rows follow the dataset order.
            for future in tqdm(futures):
                result = future.result()
                if result:
                    for row in result:
                        json.dump(row, f, default = convert_int64)
                        f.write('\n')

        f.flush()