In [1]:
from glob import glob
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
# NOTE: this rebinds the name `json` to orjson, shadowing the stdlib import above.
# Everything below therefore uses orjson semantics: `json.dumps` returns bytes,
# which is why records are written with `json.dumps(d).decode()`.
import orjson as json
import random

In [2]:
# Load the source records: one JSON object per line of the JSONL file.
data = []
n_skipped = 0

with open('iium-confession-0.jsonl') as fopen:
    for x in tqdm(fopen):
        try:
            data.append(json.loads(x))
        except json.JSONDecodeError:
            # Best-effort load: a malformed line is skipped, but it is counted
            # rather than silently discarded, and unrelated errors
            # (KeyboardInterrupt, MemoryError, ...) are no longer swallowed.
            n_skipped += 1

if n_skipped:
    print(f'skipped {n_skipped} malformed line(s)')

28180it [00:03, 8063.84it/s]


In [3]:
# Split every record into two parallel lists: its embedding under 'v'
# and its raw text under 'text'. Position i in both lists refers to data[i].
vectors = [record['v'] for record in data]
texts = [record['text'] for record in data]

In [4]:
len(vectors)

28180

In [6]:
from scipy.spatial import KDTree
import numpy as np

In [7]:
# Stack the per-record embedding lists into a single array; row i of `concat`
# is the embedding of texts[i]. The shape is displayed as a sanity check.
concat = np.array(vectors)
concat.shape

(28180, 1536)

In [8]:
%%time

# Index every embedding row for nearest-neighbour lookups.
# NOTE(review): the embeddings are high-dimensional (see concat.shape); SciPy's
# documentation warns that KD-trees give little speed-up over brute force in
# high dimensions - confirm this is acceptable for the full-neighbour queries below.
kd_tree = KDTree(concat, leafsize = 40)

CPU times: user 312 ms, sys: 10.5 ms, total: 322 ms
Wall time: 320 ms


In [9]:
%%time

# Query with k = number of rows, i.e. ask for *every* point as a neighbour of
# row 0, so both the closest and the farthest rows are available.
# `dist` holds the distances and `ind` the matching row indices.
dist, ind = kd_tree.query(concat[0], k=len(concat))

CPU times: user 42.7 ms, sys: 2.46 ms, total: 45.2 ms
Wall time: 43.9 ms


In [10]:
dist

array([0.        , 0.45700824, 0.4892435 , ..., 1.24501397, 1.25687754,
       1.26333354])

In [26]:
# Demo of the mining rule on row 0: neighbours closer than 0.5 count as
# positives, neighbours farther than 1 count as negatives.
# The query row itself has distance 0, so it appears among the positives here.
# NOTE(review): 0.5 and 1 are the same values as lower_bound / upper_bound
# used by `loop` further down - keep them in sync.
pos_indices = [k for k in ind[dist < 0.5]]
neg_indices = [k for k in ind[dist > 1]]
pos_indices

[0, 24639, 2460, 24066]

In [25]:
neg_indices

[]

In [21]:
import pickle
import os

class Pointer:
    """
    Minimal on-disk checkpoint that remembers one integer position.

    `index` starts at -1 (nothing recorded yet). Call `load()` to restore a
    previously saved position, and `increment()` (or set `index` and call
    `_save()`) to persist progress.

    Parameters
    ----------
    filename : str
        Path of the pickle file that stores the position.
    """

    def __init__(self, filename):
        self.filename = filename
        # -1 means "no item has been recorded yet".
        self.index = -1

    def _save(self):
        """Persist `self.index` to `self.filename`."""
        # Write to a sibling temp file and swap it into place with os.replace,
        # so an interruption in the middle of a write can never leave a
        # truncated pickle behind (which would make `load()` fail on resume).
        tmp_filename = f'{self.filename}.tmp'
        with open(tmp_filename, 'wb') as fopen:
            pickle.dump(self.index, fopen)
        os.replace(tmp_filename, self.filename)

    def increment(self):
        """Advance the position by one and persist it immediately."""
        self.index += 1
        self._save()

    def load(self):
        """Restore `self.index` from disk; a missing file leaves it unchanged."""
        if not os.path.exists(self.filename):
            return
        # SECURITY: pickle.load can execute arbitrary code. Only point this at
        # checkpoint files produced by `_save`, never at untrusted files.
        with open(self.filename, 'rb') as fopen:
            self.index = pickle.load(fopen)

In [23]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/5aa5257608b61e8fcc828e99fbd070d5ca7358e3/mp.py
import mp

def dedup(strings):
    """
    Drop case-insensitive duplicates while preserving order.

    Two strings are considered the same when their lowercase forms match;
    the first spelling encountered is the one that is kept.

    Parameters
    ----------
    strings : iterable of str

    Returns
    -------
    list of str
        The surviving strings, in order of first appearance.
    """
    first_seen = {}
    for candidate in strings:
        # setdefault only stores the value when the lowercase key is new,
        # and dicts keep insertion order, so first occurrences win.
        first_seen.setdefault(candidate.lower(), candidate)
    return list(first_seen.values())

dedup(['a', 'a'])

['a']

In [24]:
!rm -rf iium-confession-hard
!mkdir iium-confession-hard

In [29]:
lower_bound = 0.5
upper_bound = 1

def loop(data):
    """
    Worker: mine (query, positives, negatives) records for one chunk of rows.

    For every row index in the chunk, all rows are ranked by distance to it.
    Rows closer than `lower_bound` are positive candidates (at most 6 are
    sampled) and rows farther than `upper_bound` are negative candidates
    (at most 5 are sampled). Candidates equal to the query text, with text of
    length <= 1, or that are case-insensitive duplicates are dropped. A record
    is written only when both lists end up non-empty.

    Output is appended to `iium-confession-hard/{index}.jsonl`, one JSON
    object per line with keys 'query', 'pos' and 'neg'. Progress is
    checkpointed in `{that file}.pickle`, so a re-run skips positions up to
    the last one that was written.

    Parameters
    ----------
    data : tuple
        `(rows, index)`: `rows` is an indexable sequence of row indices into
        `concat` / `texts`, and `index` is the chunk id used in the file name.

    Notes
    -----
    Reads the module-level `kd_tree`, `concat`, `texts`, `lower_bound` and
    `upper_bound`. The checkpoint only advances when a record is written, so
    positions skipped after the last written record are re-evaluated on resume.
    """
    rows, index = data
    filename = f'iium-confession-hard/{index}.jsonl'
    pointer = Pointer(f'{filename}.pickle')
    pointer.load()

    # `with` guarantees the shard is closed even if a query or write raises;
    # the handle used to be left open for the lifetime of the worker.
    # The encoding is explicit because orjson emits UTF-8 and the shard
    # should not depend on the machine's locale.
    with open(filename, 'a', encoding='utf-8') as fopen:
        for n in tqdm(range(len(rows))):
            # Already handled in a previous run.
            if n <= pointer.index:
                continue

            x = rows[n]
            # k = number of rows: rank *every* row by distance to row x.
            dist, ind = kd_tree.query(concat[x], k=len(concat))

            query = texts[x]

            pos_indices = ind[dist < lower_bound].tolist()
            neg_indices = ind[dist > upper_bound].tolist()

            # Cap the candidate lists before filtering.
            if len(pos_indices) > 6:
                pos_indices = random.sample(pos_indices, 6)
            if len(neg_indices) > 5:
                neg_indices = random.sample(neg_indices, 5)

            pos = [texts[i] for i in pos_indices if texts[i] != query and len(texts[i]) > 1]
            pos = dedup(pos)
            if len(pos) == 0:
                continue

            neg = [texts[i] for i in neg_indices if texts[i] != query and len(texts[i]) > 1]
            neg = dedup(neg)
            if len(neg) == 0:
                continue

            d = {'query': query, 'pos': pos, 'neg': neg}
            # `json` is orjson here, so dumps() returns bytes.
            fopen.write(f'{json.dumps(d).decode()}\n')
            # Flush before checkpointing so the checkpoint never runs ahead
            # of what has been handed to the OS.
            fopen.flush()
            pointer.index = n
            pointer._save()

In [30]:
import mp

# Run `loop` over all row indices 0 .. len(data)-1 using the `mp` helper.
# NOTE(review): `mp.multiprocessing` comes from the gist downloaded above and is
# not visible here; presumably it splits the range into `cores` chunks and
# calls loop((chunk, chunk_index)) in separate processes - confirm against mp.py.
# returned = False: results are written to disk by `loop`, nothing is collected.
mp.multiprocessing(range(len(data)), loop, cores = 20, returned = False)

100%|██████████| 1409/1409 [05:39<00:00,  4.14it/s]
100%|██████████| 1409/1409 [05:50<00:00,  4.01it/s]
100%|██████████| 1409/1409 [05:52<00:00,  4.00it/s]
100%|██████████| 1409/1409 [06:01<00:00,  3.90it/s]
100%|██████████| 1409/1409 [06:07<00:00,  3.84it/s]
100%|██████████| 1409/1409 [06:08<00:00,  3.82it/s]
100%|██████████| 1409/1409 [06:10<00:00,  3.81it/s]
100%|██████████| 1409/1409 [06:18<00:00,  3.72it/s]
100%|██████████| 1409/1409 [06:19<00:00,  3.71it/s]
100%|██████████| 1409/1409 [06:22<00:00,  3.68it/s]
100%|██████████| 1409/1409 [06:25<00:00,  3.66it/s]
100%|██████████| 1409/1409 [06:25<00:00,  3.66it/s]
100%|██████████| 1409/1409 [06:25<00:00,  3.65it/s]
100%|██████████| 1409/1409 [06:32<00:00,  3.59it/s]
100%|██████████| 1409/1409 [06:32<00:00,  3.59it/s]
100%|██████████| 1409/1409 [06:40<00:00,  3.51it/s]
100%|██████████| 1409/1409 [06:44<00:00,  3.49it/s]
100%|██████████| 1409/1409 [06:47<00:00,  3.46it/s]
100%|██████████| 1409/1409 [06:56<00:00,  3.38it/s]
100%|███████

In [33]:
# Merge the per-worker shards into a single JSONL file, dropping any record
# with a blank query or an empty 'pos' / 'neg' list.
# Sorted so that the merged file does not depend on directory listing order.
files = sorted(glob('iium-confession-hard/*.jsonl'))

with open('iium-confession-hard.jsonl', 'w', encoding='utf-8') as fopen_l:
    for f in files:
        # Read raw bytes and let the JSON parser decode them. Decoding the
        # shards as ISO-8859-1 never fails, but it turns every non-ASCII
        # character into mojibake.
        with open(f, 'rb') as fopen:
            for l in tqdm(fopen):
                # Handle errors per line: one corrupt or truncated line is
                # reported and skipped instead of discarding the rest of the shard.
                try:
                    l = json.loads(l)
                    if not len(l['query'].strip()):
                        continue
                    if not len(l['pos']):
                        continue
                    if not len(l['neg']):
                        continue
                except Exception as e:
                    print(e)
                    continue
                # `json` is orjson, so dumps() returns bytes. Without .decode()
                # the f-string would write the b'...' repr, which is not valid JSON.
                fopen_l.write(f'{json.dumps(l).decode()}\n')

1149it [00:00, 2809.65it/s]
1128it [00:00, 2870.68it/s]
1137it [00:00, 2769.17it/s]
1143it [00:00, 2750.72it/s]
1141it [00:00, 2911.95it/s]
1143it [00:00, 2947.81it/s]
1125it [00:00, 2768.98it/s]
1126it [00:00, 2739.60it/s]
1132it [00:00, 2759.68it/s]
1125it [00:00, 2858.33it/s]
1131it [00:00, 2867.24it/s]
1129it [00:00, 2811.32it/s]
1109it [00:00, 2829.55it/s]
1123it [00:00, 2927.49it/s]
1152it [00:00, 3028.86it/s]
1143it [00:00, 2911.23it/s]
1145it [00:00, 2862.38it/s]
1123it [00:00, 2862.47it/s]
1133it [00:00, 2759.46it/s]
1120it [00:00, 2963.26it/s]


In [34]:
!wc -l iium-confession-hard.jsonl

22657 iium-confession-hard.jsonl
