In [22]:
import json
import random
import os
from tqdm import tqdm
from glob import glob
import re

def clean(string):
    string = re.sub('[^A-Za-z ]+', ' ', string.lower())
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string

def overlap(string1, string2):
    l = set([w for w in clean(string1).split() if len(w) > 2])
    r = set([w for w in clean(string2).split() if len(w) > 2])
    return len(l & r) / len(l)

In [38]:
files = glob('/home/husein/ssd3/gov.my/ultrachat-*.jsonl')
files = [f for f in files if 'glaive_coder_raw_text' not in f and 'lom' not in f and 'crossref' not in f]
files

['/home/husein/ssd3/gov.my/ultrachat-jurnaldbp.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-jurnaldbp-malay.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-maktabahalbakri.com.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-gov.my.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-astroawani-malay.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-textbooks.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-epenerbitan-malay.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-hansard-malay.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-ms-wikipedia.jsonl',
 '/home/husein/ssd3/gov.my/ultrachat-muftiwp.gov.my.texts.jsonl']

In [39]:
for f in files:
    new_f = f'pair-{os.path.split(f)[-1]}'
    print(f, new_f)
    data = []
    with open(f) as fopen:
        for l in fopen:
            data.append(json.loads(l))
    
    indices = {i for i in range(len(data))}
    pairs = []
    for i in tqdm(range(len(data))):
        l = data[i]
        context = l[0]['content']
            
        l = l[1:]
        inputs = []
        for no, r in enumerate(l):
            if r['content'] is None:
                break

            inputs.append(r['content'])

        if len(inputs) % 2 != 0:
            inputs = inputs[:-1]

        for k in range(0, len(inputs), 2):
            
            sampled = random.sample(list(indices - {i}), 100)
            neg = []
            for n in sampled:
                try:
                    overlapped = overlap(inputs[k], data[n][0]['content'])
                    if overlapped < 0.2:
                        neg.append(data[n][0]['content'])
                    if len(neg) >= 3:
                        break
                except Exception as e:
                    print(e)
                    pass

            pairs.append({
                'query': inputs[k],
                'pos': [context, inputs[k + 1]],
                'neg': neg
            })
            
    with open(new_f, 'w') as fopen:
        for l in pairs:
            if not len(l['pos']) or not len(l['neg']):
                continue
            fopen.write(f'{json.dumps(l)}\n')

/home/husein/ssd3/gov.my/ultrachat-jurnaldbp.jsonl pair-ultrachat-jurnaldbp.jsonl


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1734/1734 [00:41<00:00, 41.84it/s]


/home/husein/ssd3/gov.my/ultrachat-jurnaldbp-malay.jsonl pair-ultrachat-jurnaldbp-malay.jsonl


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 6440/6440 [01:17<00:00, 83.07it/s]


/home/husein/ssd3/gov.my/ultrachat-maktabahalbakri.com.jsonl pair-ultrachat-maktabahalbakri.com.jsonl


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3350/3350 [04:59<00:00, 11.18it/s]


/home/husein/ssd3/gov.my/ultrachat-gov.my.jsonl pair-ultrachat-gov.my.jsonl


 72%|███████████████████████████████████████████████████████████████████████████████▎                              | 7298/10128 [00:27<00:10, 277.67it/s]

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

 72%|███████████████████████████████████████████████████████████████████████████████▌                              | 7326/10128 [00:27<00:17, 155.78it/s]

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

 73%|███████████████████████████████████████████████████████████████████████████████▊                              | 7348/10128 [00:28<00:26, 104.77it/s]


division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by z

 73%|████████████████████████████████████████████████████████████████████████████████▎                             | 7391/10128 [00:28<00:20, 135.07it/s]

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10128/10128 [00:38<00:00, 262.37it/s]


/home/husein/ssd3/gov.my/ultrachat-astroawani-malay.jsonl pair-ultrachat-astroawani-malay.jsonl


100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 60198/60198 [03:52<00:00, 258.91it/s]


/home/husein/ssd3/gov.my/ultrachat-textbooks.jsonl pair-ultrachat-textbooks.jsonl


 31%|██████████████████████████████████▎                                                                          | 15684/49842 [01:27<03:20, 170.16it/s]

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 49842/49842 [04:37<00:00, 179.53it/s]


/home/husein/ssd3/gov.my/ultrachat-epenerbitan-malay.jsonl pair-ultrachat-epenerbitan-malay.jsonl


 18%|████████████████████                                                                                            | 818/4567 [00:06<00:31, 119.15it/s]

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

 25%|███████████████████████████▎                                                                                   | 1122/4567 [00:09<00:26, 127.92it/s]

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

 27%|█████████████████████████████▊                                                                                  | 1215/4567 [00:09<00:34, 98.03it/s]

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

 27%|██████████████████████████████                                                                                  | 1226/4567 [00:10<00:45, 73.39it/s]


division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by z

 52%|█████████████████████████████████████████████████████████▋                                                     | 2375/4567 [00:19<00:18, 120.27it/s]

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

 74%|██████████████████████████████████████████████████████████████████████████████████▍                            | 3390/4567 [00:27<00:09, 122.33it/s]

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

 79%|███████████████████████████████████████████████████████████████████████████████████████▍                       | 3595/4567 [00:28<00:08, 118.35it/s]

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

 85%|██████████████████████████████████████████████████████████████████████████████████████████████▍                | 3886/4567 [00:31<00:05, 122.75it/s]

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4567/4567 [00:36<00:00, 123.47it/s]


/home/husein/ssd3/gov.my/ultrachat-hansard-malay.jsonl pair-ultrachat-hansard-malay.jsonl


 41%|████████████████████████████████████████████▉                                                                 | 29671/72538 [12:10<15:43, 45.45it/s]

division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by zero
division by ze

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████| 72538/72538 [29:48<00:00, 40.55it/s]


/home/husein/ssd3/gov.my/ultrachat-ms-wikipedia.jsonl pair-ultrachat-ms-wikipedia.jsonl


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4408/4408 [00:15<00:00, 292.54it/s]


/home/husein/ssd3/gov.my/ultrachat-muftiwp.gov.my.texts.jsonl pair-ultrachat-muftiwp.gov.my.texts.jsonl


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 3834/3834 [00:26<00:00, 142.96it/s]


In [44]:
files = glob('pair-ultrachat-*.jsonl')
files

['pair-ultrachat-textbooks.jsonl',
 'pair-ultrachat-muftiwp.gov.my.texts.jsonl',
 'pair-ultrachat-epenerbitan-malay.jsonl',
 'pair-ultrachat-gov.my.jsonl',
 'pair-ultrachat-ms-wikipedia.jsonl',
 'pair-ultrachat-jurnaldbp.jsonl',
 'pair-ultrachat-astroawani-malay.jsonl',
 'pair-ultrachat-jurnaldbp-malay.jsonl',
 'pair-ultrachat-hansard-malay.jsonl',
 'pair-ultrachat-maktabahalbakri.com.jsonl']

In [42]:
with open('pair-ultrachat-textbooks.jsonl') as fopen:
    for l in fopen:
        l = json.loads(l)
        break

In [45]:
from huggingface_hub import HfApi
api = HfApi()

In [46]:
for f in files:
    print(f)
    api.upload_file(
        path_or_fileobj=f,
        path_in_repo=f,
        repo_id='mesolitica/instructions-pair-mining',
        repo_type='dataset',
    )

pair-ultrachat-textbooks.jsonl


pair-ultrachat-textbooks.jsonl:   0%|          | 0.00/6.47G [00:00<?, ?B/s]

pair-ultrachat-muftiwp.gov.my.texts.jsonl


pair-ultrachat-muftiwp.gov.my.texts.jsonl:   0%|          | 0.00/501M [00:00<?, ?B/s]

pair-ultrachat-epenerbitan-malay.jsonl


pair-ultrachat-epenerbitan-malay.jsonl:   0%|          | 0.00/433M [00:00<?, ?B/s]

pair-ultrachat-gov.my.jsonl


pair-ultrachat-gov.my.jsonl:   0%|          | 0.00/864M [00:00<?, ?B/s]

pair-ultrachat-ms-wikipedia.jsonl


pair-ultrachat-ms-wikipedia.jsonl:   0%|          | 0.00/178M [00:00<?, ?B/s]

pair-ultrachat-jurnaldbp.jsonl


pair-ultrachat-jurnaldbp.jsonl:   0%|          | 0.00/307M [00:00<?, ?B/s]

pair-ultrachat-astroawani-malay.jsonl


pair-ultrachat-astroawani-malay.jsonl:   0%|          | 0.00/1.77G [00:00<?, ?B/s]

pair-ultrachat-jurnaldbp-malay.jsonl


pair-ultrachat-jurnaldbp-malay.jsonl:   0%|          | 0.00/745M [00:00<?, ?B/s]

pair-ultrachat-hansard-malay.jsonl


pair-ultrachat-hansard-malay.jsonl:   0%|          | 0.00/3.14G [00:00<?, ?B/s]

pair-ultrachat-maktabahalbakri.com.jsonl


pair-ultrachat-maktabahalbakri.com.jsonl:   0%|          | 0.00/264M [00:00<?, ?B/s]