In [1]:
# !wget https://huggingface.co/datasets/malaysia-ai/dedup-text-dataset/resolve/main/news.jsonl

In [2]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py

In [3]:
import json
import mp
import random
import os
from tqdm import tqdm

In [4]:
data = []
with open('news.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        data.append(l)

2839348it [00:34, 81675.21it/s] 


In [5]:
parsed = []
for d in tqdm(data):
    if '   ' in d:
        splitted = d.split('   ')
    elif '\n' in d:
        splitted = d.split('\n')
        if len(splitted[0].split()) > 50:
            continue
    title = splitted[0]
    body = '\n'.join(splitted[1:])
    parsed.append({
        'title': title,
        'body': body,
    })

100%|██████████| 2839348/2839348 [00:22<00:00, 127296.33it/s]


In [6]:
import re

def clean(string):
    string = re.sub('[^A-Za-z ]+', ' ', string.lower())
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string

In [7]:
ranged = range(len(parsed))

In [8]:
parsed[0]['title']

'Elon Musk Wants To Buy Manchester United Football Club'

In [9]:
def overlap(string1, string2):
    l = set([w for w in clean(string1).split() if len(w) > 2])
    r = set([w for w in clean(string2).split() if len(w) > 2])
    return len(l & r) / len(l)

overlap(parsed[0]['title'], parsed[0]['body'])

0.875

In [10]:
overlap(parsed[0]['title'], parsed[1]['body'])

0.0

In [11]:
sampled = random.sample(ranged, 100)
negs = []
for s in sampled:
    overlapped = overlap(parsed[0]['title'], parsed[s]['body'])
    if overlapped < 0.1:
        negs.append(parsed[s]['body'])
    if len(negs) >= 5:
        break
        
negs

['Stihl products at promotional price.\nSIBU: KTS Trading Sdn Bhd is holding a three-day ‘Customers Appreciation Day’ between 8am and 1pm until Dec 30 at KTS headquarters in Market Road here.\nAmong the items on promotion during this event are Stihl chainsaws, brush cutters, high pressure cleaners, battery shrub shears and lawn mowers.\nThere are also food items produced by KTS’s subsidiaries such as Daddy instant noodles, Royal B honey, Sabisco cracker sandwich and Wanfa fish snacks.\nThe annual event, which began yesterday is being held as a show of appreciation for its customers.',
 ' PUTRAJAYA, Aug 7 — Tourism, Arts and Culture Minister Datuk Mohamaddin Ketapi confirmed the ministry’s secretary-general\xa0(KSU) Datuk Isham Ishak\xa0was called to testify to assist the investigation of the Malaysian Anti-Corruption Commission (MACC) yesterday and today.\n However, he said he had not been told whether the ‘Datuk’ had been called in for an alleged abuse of power involving more than RM9

In [12]:
!mkdir mining-news

mkdir: cannot create directory ‘mining-news’: File exists


In [13]:
def loop(rows):
    rows, index = rows
    for i in tqdm(range(len(rows))):
        filename = os.path.join('mining-news', f'{i}-{index}.json')
        if os.path.exists(filename):
            continue
        
        sampled = random.sample(ranged, 100)
        negs = []
        for s in sampled:
            try:
                overlapped = overlap(rows[i]['title'], parsed[s]['body'])
            except:
                continue
            if overlapped < 0.1:
                negs.append(parsed[s]['body'])
            if len(negs) >= 5:
                break
        
        rows[i]['negs'] = negs
        with open(filename, 'w') as fopen:
            json.dump(rows[i], fopen)

In [14]:
loop((parsed[:100],0))

100%|██████████| 100/100 [00:00<00:00, 23189.61it/s]


In [15]:
mp.multiprocessing(parsed, loop, cores = 20, returned = False)

100%|██████████| 141899/141899 [00:00<00:00, 183099.97it/s]
100%|██████████| 141899/141899 [00:21<00:00, 6491.00it/s]  
100%|██████████| 141899/141899 [00:00<00:00, 216054.83it/s]
100%|██████████| 141899/141899 [00:00<00:00, 225560.41it/s]
 98%|█████████▊| 139506/141899 [00:00<00:00, 224674.07it/s]
100%|██████████| 141899/141899 [00:00<00:00, 249720.05it/s]
 98%|█████████▊| 139509/141899 [00:00<00:00, 280321.32it/s]
100%|██████████| 141899/141899 [00:00<00:00, 295238.69it/s]
100%|██████████| 141899/141899 [00:00<00:00, 295879.80it/s]
100%|██████████| 16/16 [00:00<00:00, 125672.03it/s]97it/s]
100%|██████████| 141899/141899 [00:00<00:00, 290359.20it/s]
100%|██████████| 141899/141899 [01:25<00:00, 1656.96it/s] 
100%|██████████| 141899/141899 [01:34<00:00, 1503.09it/s]
100%|██████████| 141899/141899 [01:25<00:00, 1653.44it/s]
100%|██████████| 141899/141899 [01:29<00:00, 1592.42it/s]
100%|██████████| 141899/141899 [01:53<00:00, 1249.73it/s]
100%|██████████| 141899/141899 [02:43<00:00, 868.8

In [16]:
from glob import glob

In [17]:
files = glob('mining-news/*.json')
len(files)

2837996

In [18]:
!df -h

Filesystem      Size  Used Avail Use% Mounted on
overlay         124G   37G   88G  30% /
tmpfs            64M     0   64M   0% /dev
/dev/sdc        984G  494G  490G  51% /home/ubuntu
/dev/sdd        9.8G  130M  9.7G   2% /dev/shm
/dev/root       124G   37G   88G  30% /etc/hosts
tmpfs           205G   12K  205G   1% /run/secrets/kubernetes.io/serviceaccount
tmpfs           109G   12K  109G   1% /proc/driver/nvidia
tmpfs            44G   73M   44G   1% /run/nvidia-persistenced/socket
tmpfs           109G     0  109G   0% /proc/acpi
tmpfs           109G     0  109G   0% /proc/scsi
tmpfs           109G     0  109G   0% /sys/firmware


In [None]:
with open('mining-news.jsonl', 'w') as fopen_l:
    for f in tqdm(files):
        try:
            with open(f) as fopen:
                data = json.load(fopen)
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()
        except:
            pass

 38%|███▊      | 1069542/2837996 [11:27<41:45, 705.74it/s]  

In [24]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='mining-news.jsonl',
    path_in_repo='mining-news.jsonl',
    repo_id='mesolitica/title-context-pair',
    repo_type='dataset',
)

mining-news.jsonl:   0%|          | 0.00/33.7G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/title-context-pair/commit/5e2dd9231dc471920c4ade9666dba40299e52215', commit_message='Upload mining-news.jsonl with huggingface_hub', commit_description='', oid='5e2dd9231dc471920c4ade9666dba40299e52215', pr_url=None, pr_revision=None, pr_num=None)