In [2]:
# !wget https://huggingface.co/datasets/mesolitica/mixtral-malaysian-abstractive-summarization/resolve/main/mixtral-malaysian-abstractive-summarization.jsonl

In [17]:
# !wget https://huggingface.co/datasets/mesolitica/semisupervised-abstractive-summarization-ms-news/resolve/main/populate-news.json.semisupervised

In [18]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py

In [37]:
import json
import mp
import random
import os
from tqdm import tqdm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [14]:
# Read the Mixtral summarization corpus (one JSON object per line) and keep
# only the three fields used downstream: `text`, `summary` and `summary_ms`.
with open('mixtral-malaysian-abstractive-summarization.jsonl') as fopen:
    data = [
        {
            'text': record['text'],
            'summary': record['summary'],
            'summary_ms': record['summary_ms'],
        }
        for record in map(json.loads, fopen)
    ]

In [15]:
# Append the semi-supervised news corpus to `data`. These records have no
# `summary` (stored as None); their `semisupervised-summaries` entries are
# joined with newlines to form `summary_ms`.
# Note: this extends the existing list, so re-running the cell without
# re-running the previous one appends the same records again.
with open('populate-news.json.semisupervised') as fopen:
    data.extend(
        {
            'text': record['text'],
            'summary': None,
            'summary_ms': '\n'.join(record['semisupervised-summaries']),
        }
        for record in map(json.loads, fopen)
    )

In [20]:
import re

def clean(string):
    """Normalize text to lowercase ASCII words separated by single spaces.

    The input is lowercased and every maximal run of the letters a-z is kept
    as a word; everything else (digits, punctuation, whitespace, non-ASCII
    characters) acts as a separator. Returns '' when no letters are present.
    """
    words = re.findall('[a-z]+', string.lower())
    return ' '.join(words)

In [40]:
def overlap(string1, string2):
    """Return the fraction of string1's distinct words that also occur in string2.

    Both strings are normalized with `clean` and only words longer than two
    characters are considered. The score is asymmetric: it is divided by the
    number of distinct words in string1.

    Raises:
        ZeroDivisionError: if string1 contains no word longer than two characters.
    """
    def content_words(text):
        return {token for token in clean(text).split() if len(token) > 2}

    reference = content_words(string1)
    candidate = content_words(string2)
    return len(reference.intersection(candidate)) / len(reference)

# Sanity check: share of the first record's `summary_ms` words (longer than two
# characters) that also appear in that record's own `text`.
overlap(data[0]['summary_ms'], data[0]['text'])

0.208955223880597

In [41]:
# Index range over every loaded record; `random.sample` draws candidate
# negative indices from it in the cells below.
ranged = range(len(data))

In [42]:
# Try the negative-mining idea on the first record: draw 100 random records and
# keep up to 5 whose text shares less than 10% of the summary's words.
sampled = random.sample(ranged, 100)
reference_summary = data[0]['summary_ms']
negs = []
for candidate_index in sampled:
    if len(negs) >= 5:
        break
    candidate_text = data[candidate_index]['text']
    if overlap(reference_summary, candidate_text) < 0.1:
        negs.append(candidate_text)

negs

['NUR-SULTAN 10 Jun - Kazakhstan memilih calon Presiden yang menjadi pilihan bekas Presiden Nursultan Nazarbayev dengan memperolehi 70.8 peratus undi, hari ini.\n\nSuruhanjaya Pilihan Raya Sentral berkata, Kassym-Jomart mengatasi calon pembangkang, Amirzhan Kosanov yang hanya mendapat 16.2 peratus undi.\n\nKemenangan Kassym-Jomart itu tidak diragukan lagi selepas beliau mendapat restu daripada Nursultan yang berkuasa dan memimpin negara tersebut sejak tiga dekad lalu.\n\nBagaimanapun, undian semalam menimbulkan protes terbesar sejak tiga tahun lalu yang mana seruan boikot pilihan raya dikatakan telah diatur. - AFP',
 'The relevant authorities in JB must not “do nothing” but must  use the controversy over Zachas’ “high-crime” mural to launch a new initiative to wipe out JB’s reputation as the nation’s crime capital   It is sad and tragic that while “unusual creativeness” are being exhibited to remove the “sting” of Lithuanian-born street artist Ernest Zacharevic’s “high crime” mural in 

In [43]:
# Create the directory that receives one JSON file per mined row.
# `exist_ok=True` keeps this cell re-runnable (a plain `mkdir` errors when the
# directory already exists).
os.makedirs('mining-summarization', exist_ok=True)

In [44]:
def loop(rows, num_candidates=100, max_negs=5, max_overlap=0.1,
         out_dir='mining-summarization'):
    """Mine random low-overlap negative texts for each row and save one JSON file per row.

    Args:
        rows: a ``(rows, index)`` tuple. ``rows`` is a list of record dicts with
            at least a ``summary_ms`` key; ``index`` identifies this chunk and is
            used in the output file names.
        num_candidates: how many random records from the module-level ``data``
            are examined per row (capped at ``len(ranged)``).
        max_negs: stop collecting once this many negatives were found.
        max_overlap: a candidate counts as a negative when
            ``overlap(summary_ms, candidate_text)`` is strictly below this value.
        out_dir: directory that receives ``{i}-{index}.json`` files.

    Side effects:
        Adds a ``negs`` list to every processed row dict (in place) and writes the
        row to ``out_dir``. Rows whose output file already exists are skipped, so
        an interrupted run can be resumed.
    """
    rows, index = rows
    for i in tqdm(range(len(rows))):
        filename = os.path.join(out_dir, f'{i}-{index}.json')
        if os.path.exists(filename):
            continue

        # Cap the sample size so small datasets do not raise ValueError.
        sampled = random.sample(ranged, min(num_candidates, len(ranged)))
        negs = []
        for s in sampled:
            try:
                overlapped = overlap(rows[i]['summary_ms'], data[s]['text'])
            except (ZeroDivisionError, AttributeError, TypeError):
                # ZeroDivisionError: the summary has no usable words.
                # AttributeError/TypeError: summary or text is None / not a string.
                # Anything else (including KeyboardInterrupt) is a real problem
                # and is allowed to propagate.
                continue
            if overlapped < max_overlap:
                negs.append(data[s]['text'])
            if len(negs) >= max_negs:
                break

        rows[i]['negs'] = negs
        # Write to a temporary name and rename afterwards: an interrupted write
        # must not leave a truncated file that the exists() check above would
        # treat as finished on the next run.
        tmp_filename = filename + '.tmp'
        with open(tmp_filename, 'w') as fopen:
            json.dump(rows[i], fopen)
        os.replace(tmp_filename, filename)

In [46]:
# Smoke-test the mining function in-process on the first 100 records (chunk
# index 0) before running it over the whole dataset.
loop((data[:100],0))

100%|██████████| 100/100 [00:00<00:00, 277.26it/s]


In [48]:
# Run the mining over the whole dataset with 20 cores via the downloaded `mp` helper.
# NOTE(review): presumably `mp.multiprocessing` splits `data` into chunks and calls
# `loop((chunk, chunk_index))` in each worker process — confirm against mp.py.
# `loop` reads the module-level `data` and `ranged`, so they must be defined
# before this cell runs.
mp.multiprocessing(data, loop, cores = 20, returned = False)

100%|██████████| 13865/13865 [00:56<00:00, 245.57it/s]
100%|██████████| 11/11 [00:00<00:00, 282.05it/s]6it/s]
100%|██████████| 13865/13865 [00:58<00:00, 238.08it/s]
100%|██████████| 13865/13865 [01:00<00:00, 227.55it/s]
100%|██████████| 13865/13865 [01:00<00:00, 228.34it/s]
100%|██████████| 13865/13865 [00:59<00:00, 233.11it/s]
100%|██████████| 13865/13865 [01:00<00:00, 230.26it/s]
100%|██████████| 13865/13865 [00:57<00:00, 242.65it/s]
100%|██████████| 13865/13865 [01:00<00:00, 230.91it/s]
100%|██████████| 13865/13865 [01:01<00:00, 224.80it/s]
100%|██████████| 13865/13865 [00:57<00:00, 239.72it/s]
100%|██████████| 13865/13865 [00:56<00:00, 245.50it/s]
100%|██████████| 13865/13865 [00:57<00:00, 239.51it/s]
100%|██████████| 13865/13865 [00:58<00:00, 235.80it/s]
100%|██████████| 13865/13865 [01:04<00:00, 214.36it/s]
100%|██████████| 13865/13865 [00:58<00:00, 237.18it/s]
100%|██████████| 13865/13865 [01:02<00:00, 221.28it/s]
100%|██████████| 13865/13865 [01:03<00:00, 219.51it/s]
100%|█████

In [49]:
from glob import glob

In [50]:
# Collect the per-row result files. glob() returns paths in arbitrary
# (filesystem-dependent) order, so sort them to make the merged output
# reproducible between runs.
files = sorted(glob('mining-summarization/*.json'))
len(files)

277311

In [51]:
# Merge every per-row JSON file into a single JSON Lines file.
# Each file is parsed and re-serialized, which validates it and guarantees one
# record per line. The parsed object is bound to `record` (not `data`) so the
# module-level dataset list is not clobbered by this cell.
skipped = []
with open('mining-summarization.jsonl', 'w') as fopen_l:
    for f in tqdm(files):
        try:
            with open(f) as fopen:
                record = json.load(fopen)
        except (OSError, ValueError):
            # Unreadable or truncated/invalid JSON (JSONDecodeError and
            # UnicodeDecodeError are ValueErrors): skip it but keep track.
            skipped.append(f)
            continue
        fopen_l.write(f'{json.dumps(record)}\n')

if skipped:
    print(f'skipped {len(skipped)} unreadable file(s), e.g. {skipped[:3]}')

100%|██████████| 277311/277311 [01:08<00:00, 4059.73it/s]


In [None]:
from huggingface_hub import HfApi

# Upload the merged JSON Lines file to the Hugging Face dataset repository,
# keeping the same file name inside the repo.
merged_filename = 'mining-summarization.jsonl'
api = HfApi()
api.upload_file(
    repo_type='dataset',
    repo_id='mesolitica/title-context-pair',
    path_in_repo=merged_filename,
    path_or_fileobj=merged_filename,
)

mining-summarization.jsonl:   0%|          | 0.00/4.65G [00:00<?, ?B/s]