In [1]:
# !wget https://huggingface.co/datasets/mesolitica/chatgpt-noisy-translation-iium-confession/resolve/main/processed-iium-confession.jsonl

In [2]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    'mesolitica/translation-t5-small-standard-bahasa-cased-v2',
    use_fast=False
)
model = T5ForConditionalGeneration.from_pretrained(
    'mesolitica/translation-t5-small-standard-bahasa-cased-v2'
)
all_special_ids = [0, 1, 2]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
_ = model.cuda()

In [1]:
import json
import re
import torch
from tqdm import tqdm

def clean(string):
    string = re.sub(
        'http\\S+|www.\\S+',
        '',
        ' '.join(
            [
                word
                for word in string.split()
                if word.find('#') < 0 and word.find('@') < 0
            ]
        ),
    )
    return string

In [2]:
ls = []
with open('processed-iium-confession.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        l['cleaned'] = clean(l['left'])
        ls.append(l)

333758it [00:03, 101393.92it/s]


In [7]:
ls[0]

{'left': 'Assalamualaikum dan salam sejahtera. Hai geng. J is back. Sihat semua? Family sihat? Aku doakan semua sihat-sihat belaka. Terima kasih support aku. Aku cuma nak cerita kisah aku dan kawan-kawan aku. Kerja di ladang ni dah lama di anak tirikan oleh masyarakat. Sedih juga aku. Kami duduk hutan sawit ni tak kacau orang pun. Tapi pandangan masyarakat tu, hurmmm... Entahla.',
 'en': "Assalamualaikum and greetings. Hey guys, J is back. Are you all well? Is your family well? I pray that everyone is in good health. Thank you for your support. I just want to share my story and my friends' story. We have been working in this plantation for a long time and have been neglected by the community. It's sad. We live in this oil palm forest and don't bother anyone. But the community's perception, well... I don't know.",
 'ms': 'Assalamualaikum dan salam sejahtera. Hai geng. J kembali. Semua sihat? Keluarga sihat? Saya berdoa agar semua sihat-sihat belaka. Terima kasih atas sokongan anda. Saya

In [8]:
# %%time

# s = 'Dalam hujan lebat some more'
# input_ids = tokenizer.encode(f'terjemah ke pasar Melayu: {s}', return_tensors = 'pt')
# outputs = model.generate(input_ids.cuda(), max_length = 100, do_sample=True,
#     top_k=50,
#     top_p=0.95,
#     num_return_sequences=5, temperature = 1.0, output_scores = True, return_dict_in_generate = True)
# seqs = []
# for o in outputs.sequences:
#     o = tokenizer.decode([i for i in o if i not in all_special_ids], 
#                          spaces_between_special_tokens = False)
#     seqs.append(o)
# seqs

In [9]:
# !rm -rf iium-confessions-predict
!mkdir iium-confessions-predict

mkdir: cannot create directory ‘iium-confessions-predict’: File exists


In [10]:
pairs = {
    'en': 'Manglish',
    'ms': 'pasar Melayu'
}

In [None]:
for i in tqdm(range(len(ls))):
    filename = os.path.join('iium-confessions-predict', f'{i}.json')
    if os.path.exists(filename):
        continue
        
    results = {'original': ls[i]}
        
    for lang, prefix in pairs.items():
    
        if ls[i][lang] and len(ls[i][lang]) > 5:
            s = ls[i][lang]
            input_ids = tokenizer.encode(f'terjemah ke {prefix}: {s}', return_tensors = 'pt')
            outputs = model.generate(input_ids.cuda(), max_length = 512, do_sample=True,
                top_k=50,
                top_p=0.95,
                num_return_sequences=5, temperature = 0.7, output_scores = True, return_dict_in_generate = True)
            logits = torch.stack(outputs.scores, dim=1)
            score = logits.max(dim = -1).values.mean(dim = -1).detach().cpu().numpy().tolist()
            seqs = []
            for o in outputs.sequences:
                o = tokenizer.decode([i for i in o if i not in all_special_ids], 
                                     spaces_between_special_tokens = False)
                seqs.append(o)
            
            results[lang] = {
                'score': score,
                'sequences': seqs
            }
        
    with open(filename, 'w') as fopen:
        json.dump(results, fopen)

 41%|██████████████████████████████████▊                                                  | 136934/333758 [37:53:25<44:19:27,  1.23it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 45%|█████████████████████████████████████▍                                              | 148768/333758 [43:26:21<124:55:03,  2.43s/it]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 45%|██████████████████████████████████████▍                                  

In [3]:
from glob import glob
import json

In [4]:
files = glob('iium-confessions-predict/*.json')
len(files)

150849

In [6]:
predicted = []
for f in tqdm(files):
    try:
        with open(f) as fopen:
            l = json.load(fopen)
        predicted.append(l)
    except:
        pass

100%|████████████████████████████████████████████████████████████████████████████████████████| 150849/150849 [00:08<00:00, 17970.56it/s]


In [7]:
predicted[0]

{'original': {'left': 'Second, baca Al-quran, mathurat and such things. Kalau tiba tiba terasa nafsu tu naik, cepat cepat ambik wudhu and grab Al-quran. Baca terjemahan dia sekali.',
  'en': 'Second, read the Quran, mathurat and other similar things. If suddenly you feel your desires rising, quickly perform ablution and grab the Quran. Read its translation once.',
  'ms': 'Kedua, bacalah Al-Quran, mathurat dan perkara-perkara yang serupa. Jika tiba-tiba anda merasa nafsu anda meningkat, segera lakukan wuduk dan ambil Al-Quran. Baca terjemahannya sekali.',
  'cleaned': 'Second, baca Al-quran, mathurat and such things. Kalau tiba tiba terasa nafsu tu naik, cepat cepat ambik wudhu and grab Al-quran. Baca terjemahan dia sekali.'},
 'en': {'score': [24.915721893310547,
   27.981647491455078,
   27.686424255371094,
   27.345184326171875,
   26.930335998535156],
  'sequences': ['second lah baca quran mathurat bc bc bc bc bc bc bc bc bc bc bc bc',
   'kedua baca quran mathurat dan lain lain ka

In [8]:
with open('noisy-augmentation-iium-confession.jsonl', 'w') as fopen:
    for p in tqdm(predicted):
        fopen.write(f'{json.dumps(p)}\n')

100%|████████████████████████████████████████████████████████████████████████████████████████| 150846/150846 [00:01<00:00, 77151.93it/s]


In [9]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='noisy-augmentation-iium-confession.jsonl',
    path_in_repo='noisy-augmentation-iium-confession.jsonl',
    repo_id='mesolitica/noisy-augmentation',
    repo_type='dataset',
)



noisy-augmentation-iium-confession.jsonl:   0%|          | 0.00/579M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/noisy-augmentation/commit/4d4fefc23e479f466673f47726eb3d1565f5888f', commit_message='Upload noisy-augmentation-iium-confession.jsonl with huggingface_hub', commit_description='', oid='4d4fefc23e479f466673f47726eb3d1565f5888f', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
ranged = range(len(ls))

In [11]:
def overlap(string1, string2):
    l = set([w for w in clean(string1).split() if len(w) > 2])
    r = set([w for w in clean(string2).split() if len(w) > 2])
    return len(l & r) / len(l)

overlap(ls[0]['left'], ls[1]['left'])

0.18

In [12]:
import random

sampled = random.sample(ranged, 100)
negs = []
for s in sampled:
    overlapped = overlap(ls[0]['ms'], ls[s]['ms'])
    if overlapped < 0.05:
        negs.append(ls[s]['left'])
    if len(negs) >= 5:
        break
        
negs

['Saya dapati masyarakat Melayu di Malaysia pada hari ini, sukar ataupun tidak mahu menerima hakikat tentang asal-usul warna kulit mereka sendiri.',
 'Jadi aku kunyah slow2 sambil rasa2 sikit apabenda makanan yang aku kunyah tu. Wehh tiba2 benda tu aku rasa macam lain macam wehh. Rasa dia pelik sangat2 and sumpah aku tak pernah rasa makanan sepelik tu. Aku pun dah tak hirau dah kawan2 aku tengah makan dengan selera ke apa time tu. Aku terus keluarkan balik semua isi dalam mulut aku tu. Tak cukup dengan tu, aku ambik air pastu kumur2 aku ludah je dalam tray tu (budak asrama memang makan nasi dalam tray). Masa tu kawan2 aku semua tak tahu cerita, and diorang marah gila kat aku sebab aku buat macam tu. Yelah, siapa tak marah kan, kau tengah syok makan tapi ada orang buat aksi jijik depan kau. Aku pun cakap kat diorang, aku rasa aku terkunyah benda pelik. Sekali bila aku kuis2 balik makanan yang aku kelurkan tadi, memang aku terkejut tengok sebab rupa dia macam tebuan. Bila aku belek2 lagi

In [14]:
ls[0]['ms']

'Assalamualaikum dan salam sejahtera. Hai geng. J kembali. Semua sihat? Keluarga sihat? Saya berdoa agar semua sihat-sihat belaka. Terima kasih atas sokongan anda. Saya hanya ingin berkongsi kisah saya dan kisah rakan-rakan saya. Kami telah bekerja di ladang ini untuk waktu yang lama dan telah diabaikan oleh masyarakat. Sedih juga. Kami tinggal di hutan sawit ini dan tidak mengganggu sesiapa pun. Tetapi pandangan masyarakat, hurmmm... Entahlah.'

In [15]:
!rm -rf mining-iium-confession
!mkdir mining-iium-confession

In [20]:
import os

def loop(rows):
    rows, index = rows
    for i in tqdm(range(len(rows))):
        filename = os.path.join('mining-iium-confession', f'{i}-{index}.json')
        if os.path.exists(filename):
            continue
        
        sampled = random.sample(ranged, 100)
        negs = []
        for s in sampled:
            try:
                overlapped = overlap(rows[i]['original']['ms'], ls[s]['ms'])
            except:
                continue
            if overlapped < 0.05:
                negs.append(ls[s]['left'])
            if len(negs) >= 5:
                break
                
        en = rows[i]['original']['en']
        ms = rows[i]['original']['ms']
        
        en_augmentation = []
        try:
            for no, score in enumerate(rows[i]['en']['score']):
                if score > 30:
                    en_augmentation.append(rows[i]['en']['sequences'][no])
            en_augmentation = list(set(en_augmentation))
        except:
            pass
        
        ms_augmentation = []
        try:
            for no, score in enumerate(rows[i]['ms']['score']):
                if score > 30:
                    ms_augmentation.append(rows[i]['ms']['sequences'][no])
            ms_augmentation = list(set(ms_augmentation))
        except:
            pass
        
        try:
            d = {
                'negs': negs,
                'pos': list(set([en, ms] + en_augmentation + ms_augmentation)),
                'query': rows[i]['original']['left'],
            }

            with open(filename, 'w') as fopen:
                json.dump(d, fopen)
        except:
            pass

In [21]:
loop((predicted[:1000],0))

100%|███████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 531260.80it/s]


In [18]:
import mp

In [22]:
mp.multiprocessing(predicted, loop, cores = 10, returned = False)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 15084/15084 [00:06<00:00, 2465.51it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 15084/15084 [00:02<00:00, 5914.77it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 6/6 [00:00<00:00, 65879.12it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 15084/15084 [00:09<00:00, 1513.10it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 15084/15084 [00:07<00:00, 2139.75it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 15084/15084 [00:10<00:00, 1471.85it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 15084/15084 [00:08<00:00, 1868.40it/s]
100%|████████████████████████████████████

In [23]:
files = glob('mining-iium-confession/*.json')
len(files)

150827

In [24]:
with open('mining-iium-confession.jsonl', 'w') as fopen_l:
    for f in tqdm(files):
        try:
            with open(f) as fopen:
                data = json.load(fopen)
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()
        except:
            pass

100%|████████████████████████████████████████████████████████████████████████████████████████| 150827/150827 [00:03<00:00, 48484.74it/s]


In [25]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='mining-iium-confession.jsonl',
    path_in_repo='mining-iium-confession.jsonl',
    repo_id='mesolitica/title-context-pair',
    repo_type='dataset',
)

mining-iium-confession.jsonl:   0%|          | 0.00/541M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/title-context-pair/commit/0db19fe37f6f479adcb5b7902cfc590ba6af6922', commit_message='Upload mining-iium-confession.jsonl with huggingface_hub', commit_description='', oid='0db19fe37f6f479adcb5b7902cfc590ba6af6922', pr_url=None, pr_revision=None, pr_num=None)