In [1]:
# !wget https://huggingface.co/datasets/mesolitica/chatgpt-noisy-translation-twitter/resolve/main/processed-twitter.jsonl

In [2]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [3]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    'mesolitica/translation-t5-small-standard-bahasa-cased-v2',
    use_fast=False
)
model = T5ForConditionalGeneration.from_pretrained(
    'mesolitica/translation-t5-small-standard-bahasa-cased-v2'
)
all_special_ids = [0, 1, 2]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [4]:
_ = model.cuda()

In [15]:
import json
import re
import torch
from tqdm import tqdm

def clean(string):
    string = re.sub(
        'http\\S+|www.\\S+',
        '',
        ' '.join(
            [
                word
                for word in string.split()
                if word.find('#') < 0 and word.find('@') < 0
            ]
        ),
    )
    string = re.sub('[^A-Za-z ]+', ' ', string.lower())
    string = re.sub(r'[ ]+', ' ', string).strip()
    return string

In [7]:
ls = []
with open('processed-twitter.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        l['cleaned'] = clean(l['left'])
        ls.append(l)

691013it [00:03, 209301.44it/s]


In [8]:
len(ls)

691013

In [7]:
ls[0]

{'left': '@mazwinnikanis Dalam hujan lebat some more',
 'en': 'In heavy rain some more',
 'ms': 'Dalam hujan lebat lagi',
 'cleaned': 'Dalam hujan lebat some more'}

In [8]:
# %%time

# s = 'Dalam hujan lebat some more'
# input_ids = tokenizer.encode(f'terjemah ke pasar Melayu: {s}', return_tensors = 'pt')
# outputs = model.generate(input_ids.cuda(), max_length = 100, do_sample=True,
#     top_k=50,
#     top_p=0.95,
#     num_return_sequences=5, temperature = 1.0, output_scores = True, return_dict_in_generate = True)
# seqs = []
# for o in outputs.sequences:
#     o = tokenizer.decode([i for i in o if i not in all_special_ids], 
#                          spaces_between_special_tokens = False)
#     seqs.append(o)
# seqs

In [9]:
# !rm -rf twitter-predict
!mkdir twitter-predict

mkdir: cannot create directory ‘twitter-predict’: File exists


In [10]:
pairs = {
    'en': 'Manglish',
    'ms': 'pasar Melayu'
}

In [None]:
for i in tqdm(range(len(ls))):
    filename = os.path.join('twitter-predict', f'{i}.json')
    if os.path.exists(filename):
        continue
        
    results = {'original': ls[i]}
        
    for lang, prefix in pairs.items():
    
        if ls[i][lang] and len(ls[i][lang]) > 5:
            s = ls[i][lang]
            input_ids = tokenizer.encode(f'terjemah ke {prefix}: {s}', return_tensors = 'pt')
            outputs = model.generate(input_ids.cuda(), max_length = 100, do_sample=True,
                top_k=50,
                top_p=0.95,
                num_return_sequences=5, temperature = 0.7, output_scores = True, return_dict_in_generate = True)
            logits = torch.stack(outputs.scores, dim=1)
            score = logits.max(dim = -1).values.mean(dim = -1).detach().cpu().numpy().tolist()
            seqs = []
            for o in outputs.sequences:
                o = tokenizer.decode([i for i in o if i not in all_special_ids], 
                                     spaces_between_special_tokens = False)
                seqs.append(o)
            
            results[lang] = {
                'score': score,
                'sequences': seqs
            }
        
    with open(filename, 'w') as fopen:
        json.dump(results, fopen)

 46%|██████████████████████████████████████▉                                              | 316587/691013 [19:46:02<65:45:49,  1.58it/s]

In [4]:
from glob import glob
import json

In [2]:
files = glob('twitter-predict/*.json')
len(files)

316607

In [25]:
predicted = []
for f in tqdm(files):
    with open(f) as fopen:
        l = json.load(fopen)
    predicted.append(l)

100%|████████████████████████████████████████████████████████████████████████████████████████| 316607/316607 [00:27<00:00, 11537.94it/s]


In [28]:
predicted[0]

{'original': {'left': 'Aku esok shooting kat Sabak Bernam tak tahulah macam mana cerita dia sebab bila baca SOP baru macam migrain sikit. Hahahaha.',
  'en': 'I have a shoot in Sabak Bernam tomorrow, not sure what the story is about because reading the SOP gives me a bit of a headache. Hahaha.',
  'ms': 'Saya ada tembakan di Sabak Bernam esok, tidak pasti apa ceritanya kerana membaca SOP membuat saya sedikit migrain. Hahaha.',
  'cleaned': 'Aku esok shooting kat Sabak Bernam tak tahulah macam mana cerita dia sebab bila baca SOP baru macam migrain sikit. Hahahaha.'},
 'en': {'score': [30.524616241455078,
   30.882753372192383,
   30.931371688842773,
   30.892059326171875,
   30.45795249938965],
  'sequences': ['sabaq bernam shoot esok taktau lah cerita apa ni sbb baca sop sakit kepala sikit hahahahaha',
   'shoot sabak bernam esok tak sure cerita apa sebab baca sop sakit kepala sikit hahaha',
   'sabak bernam shoot esok tak tahu lah cerita apa sebab baca sop sakit kepala sikit hahahaha'

In [27]:
with open('noisy-augmentation-twitter.jsonl', 'w') as fopen:
    for p in tqdm(predicted):
        fopen.write(f'{json.dumps(p)}\n')

100%|███████████████████████████████████████████████████████████████████████████████████████| 316607/316607 [00:02<00:00, 112953.31it/s]


In [29]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='noisy-augmentation-twitter.jsonl',
    path_in_repo='noisy-augmentation-twitter.jsonl',
    repo_id='mesolitica/noisy-augmentation',
    repo_type='dataset',
)



noisy-augmentation-twitter.jsonl:   0%|          | 0.00/482M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/noisy-augmentation/commit/afe9f82a5b85b8bfc258d5348d9821a67ab7d45e', commit_message='Upload noisy-augmentation-twitter.jsonl with huggingface_hub', commit_description='', oid='afe9f82a5b85b8bfc258d5348d9821a67ab7d45e', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
ranged = range(len(ls))

In [16]:
ls[0]

{'left': '@mazwinnikanis Dalam hujan lebat some more',
 'en': 'In heavy rain some more',
 'ms': 'Dalam hujan lebat lagi',
 'cleaned': 'Dalam hujan lebat some more'}

In [17]:
def overlap(string1, string2):
    l = set([w for w in clean(string1).split() if len(w) > 2])
    r = set([w for w in clean(string2).split() if len(w) > 2])
    return len(l & r) / len(l)

overlap(ls[0]['left'], ls[1]['left'])

0.0

In [23]:
import random

sampled = random.sample(ranged, 100)
negs = []
for s in sampled:
    overlapped = overlap(ls[0]['ms'], ls[s]['ms'])
    if overlapped < 0.05:
        negs.append(ls[s]['left'])
    if len(negs) >= 5:
        break
        
negs

['Anyone nak belanja??  https://t.co/z5ColdjELX',
 'Gaiisssss member ni single gaiiss @KhaiiJohn , pape DM, kalau dia tak reply maybe dia tak tertarik kowt dkt org tu, https://t.co/rfLLwqThK1',
 '@TuneTalk Dah seminggu line slow giler2 kat kampung paya siput, lanchang pahang.. sila ambil tindakan',
 'Selamat Pagi Lombok Tengah,\nRabu sebenarnya hari yang beruntung. Kenapa? Sebab, ada di tengah pekan. Tetap aman wal https://t.co/X031iLDmFm',
 'dok pusing ke klang and shah alam, sesak mcm biasa je. mana roadblock?']

In [30]:
ls[0]

{'left': '@mazwinnikanis Dalam hujan lebat some more',
 'en': 'In heavy rain some more',
 'ms': 'Dalam hujan lebat lagi',
 'cleaned': 'Dalam hujan lebat some more'}

In [40]:
en_augmentation = []
for no, score in enumerate(predicted[1]['en']['score']):
    if score > 30:
        en_augmentation.append(predicted[1]['en']['sequences'][no])
        
en_augmentation

['how to make asbf sounds interesting leh',
 'how to make asbf sounds interesting lor',
 'How to make asbf. Sounds interesting']

In [76]:
predicted[4]

{'original': {'left': 'Geng 12 hb ni ade tak yang nak balik terengganu ade satu ticket dah beli tapi tak jadi pergi nak bagi harga murah j https://t.co/15KiWG9vfh',
  'en': "Anyone going back to Terengganu on 12th September? I have a ticket but can't go. Willing to sell at a cheaper price. DM me.",
  'ms': 'Ada sesiapa nak balik Terengganu pada 12 September? Saya ada tiket tapi tak dapat pergi. Sedia untuk jual pada harga yang lebih murah. DM saya.',
  'cleaned': 'Geng 12 hb ni ade tak yang nak balik terengganu ade satu ticket dah beli tapi tak jadi pergi nak bagi harga murah j '},
 'en': {'score': [32.828792572021484,
   33.19839859008789,
   32.189056396484375,
   31.707237243652344,
   32.984588623046875],
  'sequences': ['sesiapa balik terengganu 12sept takde tiket tak boleh pi nak jual harga lagi murah dm me',
   'ada yang balik terengganu 12 sept ni ada ticket tapi takleh nak pi bakalan jual murah dm sikit',
   'Anyone going back to Terengganu on 12 September? I got a ticket but 

In [38]:
predicted[1]['en']

{'score': [30.87648582458496,
  30.03171157836914,
  29.950002670288086,
  32.05991744995117,
  29.388994216918945],
 'sequences': ['how to make asbf sounds interesting leh',
  'how to make asbf sounds interesting lor',
  'how to make asbf sound interesting lah',
  'How to make asbf. Sounds interesting',
  'how to make asbf sounds interesting']}

In [60]:
!rm -rf mining-twitter
!mkdir mining-twitter

In [61]:
import os

def loop(rows):
    rows, index = rows
    for i in tqdm(range(len(rows))):
        filename = os.path.join('mining-twitter', f'{i}-{index}.json')
        if os.path.exists(filename):
            continue
        
        sampled = random.sample(ranged, 100)
        negs = []
        for s in sampled:
            try:
                overlapped = overlap(rows[i]['original']['ms'], ls[s]['ms'])
            except:
                continue
            if overlapped < 0.05:
                negs.append(ls[s]['left'])
            if len(negs) >= 5:
                break
                
        en = rows[i]['original']['en']
        ms = rows[i]['original']['ms']
        
        en_augmentation = []
        try:
            for no, score in enumerate(rows[i]['en']['score']):
                if score > 30:
                    en_augmentation.append(rows[i]['en']['sequences'][no])
            en_augmentation = list(set(en_augmentation))
        except:
            pass
        
        ms_augmentation = []
        try:
            for no, score in enumerate(rows[i]['ms']['score']):
                if score > 30:
                    ms_augmentation.append(rows[i]['ms']['sequences'][no])
            ms_augmentation = list(set(ms_augmentation))
        except:
            pass
                
        d = {
            'negs': negs,
            'pos': list(set([en, ms] + en_augmentation + ms_augmentation)),
            'query': rows[i]['original']['left'],
        }
        
        with open(filename, 'w') as fopen:
            json.dump(d, fopen)

In [59]:
loop((predicted[:1000],0))

100%|████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 73035.87it/s]


In [64]:
# !wget https://gist.githubusercontent.com/huseinzol05/98974ae8c6c7a65d4bc0af9f5003786a/raw/2e06e71ef7349a57bc58cc9913ae6bae1f9f8447/mp.py

In [66]:
import mp

In [67]:
mp.multiprocessing(predicted, loop, cores = 10, returned = False)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 31660/31660 [00:05<00:00, 5372.80it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 31660/31660 [00:05<00:00, 5310.54it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 31660/31660 [00:05<00:00, 5282.61it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 31660/31660 [00:06<00:00, 5224.26it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 31660/31660 [00:05<00:00, 5295.88it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 31660/31660 [00:06<00:00, 5248.44it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 31660/31660 [00:05<00:00, 5283.67it/s]
100%|████████████████████████████████████

In [68]:
files = glob('mining-twitter/*.json')
len(files)

316607

In [69]:
with open('mining-twitter.jsonl', 'w') as fopen_l:
    for f in tqdm(files):
        try:
            with open(f) as fopen:
                data = json.load(fopen)
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()
        except:
            pass

100%|████████████████████████████████████████████████████████████████████████████████████████| 316607/316607 [00:12<00:00, 24393.06it/s]


In [72]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='mining-twitter.jsonl',
    path_in_repo='mining-twitter.jsonl',
    repo_id='mesolitica/title-context-pair',
    repo_type='dataset',
)

mining-twitter.jsonl:   0%|          | 0.00/394M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/title-context-pair/commit/93f020b3dc6def5325ec2cbee6de903e38ad74de', commit_message='Upload mining-twitter.jsonl with huggingface_hub', commit_description='', oid='93f020b3dc6def5325ec2cbee6de903e38ad74de', pr_url=None, pr_revision=None, pr_num=None)

In [77]:
data

{'negs': ['Rabu / 18 Mei 2022 / 17 Syawal 1443H\n5:51pg - Masuk waktu solat fardhu #Subuh bagi Pulau Pinang &amp; kwsn yg sama wakt https://t.co/zfdy4mpC1x',
  '@nilamsaniiiiii @idek_hm ngak g gya ... matik bak isik borang sen agy',
  'Labuan Bajo adalah salah satu destinasi wisata super prioritas dan premium, namun kami ingin agar wisatawan nusatar https://t.co/hVzQPVLFLS',
  '@BangRiz91376468 Mau, dong....',
  'gak kasian sama aku ya? oke anak kita nambah https://t.co/WG6a1DyT0g'],
 'pos': ['For your information, last week during Eid al-Fitr, Ms Maharani was known to have had a political meeting with the Chairman of https://t.co/E1ITii7Pcl.',
  'Sebagai makluman, minggu lalu semasa raya al-fitri, Puan Maharani diketahui telah mengadakan pertemuan politik dengan Pengerusi https://t.co/E1ITii7Pcl',
  'Untuk pengetahuan, pekan lalu saat lebaran, Puan Maharani diketahui telah mengadakan pertemuan politik dengan Kepala Maj https://t.co/E1ITii7Pcl',
  'Sebagai informasi, pekan lalu selama 