In [2]:
# !wget https://huggingface.co/datasets/mesolitica/chatgpt-noisy-translation-facebook/resolve/main/processed-facebook.jsonl

In [3]:
import os

os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [4]:
from transformers import T5ForConditionalGeneration, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained(
    'mesolitica/translation-t5-small-standard-bahasa-cased-v2',
    use_fast=False
)
model = T5ForConditionalGeneration.from_pretrained(
    'mesolitica/translation-t5-small-standard-bahasa-cased-v2'
)
all_special_ids = [0, 1, 2]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [5]:
_ = model.cuda()

In [1]:
import json
import re
import torch
from tqdm import tqdm

def clean(string):
    string = re.sub(
        'http\\S+|www.\\S+',
        '',
        ' '.join(
            [
                word
                for word in string.split()
                if word.find('#') < 0 and word.find('@') < 0
            ]
        ),
    )
    return string

In [8]:
ls = []
with open('processed-facebook.jsonl') as fopen:
    for l in tqdm(fopen):
        l = json.loads(l)
        l['cleaned'] = clean(l['left'])
        ls.append(l)

141851it [00:00, 244178.36it/s]


In [9]:
ls[0]

{'left': 'Kalau UMNOBN berkuasa, petrol tentu tidak dah subsidi. .. .',
 'en': 'If UMNOBN is in power, there will definitely be no petrol subsidy...',
 'ms': 'Jika UMNOBN berkuasa, pasti tidak akan ada subsidi petrol...',
 'cleaned': 'Kalau UMNOBN berkuasa, petrol tentu tidak dah subsidi. .. .'}

In [9]:
# !rm -rf facebook-predict
!mkdir facebook-predict

In [10]:
pairs = {
    'en': 'Manglish',
    'ms': 'pasar Melayu'
}

In [None]:
for i in tqdm(range(len(ls))):
    filename = os.path.join('facebook-predict', f'{i}.json')
    if os.path.exists(filename):
        continue
        
    results = {'original': ls[i]}
        
    for lang, prefix in pairs.items():
    
        if ls[i][lang] and len(ls[i][lang]) > 5:
            s = ls[i][lang]
            input_ids = tokenizer.encode(f'terjemah ke {prefix}: {s}', return_tensors = 'pt')
            outputs = model.generate(input_ids.cuda(), max_length = 256, do_sample=True,
                top_k=50,
                top_p=0.95,
                num_return_sequences=5, temperature = 0.7, output_scores = True, return_dict_in_generate = True)
            logits = torch.stack(outputs.scores, dim=1)
            score = logits.max(dim = -1).values.mean(dim = -1).detach().cpu().numpy().tolist()
            seqs = []
            for o in outputs.sequences:
                o = tokenizer.decode([i for i in o if i not in all_special_ids], 
                                     spaces_between_special_tokens = False)
                seqs.append(o)
            
            results[lang] = {
                'score': score,
                'sequences': seqs
            }
        
    with open(filename, 'w') as fopen:
        json.dump(results, fopen)

  9%|████████                                                                                | 13067/141851 [1:17:28<5:56:46,  6.02it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 13%|███████████▍                                                                            | 18535/141851 [1:49:46<7:01:08,  4.88it/s]IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

 26%|██████████████████████▎                                                  

In [2]:
from glob import glob
import json

In [3]:
files = glob('facebook-predict/*.json')
len(files)

124623

In [4]:
predicted = []
for f in tqdm(files):
    try:
        with open(f) as fopen:
            l = json.load(fopen)
        predicted.append(l)
    except:
        pass

100%|█████████████████████████████████████████████████████████████████████████████████████████| 124623/124623 [00:18<00:00, 6807.49it/s]


In [5]:
predicted[0]

{'original': {'left': 'tolong lah yang lain bongkarkan kesalahan ahli-ahli PAKATAN HARAPAN pula. . tak dengar lagi di mana-mana ini. .',
  'en': "Please, let others point out the mistakes of PH members too. I haven't heard it from anywhere else.",
  'ms': 'Tolonglah biarkan orang lain menunjukkan kesalahan ahli-ahli PH juga. Saya belum mendengarnya dari mana-mana lagi.',
  'cleaned': 'tolong lah yang lain bongkarkan kesalahan ahli-ahli PAKATAN HARAPAN pula. . tak dengar lagi di mana-mana ini. .'},
 'en': {'score': [29.973703384399414,
   28.232908248901367,
   28.72986602783203,
   29.54612922668457,
   28.832304000854492],
  'sequences': ['please lah let others point out the mistakes of ph members too i have not heard from anywhere else',
   'please let others point out the mistakes of ph members too i dun heard liao',
   'please let others point out the mistake of ph members too lah i never heard from anywhere',
   'please let others point out the mistakes of ph members also lah i ha

In [6]:
with open('noisy-augmentation-facebook.jsonl', 'w') as fopen:
    for p in tqdm(predicted):
        fopen.write(f'{json.dumps(p)}\n')

100%|███████████████████████████████████████████████████████████████████████████████████████| 124623/124623 [00:01<00:00, 122999.84it/s]


In [7]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='noisy-augmentation-facebook.jsonl',
    path_in_repo='noisy-augmentation-facebook.jsonl',
    repo_id='mesolitica/noisy-augmentation',
    repo_type='dataset',
)



noisy-augmentation-facebook.jsonl:   0%|          | 0.00/148M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/noisy-augmentation/commit/9f8d68771053d9dcc6a9355556311673d780794d', commit_message='Upload noisy-augmentation-facebook.jsonl with huggingface_hub', commit_description='', oid='9f8d68771053d9dcc6a9355556311673d780794d', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
ranged = range(len(ls))

In [11]:
def overlap(string1, string2):
    l = set([w for w in clean(string1).split() if len(w) > 2])
    r = set([w for w in clean(string2).split() if len(w) > 2])
    return len(l & r) / len(l)

overlap(ls[0]['left'], ls[1]['left'])

0.0

In [12]:
import random

sampled = random.sample(ranged, 100)
negs = []
for s in sampled:
    overlapped = overlap(ls[0]['ms'], ls[s]['ms'])
    if overlapped < 0.05:
        negs.append(ls[s]['left'])
    if len(negs) >= 5:
        break
        
negs

['Teruskan berangan PM tepi selamanya',
 'DSAI mantap',
 'Rudy Ahmad jangan risau. kalau tengok kat tik tok, banyak anak-anak muda celik mata depa ke arah PN',
 'Azidie Rahim haha kena lipakk pulok ',
 'Didi Zahari Muhd Hafiz mari join sini']

In [13]:
ls[0]['ms']

'Jika UMNOBN berkuasa, pasti tidak akan ada subsidi petrol...'

In [14]:
!rm -rf mining-facebook
!mkdir mining-facebook

In [15]:
import os

def loop(rows):
    rows, index = rows
    for i in tqdm(range(len(rows))):
        filename = os.path.join('mining-facebook', f'{i}-{index}.json')
        if os.path.exists(filename):
            continue
        
        sampled = random.sample(ranged, 100)
        negs = []
        for s in sampled:
            try:
                overlapped = overlap(rows[i]['original']['ms'], ls[s]['ms'])
            except:
                continue
            if overlapped < 0.05:
                negs.append(ls[s]['left'])
            if len(negs) >= 5:
                break
                
        en = rows[i]['original']['en']
        ms = rows[i]['original']['ms']
        
        en_augmentation = []
        try:
            for no, score in enumerate(rows[i]['en']['score']):
                if score > 30:
                    en_augmentation.append(rows[i]['en']['sequences'][no])
            en_augmentation = list(set(en_augmentation))
        except:
            pass
        
        ms_augmentation = []
        try:
            for no, score in enumerate(rows[i]['ms']['score']):
                if score > 30:
                    ms_augmentation.append(rows[i]['ms']['sequences'][no])
            ms_augmentation = list(set(ms_augmentation))
        except:
            pass
        
        try:
            d = {
                'negs': negs,
                'pos': list(set([en, ms] + en_augmentation + ms_augmentation)),
                'query': rows[i]['original']['left'],
            }

            with open(filename, 'w') as fopen:
                json.dump(d, fopen)
        except:
            pass

In [16]:
loop((predicted[:1000],0))

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 1000/1000 [00:00<00:00, 8936.70it/s]


In [17]:
import mp

In [18]:
mp.multiprocessing(predicted, loop, cores = 10, returned = False)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 12462/12462 [00:01<00:00, 9976.82it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 12462/12462 [00:01<00:00, 9089.57it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 12462/12462 [00:01<00:00, 8737.16it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 12462/12462 [00:01<00:00, 9033.86it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 12462/12462 [00:01<00:00, 8976.58it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 12462/12462 [00:01<00:00, 8864.86it/s]
100%|███████████████████████████████████████████████████████████████████████████████████████████| 12462/12462 [00:01<00:00, 8970.25it/s]
100%|████████████████████████████████████

In [19]:
files = glob('mining-facebook/*.json')
len(files)

124623

In [20]:
with open('mining-facebook.jsonl', 'w') as fopen_l:
    for f in tqdm(files):
        try:
            with open(f) as fopen:
                data = json.load(fopen)
            fopen_l.write(f'{json.dumps(data)}\n')
            fopen_l.flush()
        except:
            pass

100%|████████████████████████████████████████████████████████████████████████████████████████| 124623/124623 [00:01<00:00, 69441.22it/s]


In [21]:
from huggingface_hub import HfApi
api = HfApi()
api.upload_file(
    path_or_fileobj='mining-facebook.jsonl',
    path_in_repo='mining-facebook.jsonl',
    repo_id='mesolitica/title-context-pair',
    repo_type='dataset',
)

mining-facebook.jsonl:   0%|          | 0.00/90.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/title-context-pair/commit/54a8f97f79728f3c6fd4255416c1509b402bc967', commit_message='Upload mining-facebook.jsonl with huggingface_hub', commit_description='', oid='54a8f97f79728f3c6fd4255416c1509b402bc967', pr_url=None, pr_revision=None, pr_num=None)