In [1]:
import os

# Blank out CUDA_VISIBLE_DEVICES for this process before any model code is imported.
os.environ.update({'CUDA_VISIBLE_DEVICES': ''})

In [2]:
from glob import glob

def checkpoint_step(path):
    """Return the numeric suffix of a ``...-<step>`` checkpoint path.

    Parameters
    ----------
    path : str
        A path such as ``'some-dir/checkpoint-270000'``.

    Returns
    -------
    int
        The integer after the last ``-`` in ``path``, or ``-1`` when that
        suffix is not purely digits (so such entries sort first).
    """
    suffix = path.rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Sort by the numeric step rather than lexicographically: a plain string sort
# would place 'checkpoint-99000' after 'checkpoint-270000', and the last entry
# of this list is what gets loaded as the model further down.
# The path itself is a tie-breaker so the order is deterministic.
checkpoints = sorted(
    glob('finetune-t5-base-noisy-bahasa-cased/checkpoint-*'),
    key = lambda path: (checkpoint_step(path), path),
)
checkpoints

['finetune-t5-base-noisy-bahasa-cased/checkpoint-230000',
 'finetune-t5-base-noisy-bahasa-cased/checkpoint-240000',
 'finetune-t5-base-noisy-bahasa-cased/checkpoint-250000',
 'finetune-t5-base-noisy-bahasa-cased/checkpoint-260000',
 'finetune-t5-base-noisy-bahasa-cased/checkpoint-270000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Weights come from the last entry of `checkpoints`.
latest_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)

# NOTE(review): the tokenizer is loaded from the t5-small repo while the
# checkpoint directory is named t5-base -- presumably both share one
# vocabulary; confirm.
tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')

In [4]:
# Smoke test: one code-switched sentence under the "Melayu ke Inggeris" prefix.
prompt = 'terjemah Melayu ke Inggeris: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
# Decoding keeps special tokens, so <pad> and </s> show up in the printed text.
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Hi guys! I noticed yesterday and today many of these cookies are available. So today I want to share some post mortem of our first batch:</s>


In [5]:
# Smoke test: the same sentence under the "Inggeris ke Melayu" prefix.
prompt = 'terjemah Inggeris ke Melayu: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
# Decoding keeps special tokens, so <pad> and </s> show up in the printed text.
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Hai kawan! Saya perhatikan semalam & harini ramai yang dapat cookies ni kan. Jadi harini saya nak kongsi beberapa post mortem kumpulan pertama kami:</s>


In [6]:
# Informal / code-switched inputs, each run under the "Melayu ke Inggeris" prefix.
strings = [
    'ak tak paham la',
    'jam 8 di pasar KK memang org ramai üòÇ, pandai dia pilih tmpt.',
    'Jadi haram jadahüòÄüòÉü§≠',
    'nak gi mana tuu',
    'Macam nak ambil half day',
    "Bayangkan PH dan menang pru-14. Pastu macam-macam pintu belakang ada. Last-last Ismail Sabri naik. That's why I don't give a fk about politics anymore. Sumpah dah fk up dah.",
]
for text in strings:
    prompt = f'terjemah Melayu ke Inggeris: {text}'
    input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
    outputs = model.generate(input_ids, max_length = 100)
    # Special tokens are left in the decoded text on purpose (no skip flag).
    decoded = tokenizer.decode(outputs[0])
    print(decoded)

<pad> I don't understand la.</s>
<pad> At 8 a.m., the market is a lot of people, so he's a good person.</s>
<pad> So it's fucking shit.</s>
<pad> Where are you going?</s>
<pad> It's like taking half a day.</s>
<pad> Imagine PH and win pru-14. Passovers are all kinds of back doors. Last-last Ismail Sabri goes up. That's why I don't give a fuck about politics anymore. I swear I'm up.</s>


In [7]:
# Informal / code-switched inputs, each run under the "Inggeris ke Melayu" prefix.
strings = [
    'u ni, talk properly lah',
    "just attended my cousin's wedding. pelik jugak dia buat majlis biasa2 je sebab her lifestyle looks lavish. then i found out they're going on a 3 weeks honeymoon. smart decision üëç",
    'Me after seeing this video: mm dapnya burger benjo extra mayo',
    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:',
]
for text in strings:
    prompt = f'terjemah Inggeris ke Melayu: {text}'
    input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
    # Generation is capped at 100 tokens.
    outputs = model.generate(input_ids, max_length = 100)
    decoded = tokenizer.decode(outputs[0])
    print(decoded)

<pad> u ni, cakap betul lah</s>
<pad> pelik jugak dia buat majlis biasa2 je sebab gaya hidup dia dah mewah...................................................................................
<pad> Selepas menonton video ini: mm dapnya burger benjo extra mayo</s>
<pad> Hai kawan! Saya perhatikan semalam & harini ramai yang dapat cookies ni kan. Jadi harini saya nak kongsi beberapa post mortem kumpulan pertama kami:</s>


In [None]:
# Upload the loaded model under the 'mesolitica' organization.
# NOTE(review): `organization=` is the older push_to_hub calling convention;
# newer transformers releases expect a namespaced repo id instead -- confirm
# against the installed version before re-running.
model.push_to_hub('finetune-noisy-translation-t5-base-bahasa-cased', organization='mesolitica')

In [None]:
# Upload the tokenizer to the same repository name and organization as the model.
# NOTE(review): `organization=` is the older push_to_hub calling convention;
# confirm it is still accepted by the installed transformers version.
tokenizer.push_to_hub('finetune-noisy-translation-t5-base-bahasa-cased', organization='mesolitica')

In [None]:
# Copy the training `runs` directory into the local
# 'finetune-noisy-translation-t5-base-bahasa-cased' directory, then commit and
# push everything from inside it (commit message: 'add tensorboard').
# NOTE(review): assumes that directory already exists as a git checkout of the
# hub repository -- confirm before running.
!cp -r finetune-t5-base-noisy-bahasa-cased/runs finetune-noisy-translation-t5-base-bahasa-cased
!cd finetune-noisy-translation-t5-base-bahasa-cased && git add . && git commit -m 'add tensorboard' && git push

In [8]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Corpus-level scorers reused by the scoring cells below.
# word_order = 2 is what makes the score print as "chrF2++" in the results.
chrf = CHRF(word_order = 2)
bleu = BLEU()

In [9]:
from unidecode import unidecode
import json

# Load the test set: one JSON object per line.
# - encoding is pinned to UTF-8 so the read does not depend on the platform's
#   default locale encoding.
# - lines are streamed from the file instead of reading it whole and splitting.
# - blank or whitespace-only lines are skipped so they cannot crash json.loads.
with open('test-noisy-shuffled.json', encoding = 'utf-8') as fopen:
    test = [json.loads(line) for line in fopen if line.strip()]

len(test)

6854

In [10]:
from tqdm import tqdm

batch_size = 1

# Hypotheses (results_*) and their references (filtered_right_*), kept in
# parallel lists per translation direction.
results_en_ms, filtered_right_en_ms = [], []
results_ms_en, filtered_right_ms_en = [], []

for example in tqdm(test):
    pair = example['translation']
    prefix = pair['prefix']
    source = pair['src']
    reference = pair['tgt']

    # One example at a time: encode, wrap as a batch of one, pad, generate.
    encoded = tokenizer.encode(f'{prefix}{source}', return_tensors = 'pt')[0]
    batch = tokenizer.pad([{'input_ids': encoded}], padding = 'longest')
    generated = model.generate(**batch, max_length = 1000)[0]
    hypothesis = tokenizer.decode(generated, skip_special_tokens=True)

    # Empty generations are dropped together with their reference.
    if not len(hypothesis):
        continue

    # The task prefix decides which direction bucket the pair belongs to.
    if 'Inggeris ke Melayu' in prefix:
        hypotheses, references = results_en_ms, filtered_right_en_ms
    else:
        hypotheses, references = results_ms_en, filtered_right_ms_en
    hypotheses.append(hypothesis)
    references.append(reference)

100%|█████████████████████████████████████| 6854/6854 [1:10:04<00:00,  1.63it/s]


In [11]:
# Number of scored hypotheses per direction: (English->Malay, Malay->English).
tuple(len(bucket) for bucket in (results_en_ms, results_ms_en))

(2937, 3917)

In [12]:
# Score the "Inggeris ke Melayu" bucket: BLEU attributes plus the chrF score.
sys = results_en_ms
# A single reference stream, wrapped in an outer list.
refs = [filtered_right_en_ms]
r = bleu.corpus_score(sys, refs)
vars(r), chrf.corpus_score(sys, refs)

({'name': 'BLEU',
  'score': 42.16321973536871,
  '_mean': -1.0,
  '_ci': -1.0,
  '_verbose': '73.4/50.1/35.7/25.8 (BP = 0.982 ratio = 0.982 hyp_len = 63335 ref_len = 64473)',
  'bp': 0.9821925128801015,
  'counts': [46490, 30266, 20534, 14086],
  'totals': [63335, 60398, 57461, 54524],
  'sys_len': 63335,
  'ref_len': 64473,
  'precisions': [73.40333149127655,
   50.11093082552402,
   35.7355423678669,
   25.834494901327854],
  'prec_str': '73.4/50.1/35.7/25.8',
  'ratio': 0.9823492004404945},
 chrF2++ = 66.51)

In [13]:
# Score the other bucket (prefixes without "Inggeris ke Melayu"): BLEU
# attributes plus the chrF score.
sys = results_ms_en
# A single reference stream, wrapped in an outer list.
refs = [filtered_right_ms_en]
r = bleu.corpus_score(sys, refs)
vars(r), chrf.corpus_score(sys, refs)

({'name': 'BLEU',
  'score': 43.432723192596406,
  '_mean': -1.0,
  '_ci': -1.0,
  '_verbose': '71.8/49.8/36.6/27.2 (BP = 1.000 ratio = 1.000 hyp_len = 92982 ref_len = 92985)',
  'bp': 0.999967736211266,
  'counts': [66716, 44323, 31152, 22130],
  'totals': [92982, 89065, 85148, 81231],
  'sys_len': 92982,
  'ref_len': 92985,
  'precisions': [71.75152179991827,
   49.76477853253242,
   36.58570958801146,
   27.243293816400143],
  'prec_str': '71.8/49.8/36.6/27.2',
  'ratio': 0.9999677367317309},
 chrF2++ = 65.52)