In [1]:
import os

# Set CUDA_VISIBLE_DEVICES to an empty string for this process; this is done
# before any deep-learning library is imported further down the notebook.
os.environ.update(CUDA_VISIBLE_DEVICES='')

In [2]:
from glob import glob

import re


def checkpoint_sort_key(path):
    """Return a sort key that orders checkpoint paths by training step.

    Args:
        path: A path ending in ``checkpoint-<digits>``.

    Returns:
        ``(step, path)`` where ``step`` is the integer after the final
        ``checkpoint-``. Paths without a numeric suffix get step ``-1`` so
        they sort first; the path itself breaks ties deterministically.
    """
    match = re.search(r'checkpoint-(\d+)$', path)
    step = int(match.group(1)) if match else -1
    return step, path


# Sort by the numeric step rather than as plain strings: a lexicographic sort
# would place 'checkpoint-1000000' before 'checkpoint-550000', and the last
# element of this list is later used as the newest checkpoint.
checkpoints = sorted(
    glob('finetune-t5-tiny-standard-bahasa-cased/checkpoint-*'),
    key=checkpoint_sort_key,
)
checkpoints

['finetune-t5-tiny-standard-bahasa-cased/checkpoint-550000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-560000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-570000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-580000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-590000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-600000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-610000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-620000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-630000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-640000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The model weights are loaded from the last entry of the local checkpoint
# list; the tokenizer is loaded from the published small model's repository.
latest_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)
tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')

In [4]:
# Quick manual check: translate one mixed English/Malay sentence using the
# English -> Malay task prefix and print the raw decoded output
# (special tokens are kept).
prompt = 'terjemah Inggeris ke Melayu: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Hai lelaki! Saya perhatikan semalam & hari ini banyak yang dapat cookies kan. Jadi hari ini saya ingin berkongsi beberapa post mortem kumpulan pertama kami:</s>


In [5]:
# Upload the fine-tuned model weights to the Hugging Face Hub under the
# `mesolitica` organization. The namespace is written into the repo id because
# the separate `organization=` keyword of `push_to_hub` is deprecated.
model.push_to_hub('mesolitica/finetune-translation-t5-tiny-standard-bahasa-cased')

In [6]:
# Upload the tokenizer files to the Hugging Face Hub under the `mesolitica`
# organization. The namespace is written into the repo id because the separate
# `organization=` keyword of `push_to_hub` is deprecated.
tokenizer.push_to_hub('mesolitica/finetune-translation-t5-tiny-standard-bahasa-cased')

In [7]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Metric objects reused by both translation directions. BLEU uses its default
# settings; chrF is built with word_order = 2 (its score prints as "chrF2++").
bleu, chrf = BLEU(), CHRF(word_order = 2)

In [8]:
from unidecode import unidecode

# Read the parallel dev files, one sentence per line.
# The encoding is given explicitly so decoding does not depend on the machine's
# locale (assumes the files are UTF-8 - TODO confirm). Iterating over lines and
# stripping the newline keeps the final sentence even when the file has no
# trailing newline, which `split('\n')[:-1]` would drop.
with open('eng_Latn.dev', encoding='utf-8') as fopen:
    eng = [line.rstrip('\n') for line in fopen]

with open('zsm_Latn.dev', encoding='utf-8') as fopen:
    ms = [line.rstrip('\n') for line in fopen]

# Sentences are paired by index downstream, so the two files must line up.
assert len(eng) == len(ms), f'parallel files differ in length: {len(eng)} vs {len(ms)}'

# English -> Malay direction: `left` holds the source sentences and `right`
# the references, both passed through unidecode.
right = [unidecode(s) for s in ms]
left = [unidecode(s) for s in eng]

In [9]:
from tqdm import tqdm

batch_size = 1

# Translate every source sentence in `left` (English -> Malay prefix), one
# batch at a time, collecting the decoded text with special tokens removed.
results = []
for start in tqdm(range(0, len(left), batch_size)):
    features = []
    for sentence in left[start:start + batch_size]:
        token_ids = tokenizer.encode(f'terjemah Inggeris ke Melayu: {sentence}', return_tensors = 'pt')[0]
        features.append({'input_ids': token_ids})
    # Pad the batch to its longest sequence before generation.
    padded = tokenizer.pad(features, padding = 'longest')
    outputs = model.generate(**padded, max_length = 1000)
    results.extend(tokenizer.decode(o, skip_special_tokens=True) for o in outputs)

100%|█████████████████████████████████████████████████████████████████████████████████████████| 997/997 [02:06<00:00,  7.85it/s]


In [10]:
# Keep only non-empty hypotheses, each paired with the reference at the same
# index, so the two lists stay aligned. Note that sentences with an empty
# translation are therefore excluded from scoring.
kept_pairs = [(hyp, right[idx]) for idx, hyp in enumerate(results) if hyp]
filtered_left = [hyp for hyp, _ in kept_pairs]
filtered_right = [ref for _, ref in kept_pairs]

In [11]:
# `sys` is the list of system outputs; `refs` wraps the single reference list
# in an outer list. (The name `sys` is also the name of a stdlib module; it is
# kept because later cells read it.)
sys, refs = filtered_left, [filtered_right]

In [12]:
# Corpus-level BLEU of the kept system outputs against the single reference list.
r = bleu.corpus_score(sys, refs)

In [13]:
# Show every attribute stored on the BLEU result object.
vars(r)

{'name': 'BLEU',
 'score': 41.625536185056305,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '73.4/50.1/35.7/25.7 (BP = 0.971 ratio = 0.972 hyp_len = 21400 ref_len = 22027)',
 'bp': 0.9711259908305946,
 'counts': [15718, 10223, 6926, 4731],
 'totals': [21400, 20403, 19406, 18409],
 'sys_len': 21400,
 'ref_len': 22027,
 'precisions': [73.44859813084112,
  50.10537666029506,
  35.68999278573637,
  25.699386169808246],
 'prec_str': '73.4/50.1/35.7/25.7',
 'ratio': 0.9715349343986925}

In [14]:
# Corpus-level chrF (metric created with word_order = 2) on the same system
# outputs and references; as the cell's last expression the score is displayed.
chrf.corpus_score(sys, refs)

chrF2++ = 65.70

In [15]:
# Re-read the parallel dev files, one sentence per line, for the reverse
# direction. The encoding is given explicitly so decoding does not depend on
# the machine's locale (assumes the files are UTF-8 - TODO confirm). Iterating
# over lines and stripping the newline keeps the final sentence even when the
# file has no trailing newline, which `split('\n')[:-1]` would drop.
with open('eng_Latn.dev', encoding='utf-8') as fopen:
    eng = [line.rstrip('\n') for line in fopen]

with open('zsm_Latn.dev', encoding='utf-8') as fopen:
    ms = [line.rstrip('\n') for line in fopen]

# Sentences are paired by index downstream, so the two files must line up.
assert len(eng) == len(ms), f'parallel files differ in length: {len(eng)} vs {len(ms)}'

# Malay -> English direction: `left` holds the source sentences and `right`
# the references, both passed through unidecode.
left = [unidecode(s) for s in ms]
right = [unidecode(s) for s in eng]

In [16]:
batch_size = 1

# Translate every source sentence in `left` (Malay -> English prefix), one
# batch at a time, collecting the decoded text with special tokens removed.
results = []
for start in tqdm(range(0, len(left), batch_size)):
    features = []
    for sentence in left[start:start + batch_size]:
        token_ids = tokenizer.encode(f'terjemah Melayu ke Inggeris: {sentence}', return_tensors = 'pt')[0]
        features.append({'input_ids': token_ids})
    # Pad the batch to its longest sequence before generation.
    padded = tokenizer.pad(features, padding = 'longest')
    outputs = model.generate(**padded, max_length = 1000)
    results.extend(tokenizer.decode(o, skip_special_tokens=True) for o in outputs)

100%|█████████████████████████████████████████████████████████████████████████████████████████| 997/997 [02:20<00:00,  7.10it/s]


In [17]:
# Keep only non-empty hypotheses, each paired with the reference at the same
# index, so the two lists stay aligned. Note that sentences with an empty
# translation are therefore excluded from scoring.
kept_pairs = [(hyp, right[idx]) for idx, hyp in enumerate(results) if hyp]
filtered_left = [hyp for hyp, _ in kept_pairs]
filtered_right = [ref for _, ref in kept_pairs]

In [18]:
# `sys` is the list of system outputs; `refs` wraps the single reference list
# in an outer list. (The name `sys` is also the name of a stdlib module; it is
# kept because later cells read it.)
sys, refs = filtered_left, [filtered_right]

In [19]:
# Corpus-level BLEU for the Malay -> English outputs, followed by a dump of
# every attribute stored on the result object.
r = bleu.corpus_score(sys, refs)
vars(r)

{'name': 'BLEU',
 'score': 37.26048464066508,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '68.3/44.1/30.5/21.4 (BP = 0.995 ratio = 0.995 hyp_len = 23457 ref_len = 23570)',
 'bp': 0.9951942593830536,
 'counts': [16020, 9908, 6547, 4376],
 'totals': [23457, 22460, 21463, 20466],
 'sys_len': 23457,
 'ref_len': 23570,
 'precisions': [68.29517841156158,
  44.1139804096171,
  30.503657457019056,
  21.381803967555946],
 'prec_str': '68.3/44.1/30.5/21.4',
 'ratio': 0.9952057700466695}

In [20]:
# Corpus-level chrF (metric created with word_order = 2) on the same system
# outputs and references; as the cell's last expression the score is displayed.
chrf.corpus_score(sys, refs)

chrF2++ = 61.29