In [1]:
import os

# Set before any later cell imports a CUDA-backed library: with an empty
# CUDA_VISIBLE_DEVICES no GPU is exposed, so the model below runs on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
from glob import glob


def checkpoint_step(path):
    """Return the training step encoded after the last '-' in a checkpoint path.

    Paths whose suffix is not purely numeric yield -1 so they sort before every
    real checkpoint instead of raising.
    """
    suffix = path.rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Sort by the numeric step rather than by the raw string: a plain string sort
# would place 'checkpoint-990000' after 'checkpoint-3530000', and the next cell
# relies on checkpoints[-1] being the most recent checkpoint.
checkpoints = sorted(
    glob('finetune-t5-base-standard-bahasa-cased/checkpoint-*'),
    key=lambda path: (checkpoint_step(path), path),
)
checkpoints

['finetune-t5-base-standard-bahasa-cased/checkpoint-3440000',
 'finetune-t5-base-standard-bahasa-cased/checkpoint-3450000',
 'finetune-t5-base-standard-bahasa-cased/checkpoint-3460000',
 'finetune-t5-base-standard-bahasa-cased/checkpoint-3470000',
 'finetune-t5-base-standard-bahasa-cased/checkpoint-3480000',
 'finetune-t5-base-standard-bahasa-cased/checkpoint-3490000',
 'finetune-t5-base-standard-bahasa-cased/checkpoint-3500000',
 'finetune-t5-base-standard-bahasa-cased/checkpoint-3510000',
 'finetune-t5-base-standard-bahasa-cased/checkpoint-3520000',
 'finetune-t5-base-standard-bahasa-cased/checkpoint-3530000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The tokenizer is pulled from the published hub repo; the weights come from the
# last entry of the local `checkpoints` list built in the previous cell.
# NOTE(review): the tokenizer repo is named t5-small while the checkpoint folder
# is named t5-base — presumably both share one vocabulary; confirm.
tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')
model = T5ForConditionalGeneration.from_pretrained(checkpoints[-1])

In [4]:
# Smoke test: translate one code-switched sentence using the
# 'terjemah Inggeris ke Melayu: ' task prefix and cap generation at 100 tokens.
input_ids = tokenizer.encode('terjemah Inggeris ke Melayu: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:', return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
# decode() is called without skip_special_tokens here, so the printed text keeps
# the <pad> / </s> markers (the evaluation loops below strip them).
print(tokenizer.decode(outputs[0]))

<pad> Hai kawan! Saya perhatikan semalam & harini sudah ramai yang dapat kuki ini. Jadi harini saya ingin berkongsi beberapa bedah siasat kumpulan pertama kami:</s>


In [22]:
# Publish the loaded checkpoint's weights to the hub repo under the 'mesolitica' organization.
# NOTE(review): newer transformers releases reportedly deprecate `organization`
# in favour of passing 'org/name' as the repo id — confirm against the installed version.
model.push_to_hub('finetune-translation-t5-base-standard-bahasa-cased', organization='mesolitica')

Upload file pytorch_model.bin:   0%|          | 4.00k/850M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-translation-t5-base-standard-bahasa-cased
   5ef9f27..c3a00e6  main -> main



'https://huggingface.co/mesolitica/finetune-translation-t5-base-standard-bahasa-cased/commit/c3a00e6f49332caad1d46a707d41cdaebbacefda'

In [23]:
# Publish the tokenizer files to the same hub repo as the model weights.
tokenizer.push_to_hub('finetune-translation-t5-base-standard-bahasa-cased', organization='mesolitica')

In [6]:
from sacrebleu.metrics import BLEU, CHRF, TER

# BLEU with sacrebleu's default settings; chrF with word_order=2, which the
# outputs below report as "chrF2++".
# TER is imported but not used in any cell visible in this notebook.
bleu = BLEU()
chrf = CHRF(word_order = 2)

In [7]:
from unidecode import unidecode


def read_sentences(path):
    """Read a one-sentence-per-line text file and return the lines without newlines.

    Iterating the file (instead of `read().split('\n')[:-1]`) keeps the final
    sentence even when the file does not end with a newline, and still yields no
    spurious empty entry when it does.
    """
    # Decode as UTF-8 explicitly rather than with the platform default encoding.
    # NOTE(review): assumes the .dev files are UTF-8 — confirm for your copy.
    with open(path, encoding='utf-8') as fopen:
        return [line.rstrip('\n') for line in fopen]


eng = read_sentences('eng_Latn.dev')
ms = read_sentences('zsm_Latn.dev')

# The two files are consumed as a line-aligned parallel corpus; a length
# mismatch would silently pair hypotheses with the wrong references.
assert len(eng) == len(ms), f'parallel files differ in length: {len(eng)} vs {len(ms)}'

# ASCII-fold both sides. `left` is the source side and `right` the reference
# side for the English -> Malay evaluation that follows.
right = [unidecode(s) for s in ms]
left = [unidecode(s) for s in eng]

In [8]:
from tqdm import tqdm

batch_size = 1

# Translate every sentence in `left` (English at this point) into Malay,
# `batch_size` sentences per generate() call, collecting decoded text in order.
results = []
for start in tqdm(range(0, len(left), batch_size)):
    features = []
    for s in left[start:start + batch_size]:
        # encode() returns a tensor because of return_tensors='pt'; keep its row 0.
        token_ids = tokenizer.encode(f'terjemah Inggeris ke Melayu: {s}', return_tensors = 'pt')[0]
        features.append({'input_ids': token_ids})
    # Pad the batch to its longest member before generation.
    batch = tokenizer.pad(features, padding = 'longest')
    generated = model.generate(**batch, max_length = 1000)
    results.extend(tokenizer.decode(ids, skip_special_tokens=True) for ids in generated)

100%|█████████████████████████████████████████████████████████████████████████████████████████| 997/997 [07:48<00:00,  2.13it/s]


In [20]:
# Keep only non-empty hypotheses, each paired with the reference at the same
# index, so that the hypothesis and reference lists stay aligned.
kept_pairs = [(hyp, right[idx]) for idx, hyp in enumerate(results) if len(hyp)]
filtered_left = [hyp for hyp, _ in kept_pairs]
filtered_right = [ref for _, ref in kept_pairs]

In [10]:
# corpus_score takes the references as a list of reference streams; there is a
# single reference per sentence here, hence the one-element outer list.
refs = [filtered_right]
# NOTE(review): `sys` shadows the name of the stdlib module; it is kept because
# the scoring cells below read this exact name.
sys = filtered_left

In [11]:
# Corpus-level BLEU of the English -> Malay hypotheses against the Malay references.
r = bleu.corpus_score(sys, refs)

In [12]:
# Show every field of the score object (score, n-gram counts/totals, brevity penalty, lengths).
r.__dict__

{'name': 'BLEU',
 'score': 44.17355862158963,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '74.7/52.3/38.0/28.0 (BP = 0.979 ratio = 0.979 hyp_len = 21569 ref_len = 22027)',
 'bp': 0.9789896792105062,
 'counts': [16102, 10752, 7434, 5197],
 'totals': [21569, 20572, 19575, 18578],
 'sys_len': 21569,
 'ref_len': 22027,
 'precisions': [74.65343780425611,
  52.265214855142915,
  37.97701149425287,
  27.973947680051673],
 'prec_str': '74.7/52.3/38.0/28.0',
 'ratio': 0.9792073364507196}

In [13]:
# Corpus-level chrF++ for the same English -> Malay hypotheses and references.
chrf.corpus_score(sys, refs)

chrF2++ = 67.60

In [14]:
# Reverse the direction for the Malay -> English evaluation: Malay is now the
# source (`left`) and English the reference (`right`).
# NOTE: this rebinds the names used by the English -> Malay cells above, so
# re-running those cells after this one would feed them Malay input.
left = [unidecode(s) for s in ms]
right = [unidecode(s) for s in eng]

In [15]:
batch_size = 1

# Translate every sentence in `left` (Malay at this point) into English,
# `batch_size` sentences per generate() call, collecting decoded text in order.
results = []
for start in tqdm(range(0, len(left), batch_size)):
    features = []
    for s in left[start:start + batch_size]:
        # encode() returns a tensor because of return_tensors='pt'; keep its row 0.
        token_ids = tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors = 'pt')[0]
        features.append({'input_ids': token_ids})
    # Pad the batch to its longest member before generation.
    batch = tokenizer.pad(features, padding = 'longest')
    generated = model.generate(**batch, max_length = 1000)
    results.extend(tokenizer.decode(ids, skip_special_tokens=True) for ids in generated)

100%|█████████████████████████████████████████████████████████████████████████████████████████| 997/997 [08:22<00:00,  1.99it/s]


In [16]:
# Keep only non-empty hypotheses, each paired with the reference at the same
# index, so that the hypothesis and reference lists stay aligned.
kept_pairs = [(hyp, right[idx]) for idx, hyp in enumerate(results) if len(hyp)]
filtered_left = [hyp for hyp, _ in kept_pairs]
filtered_right = [ref for _, ref in kept_pairs]

In [17]:
# Single reference stream (English this time), wrapped in the outer list that
# corpus_score expects for its references argument.
refs = [filtered_right]
# NOTE(review): `sys` shadows the name of the stdlib module; it is kept because
# the scoring cells below read this exact name.
sys = filtered_left

In [18]:
# Corpus-level BLEU of the Malay -> English hypotheses against the English
# references, followed by a dump of every field of the score object.
r = bleu.corpus_score(sys, refs)
r.__dict__

{'name': 'BLEU',
 'score': 43.40885318934906,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '72.3/50.5/37.1/27.7 (BP = 0.987 ratio = 0.987 hyp_len = 23258 ref_len = 23570)',
 'bp': 0.9866748376005395,
 'counts': [16813, 11235, 7894, 5606],
 'totals': [23258, 22261, 21264, 20267],
 'sys_len': 23258,
 'ref_len': 23570,
 'precisions': [72.28910482414653,
  50.46943084317866,
  37.12377727614748,
  27.66072926432131],
 'prec_str': '72.3/50.5/37.1/27.7',
 'ratio': 0.9867628341111583}

In [19]:
# Corpus-level chrF++ for the same Malay -> English hypotheses and references.
chrf.corpus_score(sys, refs)

chrF2++ = 65.44