In [1]:
import os

# Set CUDA_VISIBLE_DEVICES to an empty string before any deep-learning library
# is imported -- presumably to hide every GPU so the notebook runs on CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [2]:
from glob import glob

def checkpoint_step(path):
    """Return the training step encoded after the last '-' of a checkpoint path.

    Paths whose suffix is not purely numeric get -1 so they sort before every
    real step instead of raising.
    """
    suffix = path.rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Sort by the numeric step, not by the raw string: a plain string sort would
# place 'checkpoint-990000' after 'checkpoint-2370000', so `checkpoints[-1]`
# would not be the most recent checkpoint once step counts differ in width.
# The path itself is the tie-breaker to keep the order deterministic.
checkpoints = sorted(
    glob('finetune-t5-small-standard-bahasa-cased/checkpoint-*'),
    key=lambda path: (checkpoint_step(path), path),
)
checkpoints

['finetune-t5-small-standard-bahasa-cased/checkpoint-2280000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-2290000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-2300000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-2310000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-2320000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-2330000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-2340000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-2350000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-2360000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-2370000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The tokenizer is loaded from the published base model on the Hub, while the
# weights are loaded from the last entry of the local `checkpoints` list.
tokenizer_source = 'mesolitica/t5-small-standard-bahasa-cased'
latest_checkpoint = checkpoints[-1]

tokenizer = T5Tokenizer.from_pretrained(tokenizer_source)
model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)

In [4]:
# Smoke test: translate one code-switched sentence using the
# English -> Malay task prefix. Special tokens are left in the printed text.
prompt = 'terjemah Inggeris ke Melayu: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
input_ids = tokenizer.encode(prompt, return_tensors='pt')
outputs = model.generate(input_ids, max_length=100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Hai kawan-kawan! Saya perhatikan semalam & harini dah banyak yang dapat kuki ni kan. Jadi harini saya ingin berkongsi beberapa post mortem kumpulan pertama kami:</s>


In [5]:
# Publish the model weights to the Hub repository owned by the `mesolitica`
# organization; the call's return value is displayed as the cell output.
hub_repo_name = 'finetune-translation-t5-small-standard-bahasa-cased'
model.push_to_hub(hub_repo_name, organization='mesolitica')

Upload file pytorch_model.bin:   0%|          | 32.0k/231M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-translation-t5-small-standard-bahasa-cased
   acbad49..e5bec7c  main -> main



'https://huggingface.co/mesolitica/finetune-translation-t5-small-standard-bahasa-cased/commit/e5bec7c903bfb0d39848893466f6c78debf2b4df'

In [6]:
# Publish the tokenizer files to the same Hub repository name under the
# `mesolitica` organization.
hub_repo_name = 'finetune-translation-t5-small-standard-bahasa-cased'
tokenizer.push_to_hub(hub_repo_name, organization='mesolitica')

In [5]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Metric objects reused for both translation directions.
# BLEU is built with its default settings; CHRF with word_order=2 is the
# configuration whose score is printed as "chrF2++" further down.
bleu = BLEU()
chrf = CHRF(word_order = 2)

In [6]:
from unidecode import unidecode

def read_lines(path):
    """Read a UTF-8 text file and return its lines without the line terminator.

    Unlike `read().split('\n')[:-1]`, this keeps the final line even when the
    file does not end with a newline, and it does not depend on the platform's
    default text encoding.
    """
    with open(path, encoding='utf-8') as fopen:
        return [line.rstrip('\n') for line in fopen]


eng = read_lines('eng_Latn.dev')
ms = read_lines('zsm_Latn.dev')

# The two files are used as line-aligned parallel text below; a length
# mismatch would silently pair the wrong sentences when scoring.
assert len(eng) == len(ms), (
    f'parallel files differ in length: {len(eng)} English vs {len(ms)} Malay lines'
)

# `right` is the Malay side (reference), `left` is the English side (source),
# both transliterated to ASCII with unidecode.
right = [unidecode(s) for s in ms]
left = [unidecode(s) for s in eng]

In [7]:
# Spot-check the last sentence pair: `right` holds the Malay text and `left`
# the English text at this point in the notebook.
right[-1], left[-1]

('Dalam semua kes, anda mesti menempah melalui telefon secara terus dengan syarikat penerbangan itu.',
 'In all cases, you must book by phone directly with the airline.')

In [8]:
from tqdm import tqdm

# Translate every sentence in `left` with the English -> Malay task prefix,
# collecting one decoded string per generated sequence in `results`.
batch_size = 1

results = []
for start in tqdm(range(0, len(left), batch_size)):
    batch = left[start:start + batch_size]

    # Encode each prompt on its own; the first row of the returned tensor is
    # wrapped in a dict so that `tokenizer.pad` can collate the batch.
    input_ids = []
    for sentence in batch:
        encoded = tokenizer.encode(f'terjemah Inggeris ke Melayu: {sentence}', return_tensors = 'pt')
        input_ids.append({'input_ids': encoded[0]})

    padded = tokenizer.pad(input_ids, padding = 'longest')
    outputs = model.generate(**padded, max_length = 1000)
    results.extend(
        tokenizer.decode(sequence, skip_special_tokens=True) for sequence in outputs
    )

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 997/997 [03:20<00:00,  4.98it/s]


In [11]:
# Number of decoded translations collected by the generation loop.
len(results)

997

In [12]:
# Keep only the positions where the model produced a non-empty translation.
# `filtered_left` receives the model outputs and `filtered_right` the
# reference sentences found at the same positions in `right`.
kept_positions = [position for position, text in enumerate(results) if len(text)]
filtered_left = [results[position] for position in kept_positions]
filtered_right = [right[position] for position in kept_positions]

In [13]:
# `refs` is a list containing a single list of reference sentences; `sys`
# holds the system outputs that are scored against it.
# NOTE(review): the name `sys` shadows the standard-library module of the same
# name; the scoring cells read it, so a rename must touch all of them together.
refs = [filtered_right]
sys = filtered_left

In [14]:
# Corpus-level BLEU for the current `sys` / `refs`; the score object's
# attribute dictionary is displayed so every field is visible.
r = bleu.corpus_score(sys, refs)
vars(r)

{'name': 'BLEU',
 'score': 43.93729753370648,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '74.9/52.2/37.9/27.7 (BP = 0.976 ratio = 0.977 hyp_len = 21510 ref_len = 22027)',
 'bp': 0.9762512158466284,
 'counts': [16101, 10712, 7389, 5134],
 'totals': [21510, 20513, 19516, 18519],
 'sys_len': 21510,
 'ref_len': 22027,
 'precisions': [74.85355648535565,
  52.220543070248134,
  37.86124205779873,
  27.722879205140668],
 'prec_str': '74.9/52.2/37.9/27.7',
 'ratio': 0.9765288055568166}

In [15]:
# chrF score over the same system outputs and references used for BLEU.
chrf.corpus_score(sys, refs)

chrF2++ = 67.43

In [16]:
# Swap roles for the reverse direction: `left` is now the Malay text and
# `right` the English text.
# NOTE(review): these names are rebound in place, so re-running an earlier
# cell that reads `left` / `right` after this one will see the swapped data.
left = list(map(unidecode, ms))
right = list(map(unidecode, eng))

In [17]:
# Translate every sentence in `left` with the Malay -> English task prefix,
# collecting one decoded string per generated sequence in `results`.
batch_size = 1

results = []
for start in tqdm(range(0, len(left), batch_size)):
    batch = left[start:start + batch_size]

    # Encode each prompt on its own; the first row of the returned tensor is
    # wrapped in a dict so that `tokenizer.pad` can collate the batch.
    input_ids = []
    for sentence in batch:
        encoded = tokenizer.encode(f'terjemah Melayu ke Inggeris: {sentence}', return_tensors = 'pt')
        input_ids.append({'input_ids': encoded[0]})

    padded = tokenizer.pad(input_ids, padding = 'longest')
    outputs = model.generate(**padded, max_length = 1000)
    results.extend(
        tokenizer.decode(sequence, skip_special_tokens=True) for sequence in outputs
    )

100%|█████████████████████████████████████████████████████████████████████████████████████████████| 997/997 [03:55<00:00,  4.24it/s]


In [22]:
# Keep only the positions where the model produced a non-empty translation.
# `filtered_left` receives the model outputs and `filtered_right` the
# reference sentences found at the same positions in `right`.
kept_positions = [position for position, text in enumerate(results) if len(text)]
filtered_left = [results[position] for position in kept_positions]
filtered_right = [right[position] for position in kept_positions]

In [23]:
# `refs` is a list containing a single list of reference sentences; `sys`
# holds the system outputs that are scored against it.
# NOTE(review): the name `sys` shadows the standard-library module of the same
# name; the scoring cells read it, so a rename must touch all of them together.
refs = [filtered_right]
sys = filtered_left

In [24]:
# Corpus-level BLEU for the current `sys` / `refs`; the score object's
# attribute dictionary is displayed so every field is visible.
r = bleu.corpus_score(sys, refs)
vars(r)

{'name': 'BLEU',
 'score': 42.01021763049599,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '71.7/49.0/35.6/26.1 (BP = 0.989 ratio = 0.989 hyp_len = 23302 ref_len = 23570)',
 'bp': 0.988564726798463,
 'counts': [16700, 10937, 7587, 5294],
 'totals': [23302, 22305, 21308, 20311],
 'sys_len': 23302,
 'ref_len': 23570,
 'precisions': [71.6676680113295,
  49.03384891279982,
  35.60634503472874,
  26.06469400817291],
 'prec_str': '71.7/49.0/35.6/26.1',
 'ratio': 0.9886296139159949}

In [25]:
# chrF score over the same system outputs and references used for BLEU.
chrf.corpus_score(sys, refs)

chrF2++ = 64.67