In [1]:
import os

# Hide every CUDA device from this process so the model below runs on CPU.
os.environ.update(CUDA_VISIBLE_DEVICES='')

In [2]:
from glob import glob

def checkpoint_step(path):
    """Return the training step encoded in a ``.../checkpoint-<step>`` path.

    Paths whose suffix after the last ``-`` is not a plain number yield -1,
    so they sort before every real checkpoint instead of raising.
    """
    suffix = path.rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdecimal() else -1


# Sort by the numeric step, not by the raw string: a plain string sort puts
# 'checkpoint-990000' after 'checkpoint-1420000', which would make
# checkpoints[-1] pick the wrong weights once step counts differ in length.
checkpoints = sorted(
    glob('finetune-t5-super-super-tiny-standard-bahasa-cased/checkpoint-*'),
    key=lambda path: (checkpoint_step(path), path),
)
checkpoints

['finetune-t5-super-super-tiny-standard-bahasa-cased/checkpoint-1330000',
 'finetune-t5-super-super-tiny-standard-bahasa-cased/checkpoint-1340000',
 'finetune-t5-super-super-tiny-standard-bahasa-cased/checkpoint-1350000',
 'finetune-t5-super-super-tiny-standard-bahasa-cased/checkpoint-1360000',
 'finetune-t5-super-super-tiny-standard-bahasa-cased/checkpoint-1370000',
 'finetune-t5-super-super-tiny-standard-bahasa-cased/checkpoint-1380000',
 'finetune-t5-super-super-tiny-standard-bahasa-cased/checkpoint-1390000',
 'finetune-t5-super-super-tiny-standard-bahasa-cased/checkpoint-1400000',
 'finetune-t5-super-super-tiny-standard-bahasa-cased/checkpoint-1410000',
 'finetune-t5-super-super-tiny-standard-bahasa-cased/checkpoint-1420000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The tokenizer is loaded by name from the published small model, while the
# weights are loaded from the last entry of the local `checkpoints` list.
TOKENIZER_NAME = 'mesolitica/t5-small-standard-bahasa-cased'
latest_checkpoint = checkpoints[-1]

tokenizer = T5Tokenizer.from_pretrained(TOKENIZER_NAME)
model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)

In [4]:
# Quick sanity check on one code-switched sentence before publishing.
prompt = 'terjemah Inggeris ke Melayu: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)

# Decoded without skip_special_tokens, so <pad> / </s> show up in the print.
translation = tokenizer.decode(outputs[0])
print(translation)

<pad> Hai guys! Saya perhatikan semalam & hari ini, saya mahu berkongsi beberapa mortem batch pertama kami:</s>


In [5]:
# Upload the loaded model to the organization's repository on the Hub.
hub_repo = 'finetune-translation-t5-super-super-tiny-standard-bahasa-cased'
model.push_to_hub(hub_repo, organization='mesolitica')

Upload file pytorch_model.bin:   0%|          | 4.00k/22.2M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-translation-t5-super-super-tiny-standard-bahasa-cased
   8e1e2ea..08b1815  main -> main



'https://huggingface.co/mesolitica/finetune-translation-t5-super-super-tiny-standard-bahasa-cased/commit/08b18150cdbe288282c86998a8edfa368eb81fda'

In [6]:
# Upload the tokenizer to the same repository name used for the model.
hub_repo = 'finetune-translation-t5-super-super-tiny-standard-bahasa-cased'
tokenizer.push_to_hub(hub_repo, organization='mesolitica')

In [7]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Metric objects reused for both translation directions below.
# word_order = 2 is what makes the score print as "chrF2++".
bleu, chrf = BLEU(), CHRF(word_order = 2)

In [8]:
from unidecode import unidecode

def read_lines(path):
    """Read a UTF-8 text file and return its lines without the newlines.

    Only a trailing empty string (produced by a final newline) is dropped,
    so the last sentence is kept even when the file has no final newline.
    """
    # Explicit encoding: do not depend on the platform's default codec.
    with open(path, encoding='utf-8') as fopen:
        lines = fopen.read().split('\n')
    if lines[-1] == '':
        lines.pop()
    return lines


# English / Malay dev sentences (presumably the FLORES-200 dev split - confirm).
eng = read_lines('eng_Latn.dev')
ms = read_lines('zsm_Latn.dev')

# Scoring pairs sentences by position, so the two files must be line-aligned.
assert len(eng) == len(ms), f'dev files are not parallel: {len(eng)} vs {len(ms)} lines'

# Both sides are transliterated to ASCII before translation and scoring.
right = [unidecode(s) for s in ms]
left = [unidecode(s) for s in eng]

In [9]:
from tqdm import tqdm

batch_size = 1

# Translate every sentence in `left` (English) to Malay, batch by batch.
results = []
for start in tqdm(range(0, len(left), batch_size)):
    chunk = left[start:start + batch_size]
    # One un-padded id tensor per sentence; tokenizer.pad aligns them.
    encoded = [
        {'input_ids': tokenizer.encode(f'terjemah Inggeris ke Melayu: {s}', return_tensors = 'pt')[0]}
        for s in chunk
    ]
    batch = tokenizer.pad(encoded, padding = 'longest')
    generated = model.generate(**batch, max_length = 1000)
    results.extend(tokenizer.decode(o, skip_special_tokens=True) for o in generated)

100%|█████████████████████████████████████████████████████████████████████████████████████████| 997/997 [00:45<00:00, 21.92it/s]


In [10]:
# Keep only sentences for which the model produced non-empty text, pairing
# each kept hypothesis with the reference at the same position.
kept_pairs = [(hyp, right[idx]) for idx, hyp in enumerate(results) if hyp]
filtered_left = [hyp for hyp, _ in kept_pairs]
filtered_right = [ref for _, ref in kept_pairs]

In [11]:
# `sys` holds the system output; `refs` is a list holding the single
# reference set.
# NOTE: the name `sys` would clash with the stdlib module if it were imported.
sys, refs = filtered_left, [filtered_right]

In [12]:
# Corpus-level BLEU for English -> Malay.
r = bleu.corpus_score(hypotheses=sys, references=refs)

In [13]:
# Show every field of the score object (precisions, brevity penalty, lengths).
vars(r)

{'name': 'BLEU',
 'score': 36.29074311583665,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '71.2/46.0/30.9/21.0 (BP = 0.950 ratio = 0.951 hyp_len = 20958 ref_len = 22027)',
 'bp': 0.9502722319832295,
 'counts': [14919, 9178, 5858, 3780],
 'totals': [20958, 19961, 18964, 17967],
 'sys_len': 20958,
 'ref_len': 22027,
 'precisions': [71.18522759805325,
  45.97966033765844,
  30.890107572242144,
  21.038570712973787],
 'prec_str': '71.2/46.0/30.9/21.0',
 'ratio': 0.9514686521087756}

In [14]:
# Corpus-level chrF for English -> Malay.
chrf.corpus_score(hypotheses=sys, references=refs)

chrF2++ = 61.89

In [15]:
# Reverse the direction: Malay is now the source, English the reference.
left = list(map(unidecode, ms))
right = list(map(unidecode, eng))

In [16]:
batch_size = 1

# Translate every sentence in `left` (Malay at this point) to English.
results = []
for start in tqdm(range(0, len(left), batch_size)):
    chunk = left[start:start + batch_size]
    # One un-padded id tensor per sentence; tokenizer.pad aligns them.
    encoded = [
        {'input_ids': tokenizer.encode(f'terjemah Melayu ke Inggeris: {s}', return_tensors = 'pt')[0]}
        for s in chunk
    ]
    batch = tokenizer.pad(encoded, padding = 'longest')
    generated = model.generate(**batch, max_length = 1000)
    results.extend(tokenizer.decode(o, skip_special_tokens=True) for o in generated)

100%|█████████████████████████████████████████████████████████████████████████████████████████| 997/997 [00:53<00:00, 18.75it/s]


In [17]:
# Keep only sentences for which the model produced non-empty text, pairing
# each kept hypothesis with the reference at the same position.
kept_pairs = [(hyp, right[idx]) for idx, hyp in enumerate(results) if hyp]
filtered_left = [hyp for hyp, _ in kept_pairs]
filtered_right = [ref for _, ref in kept_pairs]

In [18]:
# `sys` holds the system output; `refs` is a list holding the single
# reference set.
# NOTE: the name `sys` would clash with the stdlib module if it were imported.
sys, refs = filtered_left, [filtered_right]

In [19]:
# Corpus-level BLEU for Malay -> English, then every field of the score.
r = bleu.corpus_score(hypotheses=sys, references=refs)
vars(r)

{'name': 'BLEU',
 'score': 30.216143755278946,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '64.9/38.1/24.1/15.3 (BP = 0.978 ratio = 0.978 hyp_len = 23057 ref_len = 23570)',
 'bp': 0.9779964796601237,
 'counts': [14963, 8410, 5082, 3063],
 'totals': [23057, 22060, 21063, 20066],
 'sys_len': 23057,
 'ref_len': 23570,
 'precisions': [64.89569328186668,
  38.12330009066183,
  24.127617148554336,
  15.264626731785109],
 'prec_str': '64.9/38.1/24.1/15.3',
 'ratio': 0.9782350445481545}

In [20]:
# Corpus-level chrF for Malay -> English.
chrf.corpus_score(hypotheses=sys, references=refs)

chrF2++ = 56.46