In [3]:
import os

# Set CUDA_VISIBLE_DEVICES to an empty string for this process
# (presumably to keep the whole run on CPU — set before any model library is imported).
os.environ.update(CUDA_VISIBLE_DEVICES='')

In [4]:
from glob import glob

def checkpoint_step(path):
    """Return the integer step encoded after the last '-' of a checkpoint path.

    Paths whose suffix is not purely numeric get -1, so they sort before every
    real step and are never picked up as the latest checkpoint.
    """
    suffix = path.rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdecimal() else -1


# Order by numeric step rather than by string: a plain string sort would put
# 'checkpoint-999999' after 'checkpoint-1680000', so `checkpoints[-1]` (used as
# the "latest" checkpoint when the model is loaded) could point at an older one.
checkpoints = sorted(
    glob('finetune-t5-super-tiny-standard-bahasa-cased/checkpoint-*'),
    key=checkpoint_step,
)
checkpoints

['finetune-t5-super-tiny-standard-bahasa-cased/checkpoint-1590000',
 'finetune-t5-super-tiny-standard-bahasa-cased/checkpoint-1600000',
 'finetune-t5-super-tiny-standard-bahasa-cased/checkpoint-1610000',
 'finetune-t5-super-tiny-standard-bahasa-cased/checkpoint-1620000',
 'finetune-t5-super-tiny-standard-bahasa-cased/checkpoint-1630000',
 'finetune-t5-super-tiny-standard-bahasa-cased/checkpoint-1640000',
 'finetune-t5-super-tiny-standard-bahasa-cased/checkpoint-1650000',
 'finetune-t5-super-tiny-standard-bahasa-cased/checkpoint-1660000',
 'finetune-t5-super-tiny-standard-bahasa-cased/checkpoint-1670000',
 'finetune-t5-super-tiny-standard-bahasa-cased/checkpoint-1680000']

In [5]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The tokenizer comes from the published `t5-small-standard-bahasa-cased` repo,
# while the weights are read from the last entry of `checkpoints`.
tokenizer = T5Tokenizer.from_pretrained('mesolitica/t5-small-standard-bahasa-cased')
latest_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)

In [6]:
# Smoke test: translate one code-switched sentence and print the raw decoding
# (special tokens are left in, as the printed output shows).
prompt = (
    'terjemah Inggeris ke Melayu: '
    'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. '
    'So harini i nak share some post mortem of our first batch:'
)
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
first_sequence = outputs[0]
print(tokenizer.decode(first_sequence))

<pad> Hai guys! Saya perhatikan semalam & harini banyak yang dapat kuki ini. Jadi hari ini saya ingin berkongsi beberapa mortem kumpulan pertama kami:</s>


In [7]:
# Upload the loaded model to the Hugging Face Hub repo of this name under the
# `mesolitica` organisation; the cell output is the resulting commit URL.
# NOTE(review): newer transformers releases reportedly deprecate `organization=`
# in favour of a namespaced repo id — confirm against the installed version.
model.push_to_hub('finetune-translation-t5-super-tiny-standard-bahasa-cased', organization='mesolitica')

Cloning https://huggingface.co/mesolitica/finetune-translation-t5-super-tiny-standard-bahasa-cased into local empty directory.


Upload file pytorch_model.bin:   0%|          | 32.0k/48.4M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-translation-t5-super-tiny-standard-bahasa-cased
   3c78d0d..89cbc34  main -> main



'https://huggingface.co/mesolitica/finetune-translation-t5-super-tiny-standard-bahasa-cased/commit/89cbc349e8f5de8b9671af76d0a84c276de9c844'

In [8]:
# Upload the tokenizer files to the same Hub repo as the model so the published
# repo is self-contained; the cell output is the resulting commit URL.
# NOTE(review): newer transformers releases reportedly deprecate `organization=`
# in favour of a namespaced repo id — confirm against the installed version.
tokenizer.push_to_hub('finetune-translation-t5-super-tiny-standard-bahasa-cased', organization='mesolitica')

Upload file spiece.model:   4%|4         | 32.0k/784k [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-translation-t5-super-tiny-standard-bahasa-cased
   89cbc34..03663f3  main -> main



'https://huggingface.co/mesolitica/finetune-translation-t5-super-tiny-standard-bahasa-cased/commit/03663f3af9716b6c5df3c062bed567f82129d815'

In [9]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Metric objects reused for both translation directions.
# BLEU is created with its default settings.
bleu = BLEU()
# word_order = 2 adds word n-grams to chrF; the scores below are reported as "chrF2++".
chrf = CHRF(word_order = 2)

In [10]:
from unidecode import unidecode

def read_sentences(path):
    """Return the lines of a UTF-8 text file as a list, without their newlines.

    The file is read with an explicit encoding (the platform default is not
    guaranteed to be UTF-8) and iterated line by line, so the last sentence is
    kept whether or not the file ends with a trailing newline.
    """
    with open(path, encoding='utf-8') as fopen:
        return [line.rstrip('\n') for line in fopen]


eng = read_sentences('eng_Latn.dev')
ms = read_sentences('zsm_Latn.dev')

# The two files are used as a line-aligned parallel corpus (later cells pair
# entries by index), so a length mismatch means the data is unusable.
assert len(eng) == len(ms), f'parallel files differ in length: {len(eng)} vs {len(ms)}'

# Transliterate both sides to ASCII. For the first direction evaluated,
# `left` is the English source side and `right` the Malay reference side.
right = [unidecode(s) for s in ms]
left = [unidecode(s) for s in eng]

In [11]:
# Sanity check: show the last Malay (`right`) / English (`left`) sentence pair
# to confirm the two files line up.
right[-1], left[-1]

('Dalam semua kes, anda mesti menempah melalui telefon secara terus dengan syarikat penerbangan itu.',
 'In all cases, you must book by phone directly with the airline.')

In [12]:
from tqdm import tqdm

# Translate every sentence in `left` (English) into Malay, one chunk of
# `batch_size` sentences per `generate` call.
batch_size = 1

results = []
for start in tqdm(range(0, len(left), batch_size)):
    chunk = left[start:start + batch_size]
    # Tokenise each prompt on its own, then pad the chunk to its longest sequence.
    features = []
    for sentence in chunk:
        token_ids = tokenizer.encode(f'terjemah Inggeris ke Melayu: {sentence}', return_tensors = 'pt')[0]
        features.append({'input_ids': token_ids})
    padded = tokenizer.pad(features, padding = 'longest')
    generated = model.generate(**padded, max_length = 1000)
    # Special tokens are stripped so the text can be scored directly.
    results.extend(tokenizer.decode(seq, skip_special_tokens=True) for seq in generated)

100%|█████████████████████████████████████████| 997/997 [00:54<00:00, 18.13it/s]


In [13]:
# Number of translations produced by the loop above.
len(results)

997

In [14]:
# Keep only the (hypothesis, reference) pairs whose hypothesis is non-empty.
# NOTE(review): pairs with an empty model output are excluded from scoring
# rather than counted as misses — confirm that this is intended.
kept = [idx for idx, hyp in enumerate(results) if len(hyp)]
filtered_left = [results[idx] for idx in kept]    # model outputs
filtered_right = [right[idx] for idx in kept]     # references at the same positions

In [15]:
# Inputs for the `corpus_score(sys, refs)` calls that follow: `refs` wraps the
# single reference list in an outer list, `sys` holds the model outputs.
# NOTE(review): `sys` shares its name with the standard-library module; no
# visible cell imports that module, but a rename would be safer — TODO confirm.
refs = [filtered_right]
sys = filtered_left

In [16]:
# Corpus-level BLEU of the model outputs against the references; the last line
# displays every field of the score object for inspection.
r = bleu.corpus_score(sys, refs)
vars(r)

{'name': 'BLEU',
 'score': 39.18834189893951,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '72.6/48.3/33.5/23.6 (BP = 0.960 ratio = 0.961 hyp_len = 21172 ref_len = 22027)',
 'bp': 0.9604210226409274,
 'counts': [15376, 9741, 6434, 4284],
 'totals': [21172, 20175, 19178, 18181],
 'sys_len': 21172,
 'ref_len': 22027,
 'precisions': [72.62422066880787,
  48.28252788104089,
  33.54885806653457,
  23.563060337715196],
 'prec_str': '72.6/48.3/33.5/23.6',
 'ratio': 0.9611840014527625}

In [17]:
# Corpus-level chrF (built with word_order = 2) over the same outputs and references.
chrf.corpus_score(sys, refs)

chrF2++ = 64.03

In [18]:
# Reverse the direction: Malay is now the source side (`left`) and English the
# reference side (`right`). Both are rebuilt from the raw lists, so this cell
# gives the same result however many times it is run.
left = list(map(unidecode, ms))
right = list(map(unidecode, eng))

In [19]:
# Translate every sentence in `left` (Malay) into English, one chunk of
# `batch_size` sentences per `generate` call.
batch_size = 1

results = []
for start in tqdm(range(0, len(left), batch_size)):
    chunk = left[start:start + batch_size]
    # Tokenise each prompt on its own, then pad the chunk to its longest sequence.
    features = []
    for sentence in chunk:
        token_ids = tokenizer.encode(f'terjemah Melayu ke Inggeris: {sentence}', return_tensors = 'pt')[0]
        features.append({'input_ids': token_ids})
    padded = tokenizer.pad(features, padding = 'longest')
    generated = model.generate(**padded, max_length = 1000)
    # Special tokens are stripped so the text can be scored directly.
    results.extend(tokenizer.decode(seq, skip_special_tokens=True) for seq in generated)

100%|█████████████████████████████████████████| 997/997 [00:58<00:00, 17.11it/s]


In [20]:
# Keep only the (hypothesis, reference) pairs whose hypothesis is non-empty.
# NOTE(review): pairs with an empty model output are excluded from scoring
# rather than counted as misses — confirm that this is intended.
kept = [idx for idx, hyp in enumerate(results) if len(hyp)]
filtered_left = [results[idx] for idx in kept]    # model outputs
filtered_right = [right[idx] for idx in kept]     # references at the same positions

In [21]:
# Inputs for the `corpus_score(sys, refs)` calls that follow: `refs` wraps the
# single reference list in an outer list, `sys` holds the model outputs.
# NOTE(review): `sys` shares its name with the standard-library module; no
# visible cell imports that module, but a rename would be safer — TODO confirm.
refs = [filtered_right]
sys = filtered_left

In [22]:
# Corpus-level BLEU of the model outputs against the references; the last line
# displays every field of the score object for inspection.
r = bleu.corpus_score(sys, refs)
vars(r)

{'name': 'BLEU',
 'score': 34.10561487832948,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '67.3/41.6/27.8/18.7 (BP = 0.982 ratio = 0.982 hyp_len = 23139 ref_len = 23570)',
 'bp': 0.9815458410942027,
 'counts': [15569, 9216, 5871, 3777],
 'totals': [23139, 22142, 21145, 20148],
 'sys_len': 23139,
 'ref_len': 23570,
 'precisions': [67.28467090194044,
  41.62225634540692,
  27.765429179475053,
  18.746277546158428],
 'prec_str': '67.3/41.6/27.8/18.7',
 'ratio': 0.9817140432753501}

In [23]:
# Corpus-level chrF (built with word_order = 2) over the same outputs and references.
chrf.corpus_score(sys, refs)

chrF2++ = 59.18