In [1]:
import os

# Blank out CUDA_VISIBLE_DEVICES for this process -- presumably so that the
# rest of the notebook runs on CPU only (TODO confirm intent).
os.environ.update({'CUDA_VISIBLE_DEVICES': ''})

In [2]:
import re
from glob import glob


def checkpoint_step(path):
    """Return the integer step encoded in a ``.../checkpoint-<step>`` path.

    Paths that do not end in ``checkpoint-<digits>`` yield -1 so that they
    sort before every real checkpoint.
    """
    match = re.search(r'checkpoint-(\d+)$', path)
    return int(match.group(1)) if match else -1


# Sort by numeric step rather than by string: a plain lexicographic sort would
# put 'checkpoint-1000000' before 'checkpoint-910000', and `checkpoints[-1]`
# would then no longer be the most recent checkpoint. The path is used as a
# tie-breaker to keep the order deterministic.
checkpoints = sorted(
    glob('finetune-t5-small-standard-bahasa-cased-austronesian/checkpoint-*'),
    key = lambda path: (checkpoint_step(path), path),
)
checkpoints

['finetune-t5-small-standard-bahasa-cased-austronesian/checkpoint-910000',
 'finetune-t5-small-standard-bahasa-cased-austronesian/checkpoint-920000',
 'finetune-t5-small-standard-bahasa-cased-austronesian/checkpoint-930000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The tokenizer is loaded from the hub repo named below, not from the local
# fine-tuning checkpoint directory.
base_model_id = 'mesolitica/t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(base_model_id)

In [4]:
# Load the weights from the last entry of the `checkpoints` list.
last_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)

In [5]:
# Malay -> Indonesian demo on an informal, code-switched sentence.
input_ids = tokenizer.encode('terjemah Melayu ke Indonesia: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:', return_tensors = 'pt')
# `temperature` was dropped from this call: sampling is not enabled (no
# do_sample=True), so the value was not used for decoding and only suggested
# a stochastic setup that is not actually in effect.
outputs = model.generate(input_ids, max_length = 100)
# Special tokens are kept in the printed text (no skip_special_tokens here).
print(tokenizer.decode(outputs[0]))

<pad> Hai orang! Saya perhatikan kemarin & hari ini banyak yang mendapatkan cookie ini, bukan. Jadi hari ini saya ingin membagikan beberapa post mortem dari kelompok pertama kami:</s>


In [6]:
# Malay -> Javanese demo on a short sentence; output is capped at max_length = 100.
prompt = 'terjemah Melayu ke Jawa: saya tak suka ikan keli dan ayam goreng'
encoded = tokenizer.encode(prompt, return_tensors = 'pt')
generated = model.generate(encoded, max_length = 100)
print(tokenizer.decode(generated[0]))

<pad> Aku ora seneng lele lan pitik goreng</s>


In [7]:
# Malay -> Indonesian demo on the same short sentence; output is capped at max_length = 100.
prompt = 'terjemah Melayu ke Indonesia: saya tak suka ikan keli dan ayam goreng'
encoded = tokenizer.encode(prompt, return_tensors = 'pt')
generated = model.generate(encoded, max_length = 100)
print(tokenizer.decode(generated[0]))

<pad> Saya tidak suka lele dan ayam goreng</s>


In [8]:
# Indonesian -> Malay demo (reverse direction); output is capped at max_length = 100.
prompt = 'terjemah Indonesia ke Melayu: Hai orang! Saya perhatikan kemarin & hari ini banyak yang mendapatkan cookie ini. Jadi hari ini saya ingin berbagi beberapa post mortem dari kelompok pertama kami:'
encoded = tokenizer.encode(prompt, return_tensors = 'pt')
generated = model.generate(encoded, max_length = 100)
print(tokenizer.decode(generated[0]))

<pad> Wahai orang! Saya perhatikan semalam & hari ini ramai yang dapat kuih ini. Jadi hari ini saya ingin berkongsi beberapa post mortem dari kumpulan pertama kami:</s>


In [9]:
# Upload the model weights to the Hugging Face Hub. The organization namespace
# is given as part of the repo id; the separate `organization=` keyword is
# deprecated in transformers in favour of this `<org>/<name>` form.
model.push_to_hub('mesolitica/finetune-translation-austronesian-t5-small-standard-bahasa-cased')



CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-translation-austronesian-t5-small-standard-bahasa-cased/commit/11d47b0fe8634ba967e9c82230b52fec72617eeb', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='11d47b0fe8634ba967e9c82230b52fec72617eeb', pr_url=None, pr_revision=None, pr_num=None)

In [10]:
# Upload the tokenizer files to the same Hub repository as the model. The
# organization namespace is given as part of the repo id; the separate
# `organization=` keyword is deprecated in transformers.
tokenizer.push_to_hub('mesolitica/finetune-translation-austronesian-t5-small-standard-bahasa-cased')

CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-translation-austronesian-t5-small-standard-bahasa-cased/commit/0f7c951b017dbca0d0d5429c4f9fd2b94cfd3423', commit_message='Upload tokenizer', commit_description='', oid='0f7c951b017dbca0d0d5429c4f9fd2b94cfd3423', pr_url=None, pr_revision=None, pr_num=None)

In [11]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Metric objects reused by every scoring cell below. CHRF is configured with
# word_order = 2; its scores are printed under the label "chrF2++".
bleu, chrf = BLEU(), CHRF(word_order = 2)

In [12]:
from unidecode import unidecode

def read_lines(path):
    """Read a text file and return its lines without the trailing newline.

    The file is decoded explicitly as UTF-8 instead of relying on the platform
    default encoding. Iterating over the file keeps the final line even when
    the file does not end with a newline, whereas ``read().split('\\n')[:-1]``
    would silently drop that last sentence.
    """
    with open(path, encoding = 'utf-8') as fopen:
        return [line.rstrip('\n') for line in fopen]


ind = read_lines('ind_Latn.dev')
jav = read_lines('jav_Latn.dev')
ms = read_lines('zsm_Latn.dev')

# Later cells score translations of one list against the entry at the same
# index in another list, so the three files must have the same line count.
assert len(ind) == len(jav) == len(ms), 'dev files are not line-aligned'

In [13]:
# Run every sentence of each corpus through unidecode and rebind the same
# three names to the transliterated lists.
ind, jav, ms = (
    [unidecode(sentence) for sentence in corpus]
    for corpus in (ind, jav, ms)
)

In [31]:
from tqdm import tqdm

# Number of sentences sent to `model.generate` per call; also read by the
# other translation loops in this notebook.
batch_size = 1

# Indonesian -> Malay over the whole `ind` list.
results = []
for start in tqdm(range(0, len(ind), batch_size)):
    features = []
    for sentence in ind[start:start + batch_size]:
        encoded = tokenizer.encode(f'terjemah Indonesia ke Melayu: {sentence}', return_tensors = 'pt')
        # encode(..., return_tensors = 'pt') is indexed with [0] to obtain one
        # sequence per example before padding the batch together.
        features.append({'input_ids': encoded[0]})
    batch = tokenizer.pad(features, padding = 'longest')
    generated = model.generate(**batch, max_length = 512)
    results.extend(tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated)

100%|█████████████████████████████████████████| 997/997 [04:13<00:00,  3.93it/s]


In [32]:
# Pair every non-empty hypothesis with the Malay reference at the same index;
# empty generations are excluded from scoring together with their reference.
pairs = [(hypothesis, ms[idx]) for idx, hypothesis in enumerate(results) if len(hypothesis)]
hypotheses = [hypothesis for hypothesis, _ in pairs]
references = [[reference for _, reference in pairs]]  # one reference stream

bleu_result = bleu.corpus_score(hypotheses, references)
# Show all BLEU fields next to the chrF score.
vars(bleu_result), chrf.corpus_score(hypotheses, references)

({'name': 'BLEU',
  'score': 30.24358980824753,
  '_mean': -1.0,
  '_ci': -1.0,
  '_verbose': '61.1/36.9/23.8/15.6 (BP = 1.000 ratio = 1.052 hyp_len = 23174 ref_len = 22027)',
  'bp': 1.0,
  'counts': [14159, 8189, 5042, 3144],
  'totals': [23174, 22177, 21180, 20183],
  'sys_len': 23174,
  'ref_len': 22027,
  'precisions': [61.098645033226894,
   36.925643684898766,
   23.80547686496695,
   15.577466184412625],
  'prec_str': '61.1/36.9/23.8/15.6',
  'ratio': 1.0520724565306214},
 chrF2++ = 58.43)

In [33]:
# Malay -> Indonesian over the whole `ms` list.
results = []
for start in tqdm(range(0, len(ms), batch_size)):
    features = []
    for sentence in ms[start:start + batch_size]:
        encoded = tokenizer.encode(f'terjemah Melayu ke Indonesia: {sentence}', return_tensors = 'pt')
        # [0] picks the single encoded sequence for this example.
        features.append({'input_ids': encoded[0]})
    batch = tokenizer.pad(features, padding = 'longest')
    generated = model.generate(**batch, max_length = 512)
    results.extend(tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated)

100%|█████████████████████████████████████████| 997/997 [04:19<00:00,  3.85it/s]


In [34]:
# Pair every non-empty hypothesis with the Indonesian reference at the same
# index; empty generations are excluded together with their reference.
pairs = [(hypothesis, ind[idx]) for idx, hypothesis in enumerate(results) if len(hypothesis)]
hypotheses = [hypothesis for hypothesis, _ in pairs]
references = [[reference for _, reference in pairs]]  # one reference stream

bleu_result = bleu.corpus_score(hypotheses, references)
# Show all BLEU fields next to the chrF score.
vars(bleu_result), chrf.corpus_score(hypotheses, references)

({'name': 'BLEU',
  'score': 35.95448072225675,
  '_mean': -1.0,
  '_ci': -1.0,
  '_verbose': '66.3/42.6/29.1/20.3 (BP = 1.000 ratio = 1.014 hyp_len = 22164 ref_len = 21856)',
  'bp': 1.0,
  'counts': [14691, 9018, 5871, 3898],
  'totals': [22164, 21167, 20170, 19173],
  'sys_len': 22164,
  'ref_len': 21856,
  'precisions': [66.28316188413643,
   42.60405347947277,
   29.10758552305404,
   20.330673342721536],
  'prec_str': '66.3/42.6/29.1/20.3',
  'ratio': 1.0140922401171304},
 chrF2++ = 61.02)

In [35]:
# Malay -> Javanese over the whole `ms` list.
results = []
for start in tqdm(range(0, len(ms), batch_size)):
    features = []
    for sentence in ms[start:start + batch_size]:
        encoded = tokenizer.encode(f'terjemah Melayu ke Jawa: {sentence}', return_tensors = 'pt')
        # [0] picks the single encoded sequence for this example.
        features.append({'input_ids': encoded[0]})
    batch = tokenizer.pad(features, padding = 'longest')
    generated = model.generate(**batch, max_length = 512)
    results.extend(tokenizer.decode(sequence, skip_special_tokens=True) for sequence in generated)

100%|█████████████████████████████████████████| 997/997 [06:41<00:00,  2.48it/s]


In [36]:
# Pair every non-empty hypothesis with the Javanese reference at the same
# index; empty generations are excluded together with their reference.
pairs = [(hypothesis, jav[idx]) for idx, hypothesis in enumerate(results) if len(hypothesis)]
hypotheses = [hypothesis for hypothesis, _ in pairs]
references = [[reference for _, reference in pairs]]  # one reference stream

bleu_result = bleu.corpus_score(hypotheses, references)
# Show all BLEU fields next to the chrF score.
vars(bleu_result), chrf.corpus_score(hypotheses, references)

({'name': 'BLEU',
  'score': 24.599989427145964,
  '_mean': -1.0,
  '_ci': -1.0,
  '_verbose': '58.3/31.6/18.5/11.2 (BP = 0.990 ratio = 0.990 hyp_len = 21391 ref_len = 21609)',
  'bp': 0.9898605524286408,
  'counts': [12475, 6440, 3580, 2065],
  'totals': [21391, 20394, 19397, 18400],
  'sys_len': 21391,
  'ref_len': 21609,
  'precisions': [58.31891917161423,
   31.577915073060705,
   18.45646233953704,
   11.222826086956522],
  'prec_str': '58.3/31.6/18.5/11.2',
  'ratio': 0.9899116109028645},
 chrF2++ = 51.65)

In [37]:
# Javanese -> Malay over the whole `jav` list.
results = []
# The loop bound is taken from `jav`, the list that is actually sliced below.
# It previously used len(ms), which only worked because both lists happen to
# have the same length.
for i in tqdm(range(0, len(jav), batch_size)):
    # Each example is encoded with its task prefix; [0] picks the single
    # encoded sequence so the batch can be padded together afterwards.
    input_ids = [{'input_ids': tokenizer.encode(f'terjemah Jawa ke Melayu: {s}', return_tensors = 'pt')[0]} for s in jav[i:i + batch_size]]
    padded = tokenizer.pad(input_ids, padding = 'longest')
    outputs = model.generate(**padded, max_length = 512)
    for o in outputs:
        results.append(tokenizer.decode(o, skip_special_tokens=True))

100%|█████████████████████████████████████████| 997/997 [04:14<00:00,  3.92it/s]


In [38]:
# Pair every non-empty hypothesis with the Malay reference at the same index;
# empty generations are excluded from scoring together with their reference.
pairs = [(hypothesis, ms[idx]) for idx, hypothesis in enumerate(results) if len(hypothesis)]
hypotheses = [hypothesis for hypothesis, _ in pairs]
references = [[reference for _, reference in pairs]]  # one reference stream

bleu_result = bleu.corpus_score(hypotheses, references)
# Show all BLEU fields next to the chrF score.
vars(bleu_result), chrf.corpus_score(hypotheses, references)

({'name': 'BLEU',
  'score': 25.24437731940083,
  '_mean': -1.0,
  '_ci': -1.0,
  '_verbose': '57.7/31.9/19.0/11.6 (BP = 1.000 ratio = 1.022 hyp_len = 22516 ref_len = 22027)',
  'bp': 1.0,
  'counts': [12999, 6872, 3909, 2258],
  'totals': [22516, 21519, 20522, 19525],
  'sys_len': 22516,
  'ref_len': 22027,
  'precisions': [57.732279268076034,
   31.934569450253264,
   19.04785108663873,
   11.564660691421254],
  'prec_str': '57.7/31.9/19.0/11.6',
  'ratio': 1.0222000272392973},
 chrF2++ = 52.58)