In [1]:
import os

# Expose only GPU 0 to CUDA; this is set before transformers is imported further down.
os.environ.update(CUDA_VISIBLE_DEVICES='0')

In [2]:
from glob import glob


def checkpoint_step(path):
    """Return the training step encoded at the end of a ``.../checkpoint-<step>`` path.

    Paths whose suffix after the last '-' is not a plain integer yield -1, so
    they sort ahead of every numbered checkpoint instead of raising.
    """
    suffix = path.rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdecimal() else -1


# Order by numeric step rather than by string: a plain sorted() would place
# 'checkpoint-990000' after 'checkpoint-1090000', so checkpoints[-1] would stop
# being the most recent checkpoint once step counts differ in digit count.
# The path itself is the tie-breaker to keep the order deterministic.
checkpoints = sorted(
    glob('finetune-t5-tiny-standard-bahasa-cased-austronesian/checkpoint-*'),
    key = lambda path: (checkpoint_step(path), path),
)
checkpoints

['finetune-t5-tiny-standard-bahasa-cased-austronesian/checkpoint-1070000',
 'finetune-t5-tiny-standard-bahasa-cased-austronesian/checkpoint-1080000',
 'finetune-t5-tiny-standard-bahasa-cased-austronesian/checkpoint-1090000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# NOTE(review): the tokenizer is loaded from the t5-small repo while the model in this
# notebook comes from a t5-tiny fine-tune; presumably both share one vocabulary — TODO confirm.
tokenizer_repo = 'mesolitica/t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(tokenizer_repo)

In [4]:
# Load the weights from the last entry of the `checkpoints` list built earlier.
last_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)

In [5]:
# Smoke test: Malay -> Indonesian on a code-mixed sentence.
# Generation is capped at max_length = 100 and special tokens are kept in the printed text.
prompt = 'terjemah Melayu ke Indonesia: Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Hi orang! Saya melihat kemarin & hari ini banyak yang mendapatkan cookies ini. Jadi hari ini saya ingin membagikan beberapa post mortem dari batch pertama kami:</s>


In [6]:
# Smoke test: Malay -> Banjar prompt.
# Generation is capped at max_length = 100 and special tokens are kept in the printed text.
prompt = 'terjemah Melayu ke Banjar: saya tak suka ikan keli dan ayam goreng'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Aku ora seneng iwak keli lan ayam goreng</s>


In [7]:
# Smoke test: Malay -> Javanese prompt.
# Generation is capped at max_length = 100 and special tokens are kept in the printed text.
prompt = 'terjemah Melayu ke Jawa: saya tak suka ikan keli dan ayam goreng'
input_ids = tokenizer.encode(prompt, return_tensors = 'pt')
outputs = model.generate(input_ids, max_length = 100)
decoded = tokenizer.decode(outputs[0])
print(decoded)

<pad> Aku ora seneng iwak keli lan pitik goreng</s>


In [8]:
# Upload the model weights to the Hub repository below, under the 'mesolitica' organization.
model_hub_repo = 'finetune-translation-austronesian-t5-tiny-standard-bahasa-cased'
model.push_to_hub(model_hub_repo, organization='mesolitica')

Cloning https://huggingface.co/mesolitica/finetune-translation-austronesian-t5-tiny-standard-bahasa-cased into local empty directory.


Upload file pytorch_model.bin:   0%|          | 4.00k/133M [00:00<?, ?B/s]

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-translation-austronesian-t5-tiny-standard-bahasa-cased
   2c440d1..dbe3c37  main -> main



'https://huggingface.co/mesolitica/finetune-translation-austronesian-t5-tiny-standard-bahasa-cased/commit/dbe3c37725d322e590fa0777b14640b9fe71bcc3'

In [9]:
# Upload the tokenizer files to the Hub repository below, under the 'mesolitica' organization.
tokenizer_hub_repo = 'finetune-translation-austronesian-t5-tiny-standard-bahasa-cased'
tokenizer.push_to_hub(tokenizer_hub_repo, organization='mesolitica')

Upload file spiece.model:   1%|          | 4.00k/784k [00:00<?, ?B/s]

remote: Scanning LFS files for validity...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-translation-austronesian-t5-tiny-standard-bahasa-cased
   dbe3c37..274db26  main -> main



'https://huggingface.co/mesolitica/finetune-translation-austronesian-t5-tiny-standard-bahasa-cased/commit/274db2662ec3606b8537af2c30aae1e2ac7700a6'

In [10]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Metric objects reused by every scoring cell in this notebook.
# CHRF is built with word_order = 2 (shown as chrF2++ in the recorded outputs).
bleu, chrf = BLEU(), CHRF(word_order = 2)

In [11]:
from unidecode import unidecode

# Read the three *.dev files, one sentence per line, into parallel lists.
# - The encoding is pinned so decoding no longer depends on the machine's locale
#   (assumes the files are UTF-8 — TODO confirm).
# - Lines are taken by iterating the file, so the final sentence is kept even when
#   a file has no trailing newline; read().split('\n')[:-1] would silently drop it.
with open('ind_Latn.dev', encoding = 'utf-8') as fopen:
    ind = [line.rstrip('\n') for line in fopen]

with open('jav_Latn.dev', encoding = 'utf-8') as fopen:
    jav = [line.rstrip('\n') for line in fopen]

with open('zsm_Latn.dev', encoding = 'utf-8') as fopen:
    ms = [line.rstrip('\n') for line in fopen]

In [13]:
# Pass every sentence of the three corpora through unidecode and rebind each
# list name to its converted version.
ind, jav, ms = (
    [unidecode(sentence) for sentence in corpus]
    for corpus in (ind, jav, ms)
)

In [12]:
# Move the model to the GPU. The return value is bound to `_`, presumably so the
# cell does not echo the model repr as its output.
_ = model.cuda()

In [14]:
from tqdm import tqdm

# Number of sentences sent to model.generate per step; also read by the later inference cells.
batch_size = 1

# Indonesian -> Malay: translate every sentence in `ind`, collecting decoded text in `results`.
results = []
for start in tqdm(range(0, len(ind), batch_size)):
    chunk = ind[start:start + batch_size]
    # One {'input_ids': 1-D tensor} entry per sentence, as handed to tokenizer.pad.
    features = [
        {'input_ids': tokenizer.encode(f'terjemah Indonesia ke Melayu: {text}', return_tensors = 'pt')[0]}
        for text in chunk
    ]
    padded = tokenizer.pad(features, padding = 'longest')
    # Move every tensor of the padded batch onto the GPU.
    for key in padded:
        padded[key] = padded[key].cuda()
    outputs = model.generate(**padded, max_length = 512)
    results.extend(tokenizer.decode(sequence, skip_special_tokens=True) for sequence in outputs)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 997/997 [01:27<00:00, 11.44it/s]


In [15]:
# Keep only non-empty hypotheses, each paired with the Malay reference at the same index;
# empty outputs are dropped together with their references before scoring.
kept_pairs = [(hypothesis, ms[index]) for index, hypothesis in enumerate(results) if len(hypothesis)]
filtered_left = [pair[0] for pair in kept_pairs]
filtered_right = [pair[1] for pair in kept_pairs]

# The references are wrapped in an outer list (a single reference set).
refs, sys = [filtered_right], filtered_left
r = bleu.corpus_score(sys, refs)
r.__dict__, chrf.corpus_score(sys, refs)

({'name': 'BLEU',
  'score': 30.277470707798773,
  '_mean': -1.0,
  '_ci': -1.0,
  '_verbose': '64.2/38.0/24.1/15.6 (BP = 0.978 ratio = 0.978 hyp_len = 21542 ref_len = 22027)',
  'bp': 0.9777373939096933,
  'counts': [13823, 7816, 4717, 2896],
  'totals': [21542, 20545, 19548, 18551],
  'sys_len': 21542,
  'ref_len': 22027,
  'precisions': [64.16767245381116,
   38.043319542467756,
   24.13034581542869,
   15.611018273947495],
  'prec_str': '64.2/38.0/24.1/15.6',
  'ratio': 0.9779815680755437},
 chrF2++ = 57.38)

In [16]:
# Malay -> Indonesian: translate every sentence in `ms`, collecting decoded text in `results`.
results = []
for start in tqdm(range(0, len(ms), batch_size)):
    chunk = ms[start:start + batch_size]
    # One {'input_ids': 1-D tensor} entry per sentence, as handed to tokenizer.pad.
    features = [
        {'input_ids': tokenizer.encode(f'terjemah Melayu ke Indonesia: {text}', return_tensors = 'pt')[0]}
        for text in chunk
    ]
    padded = tokenizer.pad(features, padding = 'longest')
    # Move every tensor of the padded batch onto the GPU.
    for key in padded:
        padded[key] = padded[key].cuda()
    outputs = model.generate(**padded, max_length = 512)
    results.extend(tokenizer.decode(sequence, skip_special_tokens=True) for sequence in outputs)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 997/997 [01:33<00:00, 10.64it/s]


In [17]:
# Keep only non-empty hypotheses, each paired with the Indonesian reference at the same index;
# empty outputs are dropped together with their references before scoring.
kept_pairs = [(hypothesis, ind[index]) for index, hypothesis in enumerate(results) if len(hypothesis)]
filtered_left = [pair[0] for pair in kept_pairs]
filtered_right = [pair[1] for pair in kept_pairs]

# The references are wrapped in an outer list (a single reference set).
refs, sys = [filtered_right], filtered_left
r = bleu.corpus_score(sys, refs)
r.__dict__, chrf.corpus_score(sys, refs)

({'name': 'BLEU',
  'score': 33.88207737320432,
  '_mean': -1.0,
  '_ci': -1.0,
  '_verbose': '67.7/42.2/28.0/18.9 (BP = 0.966 ratio = 0.966 hyp_len = 21116 ref_len = 21856)',
  'bp': 0.9655624323170151,
  'counts': [14293, 8499, 5358, 3430],
  'totals': [21116, 20119, 19122, 18125],
  'sys_len': 21116,
  'ref_len': 21856,
  'precisions': [67.68800909263118,
   42.243650280829065,
   28.020081581424538,
   18.924137931034483],
  'prec_str': '67.7/42.2/28.0/18.9',
  'ratio': 0.9661420204978038},
 chrF2++ = 59.46)

In [18]:
# Malay -> Javanese: translate every sentence in `ms`, collecting decoded text in `results`.
results = []
for start in tqdm(range(0, len(ms), batch_size)):
    chunk = ms[start:start + batch_size]
    # One {'input_ids': 1-D tensor} entry per sentence, as handed to tokenizer.pad.
    features = [
        {'input_ids': tokenizer.encode(f'terjemah Melayu ke Jawa: {text}', return_tensors = 'pt')[0]}
        for text in chunk
    ]
    padded = tokenizer.pad(features, padding = 'longest')
    # Move every tensor of the padded batch onto the GPU.
    for key in padded:
        padded[key] = padded[key].cuda()
    outputs = model.generate(**padded, max_length = 512)
    results.extend(tokenizer.decode(sequence, skip_special_tokens=True) for sequence in outputs)

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 997/997 [02:31<00:00,  6.57it/s]


In [19]:
# Keep only non-empty hypotheses, each paired with the Javanese reference at the same index;
# empty outputs are dropped together with their references before scoring.
kept_pairs = [(hypothesis, jav[index]) for index, hypothesis in enumerate(results) if len(hypothesis)]
filtered_left = [pair[0] for pair in kept_pairs]
filtered_right = [pair[1] for pair in kept_pairs]

# The references are wrapped in an outer list (a single reference set).
refs, sys = [filtered_right], filtered_left
r = bleu.corpus_score(sys, refs)
r.__dict__, chrf.corpus_score(sys, refs)

({'name': 'BLEU',
  'score': 23.79649854962551,
  '_mean': -1.0,
  '_ci': -1.0,
  '_verbose': '59.2/31.6/18.2/10.8 (BP = 0.966 ratio = 0.967 hyp_len = 20886 ref_len = 21609)',
  'bp': 0.9659758070793919,
  'counts': [12374, 6294, 3432, 1935],
  'totals': [20886, 19889, 18892, 17895],
  'sys_len': 20886,
  'ref_len': 21609,
  'precisions': [59.24542755913052,
   31.645633264618635,
   18.16641964852848,
   10.813076278290024],
  'prec_str': '59.2/31.6/18.2/10.8',
  'ratio': 0.9665417187283076},
 chrF2++ = 51.21)

In [20]:
# Javanese -> Malay: translate every sentence in `jav`, collecting decoded text in `results`.
results = []
# The range is driven by len(jav), the list actually sliced below. It previously used
# len(ms), which only covers the right indices while both lists have the same length.
for i in tqdm(range(0, len(jav), batch_size)):
    # One {'input_ids': 1-D tensor} entry per sentence, as handed to tokenizer.pad.
    input_ids = [{'input_ids': tokenizer.encode(f'terjemah Jawa ke Melayu: {s}', return_tensors = 'pt')[0]} for s in jav[i:i + batch_size]]
    padded = tokenizer.pad(input_ids, padding = 'longest')
    # Move every tensor of the padded batch onto the GPU.
    for k in padded:
        padded[k] = padded[k].cuda()
    outputs = model.generate(**padded, max_length = 512)
    for o in outputs:
        results.append(tokenizer.decode(o, skip_special_tokens=True))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 997/997 [01:31<00:00, 10.94it/s]


In [21]:
# Keep only non-empty hypotheses, each paired with the Malay reference at the same index;
# empty outputs are dropped together with their references before scoring.
kept_pairs = [(hypothesis, ms[index]) for index, hypothesis in enumerate(results) if len(hypothesis)]
filtered_left = [pair[0] for pair in kept_pairs]
filtered_right = [pair[1] for pair in kept_pairs]

# The references are wrapped in an outer list (a single reference set).
refs, sys = [filtered_right], filtered_left
r = bleu.corpus_score(sys, refs)
r.__dict__, chrf.corpus_score(sys, refs)

({'name': 'BLEU',
  'score': 23.797627841793492,
  '_mean': -1.0,
  '_ci': -1.0,
  '_verbose': '58.2/31.1/18.1/10.8 (BP = 0.977 ratio = 0.977 hyp_len = 21521 ref_len = 22027)',
  'bp': 0.9767623329685766,
  'counts': [12516, 6383, 3528, 1998],
  'totals': [21521, 20524, 19527, 18530],
  'sys_len': 21521,
  'ref_len': 22027,
  'precisions': [58.15714883137401,
   31.1001754044046,
   18.067291442617915,
   10.782514840798704],
  'prec_str': '58.2/31.1/18.1/10.8',
  'ratio': 0.9770281926726291},
 chrF2++ = 50.65)