In [1]:
import os

# Expose no CUDA devices to this process: an empty CUDA_VISIBLE_DEVICES must be
# set before any CUDA-backed library initialises for it to take effect.
os.environ.update(CUDA_VISIBLE_DEVICES='')

In [10]:
from glob import glob

def checkpoint_step(path):
    """Return the training step encoded at the end of a checkpoint path.

    Args:
        path: A path ending in ``checkpoint-<digits>``.

    Returns:
        The step as an ``int``, or ``-1`` when the path has no numeric
        ``checkpoint-<digits>`` suffix (such entries therefore sort first).
    """
    import re  # local import keeps this cell runnable on its own

    match = re.search(r'checkpoint-(\d+)$', path)
    return int(match.group(1)) if match else -1


# Order by numeric step, not by string: a plain sorted() is lexicographic and
# would place 'checkpoint-90000' after 'checkpoint-310000', silently changing
# what an index such as checkpoints[1] refers to. The path is the tie-breaker
# so the order is deterministic.
checkpoints = sorted(
    glob('finetune-t5-small-standard-bahasa-cased/checkpoint-*'),
    key=lambda path: (checkpoint_step(path), path),
)
checkpoints

['finetune-t5-small-standard-bahasa-cased/checkpoint-310000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-320000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-330000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-340000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-350000']

In [None]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Hub id of the tokenizer; the fine-tuned checkpoints are decoded with it below.
TOKENIZER_NAME = 'mesolitica/t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(TOKENIZER_NAME)

In [16]:
# Pick the second entry of the checkpoint list (checkpoint-320000 in the
# listing displayed earlier in this notebook) and load its weights.
selected_checkpoint = checkpoints[1]
model = T5ForConditionalGeneration.from_pretrained(selected_checkpoint)

In [4]:
# Example sentence used for the sampling demo; adjacent literals are joined by
# the parser, so the value is a single unbroken string.
s = (
    "Kabinet bersetuju mewujudkan satu jawatan kuasa dalaman "
    "untuk menyiasat isu 'perbalahan' antara "
    "Jabatan Perkhidmatan Awam (JPA) dan "
    "Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM)"
)

In [17]:
# Sampling draws from the global RNGs, so fix the seed to make the paraphrases
# shown below reproducible after a kernel restart.
from transformers import set_seed

set_seed(42)

# 'parafrasa: ' is the task prefix put in front of the sentence to paraphrase.
input_ids = tokenizer.encode(f'parafrasa: {s}', return_tensors='pt')

# Top-k / nucleus sampling, five candidates for the single input sentence.
# `early_stopping` was removed: it only controls beam-search termination and has
# no effect when sampling with a single beam.
outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=256,
    top_k=50,
    top_p=0.95,
    num_return_sequences=5,
)

In [18]:
# Turn each sampled id sequence back into text, dropping special tokens.
[tokenizer.decode(sequence, skip_special_tokens=True) for sequence in outputs]

['Kabinet bersetuju untuk mewujudkan satu jawatan kuasa dalaman untuk menyiasat isu "balahan" antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).',
 'Kabinet bersetuju untuk mewujudkan satu jawatan kuasa dalaman untuk menyiasat isu perpecahan antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).',
 "Kabinet bersetuju mewujudkan satu jawatan kuasa dalaman untuk menyiasat isu 'kebersihan' antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).",
 'Kabinet bersetuju mewujudkan satu jawatan kuasa dalaman untuk menyiasat isu "bodoh" antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).',
 'Kabinet bersetuju menubuhkan satu jawatan kuasa dalaman yang bertindak sebagai pentadbir untuk menyiasat isu yang merosakkan antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Seme

In [8]:
# Upload the in-memory `model` to the Hub repository
# 'finetune-paraphrase-t5-small-standard-bahasa-cased' under the 'mesolitica'
# organization; the cell displays the resulting commit URL.
# NOTE(review): this cell's execution count (8) is lower than that of the cell
# loading `model` from checkpoints[1] (16), so the notebook does not show which
# checkpoint was actually uploaded — confirm by re-running top to bottom.
model.push_to_hub('finetune-paraphrase-t5-small-standard-bahasa-cased', organization='mesolitica')

Cloning https://huggingface.co/mesolitica/finetune-paraphrase-t5-small-standard-bahasa-cased into local empty directory.


Download file pytorch_model.bin:   0%|          | 3.19k/231M [00:00<?, ?B/s]

Download file spiece.model:   0%|          | 1.44k/784k [00:00<?, ?B/s]

Clean file spiece.model:   0%|          | 1.00k/784k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/231M [00:00<?, ?B/s]

Upload file pytorch_model.bin:   0%|          | 4.00k/231M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-paraphrase-t5-small-standard-bahasa-cased
   41e211e..26b7166  main -> main



'https://huggingface.co/mesolitica/finetune-paraphrase-t5-small-standard-bahasa-cased/commit/26b7166fb743b27452e34e79d0e319b50f84167e'

In [9]:
# Publish the tokenizer under the same repository name and organization as the
# model upload, so both can be loaded from one Hub repo.
# NOTE(review): the cell that creates `tokenizer` shows no execution count in
# this notebook — confirm it was run before this upload.
tokenizer.push_to_hub('finetune-paraphrase-t5-small-standard-bahasa-cased', organization='mesolitica')

In [19]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Corpus-level metric objects: BLEU with library defaults, and chrF that also
# counts word n-grams up to order 2.
# NOTE(review): `chrf` is not used in the cells visible here — presumably a
# later cell scores with it; verify.
bleu, chrf = BLEU(), CHRF(word_order=2)

In [22]:
import json

# Read the evaluation set: one JSON object per line (JSON Lines).
# - encoding is pinned to UTF-8 so decoding does not depend on the platform's
#   locale default;
# - blank or whitespace-only lines (e.g. a trailing newline at end of file) are
#   skipped instead of crashing json.loads.
with open('shuffled-test.json', encoding='utf-8') as fopen:
    test = [json.loads(line) for line in fopen if line.strip()]

In [24]:
# Number of evaluation records read from 'shuffled-test.json'.
len(test)

5544

In [25]:
# Show the source sentence of the first record; each record keeps its text
# under the nested 'translation' -> 'src' keys.
test[0]['translation']['src']

'Antara aplikasi bagi sistem atom sejuk yang terperangkap sedemikian ialah simulasi kuantum sistem jirim berkondensasi Banyak Parti dalam potensi berkala.'

In [27]:
from tqdm import tqdm

# Number of test records sent through the model per generate() call.
batch_size = 1

# One decoded paraphrase per test record, in test-set order.
results = []

for i in tqdm(range(0, len(test), batch_size)):
    # Prefix every source sentence of this slice with the paraphrase task tag.
    prompts = (
        f"parafrasa: {row['translation']['src']}"
        for row in test[i:i + batch_size]
    )
    # encode(..., return_tensors='pt') yields a batch of one; [0] takes that row
    # so tokenizer.pad can collate the slice to its longest sequence.
    input_ids = [
        {'input_ids': tokenizer.encode(prompt, return_tensors = 'pt')[0]}
        for prompt in prompts
    ]
    padded = tokenizer.pad(input_ids, padding = 'longest')
    outputs = model.generate(**padded, max_length = 256)
    results.extend(
        tokenizer.decode(o, skip_special_tokens=True) for o in outputs
    )

100%|████████████████████████████████████████████████████████████████████████████████████████████| 5544/5544 [17:23<00:00,  5.31it/s]


In [30]:
# Keep only the positions whose generated text is non-empty, then build the
# aligned hypothesis / reference lists from those positions.
# filtered_left  -> model outputs, filtered_right -> matching 'tgt' references.
kept_positions = [position for position, text in enumerate(results) if len(text)]
filtered_left = [results[position] for position in kept_positions]
filtered_right = [test[position]['translation']['tgt'] for position in kept_positions]

In [31]:
# corpus_score takes the hypotheses plus a list of reference streams; there is a
# single reference per hypothesis here, hence the one-element outer list.
# NOTE(review): `sys` shares its name with the stdlib module; the name is kept
# because cells outside this view may read `sys` / `refs`.
refs, sys = [filtered_right], filtered_left
r = bleu.corpus_score(sys, refs)
# Display every field of the score object (score, brevity penalty, n-gram
# counts, ...).
vars(r)

{'name': 'BLEU',
 'score': 37.598729045833316,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '62.6/42.5/33.2/27.0 (BP = 0.957 ratio = 0.958 hyp_len = 96781 ref_len = 101064)',
 'bp': 0.9567103919247614,
 'counts': [60539, 38753, 28443, 21680],
 'totals': [96781, 91237, 85693, 80149],
 'sys_len': 96781,
 'ref_len': 101064,
 'precisions': [62.55256713611143,
  42.47509234192268,
  33.19174261608299,
  27.049620082596164],
 'prec_str': '62.6/42.5/33.2/27.0',
 'ratio': 0.9576209134805668}