In [1]:
import os

# Expose no CUDA devices to this process: an empty CUDA_VISIBLE_DEVICES is set
# before any deep-learning library is imported further down the notebook.
os.environ.update({'CUDA_VISIBLE_DEVICES': ''})

In [2]:
from glob import glob

def checkpoint_step(path):
    """Return the training step encoded at the end of a checkpoint path.

    Parameters
    ----------
    path : str
        A path such as ``'run-dir/checkpoint-1780000'``.

    Returns
    -------
    int
        The integer after the final ``-``, or ``-1`` when that suffix is not
        purely numeric (so unexpected entries sort first and are never picked
        by ``checkpoints[-1]``).
    """
    suffix = path.rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdecimal() else -1


# Sort by numeric step, not by string: a plain lexicographic sort places
# 'checkpoint-999' after 'checkpoint-1000', which would make `checkpoints[-1]`
# point at an older checkpoint. The path is a tie-breaker for determinism.
checkpoints = sorted(
    glob('finetune-t5-tiny-standard-bahasa-cased/checkpoint-*'),
    key=lambda path: (checkpoint_step(path), path),
)
checkpoints

['finetune-t5-tiny-standard-bahasa-cased/checkpoint-1760000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-1770000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-1780000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The tokenizer is fetched from a hub repo by name rather than from the local
# checkpoint directory.
# NOTE(review): this repo is the "small" variant while the checkpoint directory
# is named "tiny" — presumably both share one vocabulary; confirm.
tokenizer_repo = 'mesolitica/t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(tokenizer_repo)

In [4]:
# Load the weights from the last entry of the sorted `checkpoints` list.
last_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(last_checkpoint)

In [5]:
# Example sentence used for the paraphrase smoke test in the next cells.
# Adjacent literals are concatenated by Python, so the value is one string.
s = (
    "Kabinet bersetuju mewujudkan satu jawatan kuasa dalaman untuk menyiasat "
    "isu 'perbalahan' antara Jabatan Perkhidmatan Awam (JPA) dan "
    "Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM)"
)

In [6]:
from transformers import set_seed

# Sampling is stochastic; seed the RNGs so the paraphrases shown below can be
# reproduced on a fresh Restart & Run All.
set_seed(42)

# The task prefix 'parafrasa: ' is prepended to the sentence before encoding.
input_ids = tokenizer.encode(f'parafrasa: {s}', return_tensors = 'pt')

# Top-k / nucleus sampling, returning five candidates for the one input.
# `early_stopping` is intentionally not passed: it is a beam-search stopping
# option and has no effect when sampling with a single beam.
outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=256,
    top_k=50,
    top_p=0.95,
    num_return_sequences=5,
)

In [7]:
# Turn every generated id sequence back into text, dropping special tokens;
# the resulting list is the cell's displayed value.
[tokenizer.decode(sample, skip_special_tokens = True) for sample in outputs]

["Kabinet bersetuju mewujudkan satu jawatan kuasa dalaman untuk menyiasat isu 'perbalahan antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).",
 'Kabinet bersetuju mewujudkan satu jawatan kuasa dalaman untuk menyiasat isu "perbalahan" antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).',
 "Kabinet bersetuju mewujudkan satu jawatan kuasa dalaman untuk menyiasat isu 'perbalahan' antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).",
 'Kabinet bersetuju mewujudkan satu jawatan kuasa dalaman untuk menyiasat isu-isu di kalangan Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).',
 'Kabinet bersetuju mewujudkan satu jawatan kuasa dalaman untuk menyiasat isu "perbalahan" antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).']

In [8]:
# Publish the loaded model to the Hugging Face hub repo
# 'finetune-paraphrase-t5-tiny-standard-bahasa-cased' under the 'mesolitica'
# organization. This is a network side effect and needs hub credentials.
# NOTE(review): newer transformers releases reportedly deprecate the
# `organization` argument in favour of a full 'org/name' repo id — confirm
# against the installed version before changing the call.
model.push_to_hub('finetune-paraphrase-t5-tiny-standard-bahasa-cased', organization='mesolitica')

Cloning https://huggingface.co/mesolitica/finetune-paraphrase-t5-tiny-standard-bahasa-cased into local empty directory.


Download file pytorch_model.bin:   0%|          | 8.00k/133M [00:00<?, ?B/s]

Download file spiece.model:   2%|1         | 15.4k/784k [00:00<?, ?B/s]

Clean file spiece.model:   0%|          | 1.00k/784k [00:00<?, ?B/s]

Clean file pytorch_model.bin:   0%|          | 1.00k/133M [00:00<?, ?B/s]

Upload file pytorch_model.bin:   0%|          | 4.00k/133M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-paraphrase-t5-tiny-standard-bahasa-cased
   9f0f5a5..b447613  main -> main



'https://huggingface.co/mesolitica/finetune-paraphrase-t5-tiny-standard-bahasa-cased/commit/b4476131a296bc1f24e2ae26e9a577406f9ad10c'

In [9]:
# Publish the tokenizer to the same repo name and organization as the model
# push, so both can be loaded from one hub id. Network side effect.
# NOTE(review): same `organization` caveat as the model push — confirm against
# the installed transformers version.
tokenizer.push_to_hub('finetune-paraphrase-t5-tiny-standard-bahasa-cased', organization='mesolitica')

In [10]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Corpus-level scorers: BLEU with default settings and a CHRF scorer configured
# with word_order = 2. Only `bleu` is used in the cells visible below.
bleu, chrf = BLEU(), CHRF(word_order = 2)

In [11]:
import json

# 'shuffled-test.json' is read as JSON Lines: one JSON document per line.
# - encoding is pinned to UTF-8 so non-ASCII text decodes the same on every
#   platform instead of depending on the locale default;
# - blank lines (e.g. a trailing newline at end of file) are skipped instead of
#   crashing json.loads.
with open('shuffled-test.json', encoding='utf-8') as fopen:
    test = [json.loads(l) for l in fopen if l.strip()]

In [14]:
# Sanity check: number of records loaded from 'shuffled-test.json'.
len(test)

5544

In [15]:
# Peek at the first record's input text; records nest their fields under
# 'translation', with 'src' being the sentence fed to the model.
test[0]['translation']['src']

'Antara aplikasi bagi sistem atom sejuk yang terperangkap sedemikian ialah simulasi kuantum sistem jirim berkondensasi Banyak Parti dalam potensi berkala.'

In [16]:
from tqdm import tqdm

batch_size = 1

# Run the model over the whole test set, `batch_size` records at a time, and
# collect one decoded string per record in `results` (same order as `test`).
results = []
for start in tqdm(range(0, len(test), batch_size)):
    batch = test[start:start + batch_size]
    # Same task prefix as in the single-sentence demo above.
    prompts = [f"parafrasa: {row['translation']['src']}" for row in batch]
    input_ids = [
        {'input_ids': tokenizer.encode(prompt, return_tensors = 'pt')[0]}
        for prompt in prompts
    ]
    # Pad the batch to its longest member before generation.
    padded = tokenizer.pad(input_ids, padding = 'longest')
    outputs = model.generate(**padded, max_length = 1000)
    results.extend(
        tokenizer.decode(o, skip_special_tokens=True) for o in outputs
    )

100%|████████████████████████████████████████████████████████████████████████████████████████████| 5544/5544 [10:43<00:00,  8.62it/s]


In [17]:
# Keep only the records whose prediction is non-empty. `filtered_left` holds the
# predictions and `filtered_right` the matching 'tgt' references, index-aligned.
# Records with an empty prediction are dropped from both lists, so they do not
# take part in the score computed afterwards.
filtered_left = [hyp for hyp in results if len(hyp)]
filtered_right = [
    test[idx]['translation']['tgt']
    for idx, hyp in enumerate(results)
    if len(hyp)
]

In [18]:
# One reference stream (a list containing the single list of references) and
# the system outputs. NOTE: `sys` here is a plain list and would shadow the
# standard-library module of the same name if that were imported earlier.
sys, refs = filtered_left, [filtered_right]
r = bleu.corpus_score(sys, refs)
# Show every attribute of the score object as a dict.
vars(r)

{'name': 'BLEU',
 'score': 36.92696648298233,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '62.5/42.3/33.0/26.9 (BP = 0.943 ratio = 0.945 hyp_len = 95496 ref_len = 101064)',
 'bp': 0.9433611337299734,
 'counts': [59650, 38055, 27875, 21217],
 'totals': [95496, 89952, 84408, 78864],
 'sys_len': 95496,
 'ref_len': 101064,
 'precisions': [62.46334925023038,
  42.30589647812167,
  33.02412093640413,
  26.90327652667884],
 'prec_str': '62.5/42.3/33.0/26.9',
 'ratio': 0.944906198052719}