In [1]:
import os

# Blank out CUDA_VISIBLE_DEVICES for this process.
# NOTE(review): presumably this hides every GPU so the model code runs on CPU,
# and it has to happen before any CUDA-aware library is imported — confirm.
os.environ.update({'CUDA_VISIBLE_DEVICES': ''})

In [2]:
from glob import glob


def checkpoint_step(path):
    """Return the integer training step encoded at the end of a checkpoint path.

    The step is whatever follows the last '-' in ``path`` (for example
    ``.../checkpoint-360000`` -> ``360000``). Paths whose suffix is not purely
    numeric get ``-1`` so that they sort before every real step instead of
    raising.
    """
    suffix = path.rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdigit() else -1


# Order by numeric step, not by string: a plain sorted() compares the paths
# character by character, which would place 'checkpoint-1000000' before
# 'checkpoint-360000' and make checkpoints[-1] point at an older checkpoint.
checkpoints = sorted(
    glob('finetune-t5-base-standard-bahasa-cased/checkpoint-*'),
    key=checkpoint_step,
)
checkpoints

['finetune-t5-base-standard-bahasa-cased/checkpoint-360000',
 'finetune-t5-base-standard-bahasa-cased/checkpoint-370000',
 'finetune-t5-base-standard-bahasa-cased/checkpoint-380000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Hub repository the tokenizer files are fetched from.
# NOTE(review): this is a `t5-small` repository while the weights are loaded
# from a `t5-base` fine-tuning run — presumably both share one vocabulary; confirm.
tokenizer_repo = 'mesolitica/t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(tokenizer_repo)

In [4]:
# Load the fine-tuned weights from the last entry of `checkpoints`.
latest_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)

In [5]:
# Example input sentence; the generation cell prefixes it with 'parafrasa: '
# and asks the model for several paraphrase candidates.
s = "Kabinet bersetuju mewujudkan satu jawatan kuasa dalaman untuk menyiasat isu 'perbalahan' antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM)"

In [11]:
from transformers import set_seed

# Sampling draws from the RNG; pin the seed so the five candidates shown by
# this cell are the same on every Restart & Run All.
set_seed(42)

# The model is prompted with the task prefix 'parafrasa: ' followed by the sentence.
input_ids = tokenizer.encode(f'parafrasa: {s}', return_tensors = 'pt')

# Top-k / nucleus sampling, five candidates for the single input.
# `early_stopping` is intentionally not passed: it only controls the stopping
# condition of beam-based decoding and does nothing for sampling with one beam.
outputs = model.generate(
    input_ids,
    do_sample=True,
    max_length=256,
    top_k=50,
    top_p=0.95,
    num_return_sequences=5,
)
tokenizer.batch_decode(outputs, skip_special_tokens = True)

["Kabinet bersetuju mewujudkan jawatan kuasa dalaman untuk menyiasat isu 'perbalahan' antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).",
 "Kabinet bersetuju untuk mewujudkan jawatan kuasa dalaman untuk menyiasat isu 'perbalahan' antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).",
 "Kabinet bersetuju untuk mewujudkan satu jawatan kuasa dalaman untuk menyiasat isu 'perbalahan' antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).",
 "Kabinet bersetuju untuk mewujudkan jawatan berkuasa dalaman untuk menyiasat isu 'perbalahan' antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KPISM).",
 "Kabinet bersetuju untuk mewujudkan jawatan dalaman kuasa yang bertujuan untuk menyiasat 'perbalahan' antara Jabatan Perkhidmatan Awam (JPA) dan Kesatuan Perkhidmatan Imigresen Semenanjung Malaysia (KP

In [12]:
# Upload the model weights to the Hub. The organisation is part of the repo id
# ('<org>/<name>'); the separate `organization=` keyword is deprecated in
# transformers in favour of this form.
model.push_to_hub('mesolitica/finetune-paraphrase-t5-base-standard-bahasa-cased')



CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-paraphrase-t5-base-standard-bahasa-cased/commit/67a4f28830f1fe2aaf69097200d9663528479192', commit_message='Upload T5ForConditionalGeneration', commit_description='', oid='67a4f28830f1fe2aaf69097200d9663528479192', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
# Upload the tokenizer files to the same Hub repository as the model. The
# organisation is part of the repo id ('<org>/<name>'); the separate
# `organization=` keyword is deprecated in transformers in favour of this form.
tokenizer.push_to_hub('mesolitica/finetune-paraphrase-t5-base-standard-bahasa-cased')

CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-paraphrase-t5-base-standard-bahasa-cased/commit/83df426b9bfd417d29f39055add9f714bcbe1dbd', commit_message='Upload tokenizer', commit_description='', oid='83df426b9bfd417d29f39055add9f714bcbe1dbd', pr_url=None, pr_revision=None, pr_num=None)

In [14]:
from sacrebleu.metrics import BLEU, CHRF, TER

# Metric objects for corpus-level scoring. `bleu` is used by the scoring cell
# further down; `chrf` and the imported `TER` have no visible use in this part
# of the notebook.
bleu = BLEU()
# NOTE(review): per the sacreBLEU documentation word_order = 2 is the chrF++
# configuration (character n-grams plus word n-grams) — confirm this is intended.
chrf = CHRF(word_order = 2)

In [15]:
import json

# Load the evaluation set: one JSON document per line.
# - encoding is pinned to UTF-8 so decoding does not depend on the platform's
#   default locale encoding;
# - blank lines (e.g. a trailing newline at the end of the file) are skipped
#   instead of crashing json.loads.
with open('shuffled-test.json', encoding='utf-8') as fopen:
    test = [json.loads(line) for line in fopen if line.strip()]

In [16]:
from tqdm import tqdm

batch_size = 1

# Run the model over the whole test set, `batch_size` examples at a time, and
# collect one decoded string per example in `results` (same order as `test`).
results = []
for start in tqdm(range(0, len(test), batch_size)):
    batch = test[start:start + batch_size]

    # Tokenise every source sentence with the task prefix; [0] takes the first
    # row of the tensor returned by encode(..., return_tensors = 'pt').
    input_ids = []
    for example in batch:
        prompt = f"parafrasa: {example['translation']['src']}"
        input_ids.append({'input_ids': tokenizer.encode(prompt, return_tensors = 'pt')[0]})

    # Pad the batch to its longest member, then generate with the default
    # decoding settings apart from the length cap.
    padded = tokenizer.pad(input_ids, padding = 'longest')
    outputs = model.generate(**padded, max_length = 256)
    results.extend(tokenizer.decode(o, skip_special_tokens=True) for o in outputs)

100%|███████████████████████████████████████| 5544/5544 [51:02<00:00,  1.81it/s]


In [17]:
# Pair every non-empty generated string with the reference target that sits at
# the same index in `test`; empty generations are dropped together with their
# reference so both lists stay aligned.
kept_pairs = [
    (hypothesis, test[index]['translation']['tgt'])
    for index, hypothesis in enumerate(results)
    if len(hypothesis)
]
filtered_left = [hypothesis for hypothesis, _ in kept_pairs]
filtered_right = [reference for _, reference in kept_pairs]

In [18]:
# Corpus BLEU of the generated strings against a single list of references.
# NOTE(review): the name `sys` is the same as the standard-library module; it is
# kept because cells outside this view may read `sys` / `refs` — confirm before renaming.
sys = filtered_left
refs = [filtered_right]
r = bleu.corpus_score(sys, refs)
# Show every attribute of the score object (score, brevity penalty, n-gram counts, ...).
vars(r)

{'name': 'BLEU',
 'score': 35.95965899952292,
 '_mean': -1.0,
 '_ci': -1.0,
 '_verbose': '61.7/41.3/32.0/25.8 (BP = 0.944 ratio = 0.946 hyp_len = 95593 ref_len = 101064)',
 'bp': 0.9443747373110852,
 'counts': [59014, 37157, 27016, 20383],
 'totals': [95593, 90049, 84505, 78961],
 'sys_len': 95593,
 'ref_len': 101064,
 'precisions': [61.73464584226878,
  41.263090095392506,
  31.969705934560086,
  25.81400944770203],
 'prec_str': '61.7/41.3/32.0/25.8',
 'ratio': 0.9458659859099184}