In [1]:
import os

# An empty CUDA_VISIBLE_DEVICES hides every GPU from CUDA, so everything
# below runs on CPU. Set it before any CUDA-using library is imported.
os.environ.update({'CUDA_VISIBLE_DEVICES': ''})

In [2]:
from glob import glob

def checkpoint_step(path):
    """
    Return the integer training step encoded at the end of a checkpoint path.

    Paths look like ``.../checkpoint-420000``. A path whose suffix after the
    last ``-`` is not a plain number yields -1, so it sorts before every real
    checkpoint instead of raising.
    """
    suffix = path.rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdecimal() else -1


# Sort by numeric step, not by string: a plain lexicographic sort would put
# 'checkpoint-1000000' before 'checkpoint-420000' (and 'checkpoint-90000'
# after it), so `checkpoints[-1]` would no longer be the latest checkpoint.
checkpoints = sorted(
    glob('finetune-t5-tiny-standard-bahasa-cased/checkpoint-*'),
    key=checkpoint_step,
)
checkpoints

['finetune-t5-tiny-standard-bahasa-cased/checkpoint-400000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-410000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-420000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# NOTE(review): the tokenizer comes from the *t5-small* repo while the
# checkpoints are *t5-tiny* -- presumably both sizes share one SentencePiece
# vocabulary; confirm before reusing this with another model family.
TOKENIZER_REPO = 'mesolitica/t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(TOKENIZER_REPO)

In [4]:
# Load the weights from the last path in the sorted `checkpoints` list.
latest_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)

In [5]:
# Two lower-cased, unpunctuated Malay sentences used as a quick smoke test.
string1 = 'jom makan di us makanan di sana sedap'
string2 = (
    'kuala lumpur menteri di jabatan perdana menteri datuk seri dr mujahid '
    'yusof rawa hari ini mengakhiri lawatan kerja lapan hari ke jordan turki '
    'dan bosnia herzegovina lawatan yang bertujuan mengeratkan lagi hubungan '
    'dua hala dengan ketiga tiga negara berkenaan'
)

In [6]:
# Batch of demo inputs fed to the model in the next cell.
strings = [string1, string2]

In [7]:
# Every input is prefixed with 'kes benar: ' (Malay for "true case");
# presumably this is the task prompt the checkpoint was fine-tuned with.
prompts = [f'kes benar: {s}' for s in strings]

# Encode each prompt on its own, then let the tokenizer pad the batch to the
# longest sequence so all prompts go through a single generate() call.
input_ids = [
    {'input_ids': tokenizer.encode(prompt, return_tensors='pt')[0]}
    for prompt in prompts
]
padded = tokenizer.pad(input_ids, padding='longest')
outputs = model.generate(**padded, max_length=256)

# Last expression of the cell: the decoded predictions are displayed.
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Jom makan di US makanan di sana sedap',
 'KUALA LUMPUR: Menteri di Jabatan Perdana Menteri, Datuk Seri Dr Mujahid Yusof Rawa hari ini mengakhiri lawatan kerja lapan hari ke Jordan Turki dan Bosnia Herzegovina, lawatan yang bertujuan mengeratkan lagi hubungan dua hala dengan ketiga-tiga negara berkenaan.']

In [9]:
# Publish the loaded model weights to the Hugging Face Hub repository
# `mesolitica/finetune-true-case-t5-tiny-standard-bahasa-cased`.
# This has an external side effect (a new commit on the Hub) and needs
# write access to the organization.
# NOTE(review): newer `transformers` releases appear to deprecate the
# `organization` argument in favour of a namespaced repo id -- confirm
# against the installed version before re-running.
model.push_to_hub('finetune-true-case-t5-tiny-standard-bahasa-cased', organization='mesolitica')

Cloning https://huggingface.co/mesolitica/finetune-true-case-t5-tiny-standard-bahasa-cased into local empty directory.


Upload file pytorch_model.bin:   0%|          | 32.0k/133M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-true-case-t5-tiny-standard-bahasa-cased
   9327c67..5e9dc53  main -> main



'https://huggingface.co/mesolitica/finetune-true-case-t5-tiny-standard-bahasa-cased/commit/5e9dc538cbdaef51544fa6df82bdfad2dfd45e41'

In [10]:
# Publish the tokenizer files to the Hugging Face Hub repository
# `mesolitica/finetune-true-case-t5-tiny-standard-bahasa-cased`, so the
# repository holds everything needed for inference. Like the model upload,
# this creates a new commit on the Hub.
# NOTE(review): newer `transformers` releases appear to deprecate the
# `organization` argument in favour of a namespaced repo id -- confirm
# against the installed version before re-running.
tokenizer.push_to_hub('finetune-true-case-t5-tiny-standard-bahasa-cased', organization='mesolitica')

Upload file spiece.model:   4%|4         | 32.0k/784k [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-true-case-t5-tiny-standard-bahasa-cased
   5e9dc53..21ae7b7  main -> main



'https://huggingface.co/mesolitica/finetune-true-case-t5-tiny-standard-bahasa-cased/commit/21ae7b789e801f04d7ed4868e9f13f8b42f54897'

In [8]:
import json

In [9]:
# Read the evaluation set. The encoding is pinned to UTF-8 (the JSON
# interchange encoding) so decoding does not depend on the platform's
# locale default.
with open('test-set-true-case.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

In [10]:
# Peek at the first example. Each entry is a pair: index 0 is the model
# input (here Title Cased with punctuation partly stripped), index 1 is the
# correctly cased and punctuated reference text.
data[0]

['Format Terbuka. Format Terbuka Ialah Suatu Format Fail Untuk Tujuan Menyimpan Data Digital Di Mana Format Ini Ditakrifkan Berdasarkan Spesifikasi Yang Diterbitkan Dan Dikendalikan Pertubuhan Piawaian Serta Boleh Digunapakai Khalayak Ramai.',
 'Format terbuka. Format terbuka ialah suatu format fail untuk tujuan menyimpan data digital, di mana format ini ditakrifkan berdasarkan spesifikasi yang diterbitkan dan dikendalikan pertubuhan piawaian, serta boleh digunapakai khalayak ramai.']

In [11]:
def calculate_cer(actual: str, hyp: str) -> float:
    """
    Calculate CER using `python-Levenshtein`.

    Spaces are removed from both strings first, so only non-space characters
    are compared. The result is the character edit distance divided by the
    length of the space-stripped reference.

    Parameters
    ----------
    actual : str
        Reference (ground-truth) text.
    hyp : str
        Hypothesis (predicted) text.

    Returns
    -------
    float
        Character error rate. It can exceed 1.0 when the hypothesis is much
        longer than the reference. When the reference has no non-space
        characters the rate is mathematically undefined; this function then
        returns 0.0 if the hypothesis is also empty and 1.0 otherwise,
        instead of raising ``ZeroDivisionError``.
    """
    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')

    # Guard the division below: an empty reference would divide by zero.
    if not actual:
        return 1.0 if hyp else 0.0

    import Levenshtein as Lev

    return Lev.distance(actual, hyp) / len(actual)

def calculate_wer(actual: str, hyp: str) -> float:
    """
    Calculate WER using `python-Levenshtein`.

    Both strings are split on whitespace. Each distinct word is mapped to one
    unique character, so the character-level edit distance between the two
    encoded strings equals the word-level edit distance.

    Parameters
    ----------
    actual : str
        Reference (ground-truth) text.
    hyp : str
        Hypothesis (predicted) text.

    Returns
    -------
    float
        Word edit distance divided by the number of reference words. It can
        exceed 1.0 when the hypothesis is much longer than the reference.
        When the reference contains no words the rate is mathematically
        undefined; this function then returns 0.0 if the hypothesis is also
        empty and 1.0 otherwise, instead of raising ``ZeroDivisionError``.
    """
    ref_words = actual.split()
    hyp_words = hyp.split()

    # Guard the division below: an empty reference would divide by zero.
    if not ref_words:
        return 1.0 if hyp_words else 0.0

    import Levenshtein as Lev

    # One character per distinct word. Which character a word gets does not
    # matter, only that equal words share one and different words do not.
    vocab = set(ref_words + hyp_words)
    word2char = {word: chr(index) for index, word in enumerate(vocab)}

    encoded_ref = ''.join(word2char[w] for w in ref_words)
    encoded_hyp = ''.join(word2char[w] for w in hyp_words)

    return Lev.distance(encoded_ref, encoded_hyp) / len(ref_words)

In [None]:
from tqdm import tqdm

# Score the model on every test pair, one example per generate() call.
# `wer` and `cer` collect one error rate per example; they are re-created
# here so re-running the cell starts from empty lists.
wer, cer = [], []
for pair in tqdm(data):
    # pair[0] is the input text; it gets the same 'kes benar: ' prompt as
    # the demo cell above.
    prompt = f'kes benar: {pair[0]}'
    encoded = tokenizer.encode(prompt, return_tensors='pt')[0]
    input_ids = [{'input_ids': encoded}]
    padded = tokenizer.pad(input_ids, padding='longest')
    outputs = model.generate(**padded, max_length=256)
    predicted = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    # pair[1] is the reference text the prediction is scored against.
    actual = pair[1]
    wer.append(calculate_wer(actual, predicted))
    cer.append(calculate_cer(actual, predicted))

 88%|██████████████████████████████████████████████████████████████████████████████████▊           | 28535/32404 [1:36:20<03:49, 16.86it/s]

In [14]:
import numpy as np

# Average of the per-example rates (every example weighs the same regardless
# of its length), displayed as (mean WER, mean CER).
tuple(np.mean(scores) for scores in (wer, cer))

(0.09675517381487253, 0.020109968380290597)