In [1]:
import os

# Expose no CUDA devices to this process, so CUDA-aware libraries imported
# afterwards fall back to CPU. This has to run before any of them initialise.
os.environ.update(CUDA_VISIBLE_DEVICES='')

In [2]:
from glob import glob


def checkpoint_step(path):
    """
    Return the training step encoded in a `.../checkpoint-<step>` path.

    Parameters
    ----------
    path : str
        A checkpoint directory path such as `run/checkpoint-380000`.

    Returns
    -------
    int
        The integer after the last `-`, or -1 when that suffix is not a plain
        decimal number (so such paths sort ahead of numbered checkpoints
        instead of raising).
    """
    suffix = path.rstrip('/').rsplit('-', 1)[-1]
    return int(suffix) if suffix.isdecimal() else -1


# Order by numeric step, not by string: a plain `sorted()` compares character
# by character and would place 'checkpoint-1000000' before 'checkpoint-380000',
# so `checkpoints[-1]` would no longer be the most recent checkpoint.
# The path itself is the tie-breaker to keep the order deterministic.
checkpoints = sorted(
    glob('finetune-t5-small-standard-bahasa-cased/checkpoint-*'),
    key=lambda path: (checkpoint_step(path), path),
)
checkpoints

['finetune-t5-small-standard-bahasa-cased/checkpoint-380000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-390000',
 'finetune-t5-small-standard-bahasa-cased/checkpoint-400000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# Name handed to `from_pretrained` to locate the tokenizer files.
tokenizer_name = 'mesolitica/t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(tokenizer_name)

In [4]:
# Load the model weights from the last entry of `checkpoints`.
latest_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)

In [5]:
# Two lowercase, unpunctuated sample inputs: one short, one long.
string1 = 'jom makan di us makanan di sana sedap'
string2 = (
    'kuala lumpur menteri di jabatan perdana menteri datuk seri dr mujahid '
    'yusof rawa hari ini mengakhiri lawatan kerja lapan hari ke jordan turki '
    'dan bosnia herzegovina lawatan yang bertujuan mengeratkan lagi hubungan '
    'dua hala dengan ketiga tiga negara berkenaan'
)

In [6]:
# Batch of sample sentences fed to the model in the next cell.
strings = [string1, string2]

In [7]:
# Prepend the task prefix to every sentence, tokenize each one on its own,
# then pad the batch to its longest member before generating.
prompts = [f'kes benar: {s}' for s in strings]
input_ids = [
    {'input_ids': tokenizer.encode(prompt, return_tensors='pt')[0]}
    for prompt in prompts
]
padded = tokenizer.pad(input_ids, padding='longest')
outputs = model.generate(**padded, max_length=256)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['Jom makan di us, makanan di sana sedap.',
 'KUALA LUMPUR: Menteri di Jabatan Perdana Menteri, Datuk Seri Dr Mujahid Yusof Rawa hari ini mengakhiri lawatan kerja lapan hari ke Jordan, Turki dan Bosnia Herzegovina, lawatan yang bertujuan mengeratkan lagi hubungan dua hala dengan ketiga-tiga negara berkenaan.']

In [8]:
# Upload the model weights; the call's return value (the commit URL shown
# below) is left as the cell output.
model.push_to_hub(
    'finetune-true-case-t5-small-standard-bahasa-cased',
    organization='mesolitica',
)

Cloning https://huggingface.co/mesolitica/finetune-true-case-t5-small-standard-bahasa-cased into local empty directory.


Upload file pytorch_model.bin:   0%|          | 4.00k/231M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-true-case-t5-small-standard-bahasa-cased
   69be467..5b8e544  main -> main



'https://huggingface.co/mesolitica/finetune-true-case-t5-small-standard-bahasa-cased/commit/5b8e54486f7c3a3e1bc3e77c7d631c9b893b1056'

In [9]:
# Upload the tokenizer files to the same repository name and organization
# used for the model weights.
tokenizer.push_to_hub(
    'finetune-true-case-t5-small-standard-bahasa-cased',
    organization='mesolitica',
)

Upload file spiece.model:   1%|          | 4.00k/784k [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-true-case-t5-small-standard-bahasa-cased
   5b8e544..8ac1576  main -> main



'https://huggingface.co/mesolitica/finetune-true-case-t5-small-standard-bahasa-cased/commit/8ac15768f5c5ca5db63306034b5e77ea8a6c0b64'

In [11]:
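# One-time download of the evaluation set `test-set-true-case.json`
# (uncomment to fetch it into the working directory):
# !wget https://f000.backblazeb2.com/file/malay-dataset/true-case/test-set-true-case.json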

In [12]:
import json

# Read the evaluation set. The encoding is given explicitly because `open()`
# otherwise falls back to the platform locale, which is not UTF-8 everywhere
# and would make decoding of non-ASCII text machine-dependent.
with open('test-set-true-case.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

In [13]:
def calculate_cer(actual, hyp):
    """
    Calculate the character error rate (CER) using `python-Levenshtein`.

    Spaces are removed from both strings, then the edit distance between them
    is divided by the number of remaining reference characters.

    Parameters
    ----------
    actual : str
        Reference text.
    hyp : str
        Hypothesis (predicted) text.

    Returns
    -------
    float
        Edit distance divided by the reference length. When the reference has
        no non-space characters the ratio is undefined; 0.0 is returned if the
        hypothesis is empty as well, otherwise 1.0.
    """
    import Levenshtein as Lev

    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')
    if not actual:
        # Avoid ZeroDivisionError on an empty reference, which would abort a
        # long evaluation loop part-way through.
        return 0.0 if not hyp else 1.0
    return Lev.distance(actual, hyp) / len(actual)

def calculate_wer(actual, hyp):
    """
    Calculate the word error rate (WER) using `python-Levenshtein`.

    Both strings are split on whitespace. Every distinct word is mapped to a
    single unique character so that the character-level edit distance of the
    encoded strings equals the word-level edit distance.

    Parameters
    ----------
    actual : str
        Reference text.
    hyp : str
        Hypothesis (predicted) text.

    Returns
    -------
    float
        Word-level edit distance divided by the number of reference words.
        When the reference contains no words the ratio is undefined; 0.0 is
        returned if the hypothesis has no words either, otherwise 1.0.
    """
    import Levenshtein as Lev

    ref_words = actual.split()
    hyp_words = hyp.split()
    if not ref_words:
        # Avoid ZeroDivisionError on an empty reference, which would abort a
        # long evaluation loop part-way through.
        return 0.0 if not hyp_words else 1.0

    # One code point per distinct word; which word gets which code point is
    # irrelevant, only distinctness matters for the distance.
    vocabulary = set(ref_words + hyp_words)
    word2char = {word: chr(index) for index, word in enumerate(vocabulary)}

    encoded_ref = ''.join(word2char[w] for w in ref_words)
    encoded_hyp = ''.join(word2char[w] for w in hyp_words)

    return Lev.distance(encoded_ref, encoded_hyp) / len(ref_words)

In [14]:
from tqdm import tqdm

# Per-sentence scores. Each sample is indexed as (input text, reference text).
wer, cer = [], []
for sample in tqdm(data):
    prompt = f'kes benar: {sample[0]}'
    input_ids = [{'input_ids': tokenizer.encode(prompt, return_tensors='pt')[0]}]
    padded = tokenizer.pad(input_ids, padding='longest')
    outputs = model.generate(**padded, max_length=256)
    decoded = tokenizer.batch_decode(outputs, skip_special_tokens=True)
    predicted = decoded[0]
    actual = sample[1]
    wer.append(calculate_wer(actual, predicted))
    cer.append(calculate_cer(actual, predicted))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 32404/32404 [1:56:54<00:00,  4.62it/s]


In [15]:
import numpy as np

# Unweighted mean of the per-sentence rates, shown as (WER, CER).
mean_wer = np.mean(wer)
mean_cer = np.mean(cer)
mean_wer, mean_cer

(0.08110462547137363, 0.01638382307614032)