In [1]:
import os

# Set CUDA_VISIBLE_DEVICES to an empty string before any model library is imported,
# presumably so that everything in this notebook runs on CPU.
os.environ.update({'CUDA_VISIBLE_DEVICES': ''})

In [2]:
import re
from glob import glob


def checkpoint_step(path):
    """
    Return the training step encoded at the end of a `checkpoint-<step>` path.

    Returns -1 when `path` does not end in `checkpoint-<digits>`, so that such
    entries sort before every numbered checkpoint.
    """
    match = re.search(r'checkpoint-(\d+)$', path)
    return int(match.group(1)) if match else -1


# Sort by the numeric step rather than lexicographically: as plain strings,
# 'checkpoint-990000' sorts after 'checkpoint-1000000', which would make
# `checkpoints[-1]` point at an older checkpoint. The path itself is the
# tie-breaker so the order is deterministic.
checkpoints = sorted(
    glob('finetune-t5-tiny-standard-bahasa-cased/checkpoint-*'),
    key=lambda path: (checkpoint_step(path), path),
)
checkpoints

['finetune-t5-tiny-standard-bahasa-cased/checkpoint-1420000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-1430000',
 'finetune-t5-tiny-standard-bahasa-cased/checkpoint-1440000']

In [3]:
from transformers import T5Tokenizer, T5ForConditionalGeneration

# The tokenizer is fetched from the published "small" repository, while the model
# weights loaded later come from a local "tiny" fine-tuning checkpoint.
# NOTE(review): this assumes both share the same vocabulary - confirm.
tokenizer_repo = 'mesolitica/t5-small-standard-bahasa-cased'
tokenizer = T5Tokenizer.from_pretrained(tokenizer_repo)

Downloading:   0%|          | 0.00/803k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.79k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.86k [00:00<?, ?B/s]

In [4]:
# Load the model weights from the last entry of `checkpoints`.
latest_checkpoint = checkpoints[-1]
model = T5ForConditionalGeneration.from_pretrained(latest_checkpoint)

In [5]:
# Short Malay examples with words run together.
string1 = 'huseinsukamakan ayam,dia sgtrisaukan'
string2 = 'drmahathir sangat menekankan budaya budakzamansekarang'
string3 = 'ceritatunnajibrazak'
string4 = 'TunM sukakan'

# Long news-style paragraph; the literal is wrapped across lines for readability
# only, the adjacent pieces concatenate to the original single string.
string_hard = (
    'IPOH-AhliDewanUndangan Negeri(ADUN) HuluKinta, MuhamadArafat Varisai '
    'Mahamadmenafikanmesejtularmendakwa beliau akan melompatparti menyokong '
    'UMNO membentuk kerajaannegeridiPerak.BeliauyangjugaKetua Penerangan Parti '
    'Keadilan Rakyat(PKR)Perak dalam satumesejringkaskepadaSinar Harian '
    'menjelaskan perkara itutidakbenarsama sekali.'
)

# Informal / social-media style text.
string_socialmedia = 'aqxsukalah apeyg tejadidekat mamattu'

# English and mixed Malay-English examples.
string5 = 'ihate chicken, but ilike fish'
string6 = (
    'Higuys! I noticedsemalam & harini dahramai yangdapat cookiesni kan. '
    'So hariniinak sharesome post mortemof our first batch:'
)

In [6]:
# Demo inputs collected into one batch; the generated outputs come back in this order.
strings = [
    string1, string2, string3, string4,
    string_hard, string_socialmedia, string5, string6,
]

In [7]:
# Prefix every example with the `segmentasi: ` task prompt.
prompts = [f'segmentasi: {s}' for s in strings]

# Encode each prompt on its own (keeping element 0 of the returned tensor), then
# let the tokenizer pad the whole batch to the longest sequence.
input_ids = [
    {'input_ids': tokenizer.encode(prompt, return_tensors='pt')[0]}
    for prompt in prompts
]
padded = tokenizer.pad(input_ids, padding='longest')

# Generate up to 256 tokens per example and decode without special tokens.
outputs = model.generate(**padded, max_length=256)
tokenizer.batch_decode(outputs, skip_special_tokens=True)

['husein suka makan ayam, dia sgt risikokan',
 'dr mahathir sangat menekankan budaya budak zaman sekarang',
 'cerita tun najib razak',
 'Tun M sukakan',
 'IPOH - Ahli Dewan Undangan Negeri (ADUN) Hulu Kinta, Muhamad Arafat Varisai Mahamad menafikan mesej tular mendakwa beliau akan melompat parti menyokong UMNO membentuk kerajaan negeri di Perak. Beliau yang juga Ketua Penerangan Parti Keadilan Rakyat (PKR) Perak dalam satu mesej ringkas kepada Sinar Harian menjelaskan perkara itu tidak benar sama sekali.',
 'aq x sukalah ape yg tejadi dekat mamat tu',
 'i hate chicken, but i like fish',
 'Hi guys! I noticed semalam & harini dah ramai yang dapat cookies ni kan. So harini i nak share some post mortem of our first batch:']

In [8]:
# Publish the model weights to the Hugging Face Hub under the `mesolitica` organization.
# NOTE(review): newer transformers releases appear to deprecate `organization=` in
# favour of a namespaced repo id - confirm against the installed version before re-running.
model.push_to_hub(
    'finetune-segmentation-t5-tiny-standard-bahasa-cased',
    organization='mesolitica',
)

Cloning https://huggingface.co/mesolitica/finetune-segmentation-t5-tiny-standard-bahasa-cased into local empty directory.


Upload file pytorch_model.bin:   0%|          | 32.0k/133M [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-segmentation-t5-tiny-standard-bahasa-cased
   367db2a..a0a5a29  main -> main



'https://huggingface.co/mesolitica/finetune-segmentation-t5-tiny-standard-bahasa-cased/commit/a0a5a29410ebdd36654d637a7fc51f299ab7ce59'

In [9]:
# Publish the tokenizer files to the same Hub repository name used for the model weights.
# NOTE(review): newer transformers releases appear to deprecate `organization=` in
# favour of a namespaced repo id - confirm against the installed version before re-running.
tokenizer.push_to_hub(
    'finetune-segmentation-t5-tiny-standard-bahasa-cased',
    organization='mesolitica',
)

Upload file spiece.model:   4%|4         | 32.0k/784k [00:00<?, ?B/s]

remote: Scanning LFS files for validity, may be slow...        
remote: LFS file scan complete.        
To https://huggingface.co/mesolitica/finetune-segmentation-t5-tiny-standard-bahasa-cased
   a0a5a29..b77b5c4  main -> main



'https://huggingface.co/mesolitica/finetune-segmentation-t5-tiny-standard-bahasa-cased/commit/b77b5c4377a0607aed4f2441d10e1f930bb14400'

In [8]:
import json

# Load the segmentation test set. Later cells read `data[i][0]` as the model input
# and `data[i][1]` as the reference text.
# The encoding is given explicitly: JSON text is UTF-8, and relying on the platform
# default could mis-decode non-ASCII characters on systems whose default differs.
with open('test-set-segmentation.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

In [9]:
def calculate_cer(actual, hyp):
    """
    Calculate the character error rate (CER) using `python-Levenshtein`.

    Space characters are removed from both strings before they are compared, so
    the score only reflects differences in the non-space characters.

    Parameters
    ----------
    actual : str
        Reference text.
    hyp : str
        Hypothesis (predicted) text.

    Returns
    -------
    float
        Levenshtein distance between the space-stripped strings divided by the
        length of the space-stripped reference. If the reference is empty after
        stripping, 0.0 is returned when the hypothesis is empty too and 1.0
        otherwise.
    """
    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')

    # An empty reference would make the normalisation below divide by zero and
    # abort a long evaluation run; score it as "all correct" or "all wrong" instead.
    if not actual:
        return 1.0 if hyp else 0.0

    # Imported inside the function so the third-party package is only required
    # when a distance actually has to be computed.
    import Levenshtein as Lev

    return Lev.distance(actual, hyp) / len(actual)

def calculate_wer(actual, hyp):
    """
    Calculate the word error rate (WER) using `python-Levenshtein`.

    Both strings are split on whitespace. Every distinct word is mapped to a
    single character so that the character-level Levenshtein distance of the
    encoded strings equals the word-level edit distance.

    Parameters
    ----------
    actual : str
        Reference text.
    hyp : str
        Hypothesis (predicted) text.

    Returns
    -------
    float
        Word-level edit distance divided by the number of reference words. If
        the reference contains no words, 0.0 is returned when the hypothesis has
        none either and 1.0 otherwise.
    """
    actual_words = actual.split()
    hyp_words = hyp.split()

    # A reference without words would make the normalisation below divide by
    # zero; score it as "all correct" or "all wrong" instead.
    if not actual_words:
        return 1.0 if hyp_words else 0.0

    # Imported inside the function so the third-party package is only required
    # when a distance actually has to be computed.
    import Levenshtein as Lev

    # Assign one unique character per distinct word. Which character a word gets
    # does not matter; only equality between positions affects the distance.
    vocab = set(actual_words + hyp_words)
    word2char = {word: chr(index) for index, word in enumerate(vocab)}

    encoded_actual = ''.join(word2char[w] for w in actual_words)
    encoded_hyp = ''.join(word2char[w] for w in hyp_words)

    return Lev.distance(encoded_actual, encoded_hyp) / len(actual_words)

In [10]:
from tqdm import tqdm

# Per-sample word and character error rates, filled in by the loop below.
wer, cer = [], []

# Score the first 10,000 test samples one at a time (batch size 1).
for sample in tqdm(data[:10000]):
    source, actual = sample[0], sample[1]

    encoded = tokenizer.encode(f'segmentasi: {source}', return_tensors='pt')[0]
    input_ids = [{'input_ids': encoded}]
    padded = tokenizer.pad(input_ids, padding='longest')

    # GPU variant (disabled): call `.cuda()` on every tensor in `padded` before generating.
    outputs = model.generate(**padded, max_length=256)
    predicted = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]

    wer.append(calculate_wer(actual, predicted))
    cer.append(calculate_cer(actual, predicted))

100%|██████████████████████████████████████████████████████████████████████████████████████████████| 10000/10000 [2:06:06<00:00,  1.32it/s]


In [12]:
import numpy as np

# Mean WER and mean CER over all evaluated samples, displayed as a (wer, cer) pair.
tuple(np.mean(scores) for scores in (wer, cer))

(0.02078761271712765, 0.0021466911610055553)