In [1]:
from transformers import AutoTokenizer, ByT5Tokenizer

In [2]:
tokenizer = ByT5Tokenizer()
tokenizer

Using bos_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


ByT5Tokenizer(name_or_path='', vocab_size=256, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_i

In [3]:
import json

with open('train-syllable.json') as fopen:
    data = json.load(fopen)
    
X = [d[0] for d in data]
Y = [d[1] for d in data]
len(X)

37079

In [4]:
with open('test-syllable.json') as fopen:
    data = json.load(fopen)
    
test_X = [d[0] for d in data]
test_Y = [d[1] for d in data]
len(test_X)

1952

In [5]:
tokenizer.vocab_size

256

In [6]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F

In [7]:
SOS_token = 0
EOS_token = 2

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.LSTM(hidden_size, hidden_size, num_layers = 2, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.LSTM(hidden_size, hidden_size, num_layers = 2, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        
    @property
    def device(self):
        return next(self.parameters()).device

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=self.device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        if target_tensor is not None:
            maxlen = target_tensor.shape[1]
        else:
            maxlen = encoder_outputs.shape[1] * 2

        for i in range(maxlen):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [8]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(Model, self).__init__()
        self.encoder = EncoderRNN(input_size, hidden_size, dropout_p=dropout_p)
        self.decoder = DecoderRNN(hidden_size, input_size)
    
    @property
    def device(self):
        return next(self.parameters()).device
    
    def forward(self, input_tensor, target_tensor=None):
        input_tensor = input_tensor.to(self.device)
        if target_tensor is not None:
            target_tensor = target_tensor.to(self.device)
        encoder_out = self.encoder(input_tensor)
        return self.decoder(*encoder_out, target_tensor)
    
    def greedy_decoder(self, input_tensor):
        input_tensor = input_tensor.to(self.device)
        with torch.no_grad():
            encoder_out = self.encoder(input_tensor)
            decoder_outputs, decoder_hidden, _ = self.decoder(*encoder_out)
            return decoder_outputs

In [9]:
model = Model(tokenizer.vocab_size, 512)
_ = model.cuda()

In [10]:
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
trainer = torch.optim.Adam(trainable_parameters, lr = 5e-4)
criterion = nn.NLLLoss()

In [11]:
strings = [f'{s}</s>' for s in X[:10]]
targets = [f'{s}</s>' for s in Y[:10]]
input_ids = tokenizer(strings, padding = True, return_tensors = 'pt')['input_ids']
targets = tokenizer(targets, padding = True, return_tensors = 'pt')['input_ids']
labels = torch.clone(targets)
labels[targets==0] = -100
labels = labels.to(model.device)



In [12]:
out = model(input_ids, targets)
decoder_outputs = out[0]

In [13]:
out = model.greedy_decoder(input_ids)

In [14]:
loss = criterion(
        decoder_outputs.view(-1, decoder_outputs.size(-1)),
        labels.view(-1)
    )
loss

tensor(5.5530, device='cuda:0', grad_fn=<NllLossBackward0>)

In [17]:
!ls -lh

total 54M
-rw-r--r-- 1 husein husein  34M Okt  11 08:18 gru-bahdanau-512.pt
-rw-r--r-- 1 husein husein  20M Okt  11 08:14 lstm-2-384.pt
-rw-r--r-- 1 husein husein  42K Okt  11 08:17 lstm-384.ipynb
-rw-r--r-- 1 husein husein  42K Okt  11 08:17 lstm-512.ipynb
-rw-rw-r-- 1 husein husein  544 Sep  17  2022 README.md
drwxr-xr-x 2 husein husein 4.0K Okt  10 00:47 super-tiny
-rw-r--r-- 1 husein husein  17K Okt  10 00:59 super-tiny.ipynb
-rw-r--r-- 1 husein husein  55K Okt  10 00:38 test-syllable.json
-rw-rw-r-- 1 husein husein  42K Sep  17  2022 train-syllable.ipynb
-rw-r--r-- 1 husein husein 1.1M Okt  10 00:38 train-syllable.json


In [41]:
_ = model.train()

In [42]:
from tqdm import tqdm
import numpy as np

batch_size = 32
epoch = 100

best_dev_acc = -np.inf
patient = 2
current_patient = 0

for e in range(epoch):
    pbar = tqdm(range(0, len(X), batch_size))
    losses = []
    for i in pbar:
        trainer.zero_grad()
        x = X[i: i + batch_size]
        y = Y[i: i + batch_size]
        strings = [f'{s}</s>' for s in x]
        targets = [f'{s}</s>' for s in y]
        input_ids = tokenizer(strings, padding = True, return_tensors = 'pt')['input_ids']
        targets = tokenizer(targets, padding = True, return_tensors = 'pt')['input_ids']
        labels = torch.clone(targets)
        labels[targets==0] = -100
        labels = labels.to(model.device)
        
        out = model(input_ids, targets)
        decoder_outputs = out[0]
        
        loss = criterion(
                decoder_outputs.view(-1, decoder_outputs.size(-1)),
                labels.view(-1)
            )
        loss.backward()
        trainer.step()
        
        p = decoder_outputs.detach().cpu().numpy().argmax(axis = -1)
        l = labels.detach().cpu().numpy()
        mask = l != -100
        accuracy = ((p == l)[mask]).mean()
        losses.append(accuracy)
        pbar.set_description(f"loss {loss}, accuracy {accuracy}")
    
    pbar = tqdm(range(0, len(test_X), batch_size))
    dev_predicted = []
    for i in pbar:
        x = test_X[i: i + batch_size]
        y = test_Y[i: i + batch_size]
        strings = [f'{s}</s>' for s in x]
        targets = [f'{s}</s>' for s in y]
        input_ids = tokenizer(strings, padding = True, return_tensors = 'pt')['input_ids']
        targets = tokenizer(targets, padding = True, return_tensors = 'pt')['input_ids']
        labels = torch.clone(targets)
        labels[targets==0] = -100
        labels = labels.to(model.device)
        
        out = model(input_ids, targets)
        decoder_outputs = out[0]
        
        loss = criterion(
                decoder_outputs.view(-1, decoder_outputs.size(-1)),
                labels.view(-1)
            )
        
        p = decoder_outputs.detach().cpu().numpy().argmax(axis = -1)
        l = labels.detach().cpu().numpy()
        mask = l != -100
        accuracy = ((p == l)[mask]).mean()
        dev_predicted.append(accuracy)
        pbar.set_description(f"loss {loss}, accuracy {accuracy}")
    
    dev_predicted = np.mean(dev_predicted)
    
    print(f'epoch: {e}, accuracy: {np.mean(losses)}, dev_predicted: {dev_predicted}')
    
    if dev_predicted >= best_dev_acc:
        best_dev_acc = dev_predicted
        current_patient = 0
    else:
        current_patient += 1
    
    if current_patient >= patient:
        break

loss 0.012772278860211372, accuracy 0.992619926199262: 100%|█| 1159/1159 [00:24<
loss 0.01718957908451557, accuracy 0.9946236559139785: 100%|█| 61/61 [00:00<00:0


epoch: 0, accuracy: 0.997514750060473, dev_predicted: 0.9943462508219524


loss 0.010656280443072319, accuracy 0.996309963099631: 100%|█| 1159/1159 [00:24<
loss 0.008538487367331982, accuracy 0.9946236559139785: 100%|█| 61/61 [00:00<00:


epoch: 1, accuracy: 0.9976285521766356, dev_predicted: 0.9932315850973804


loss 0.01699027791619301, accuracy 0.992619926199262: 100%|█| 1159/1159 [00:24<0
loss 0.006666738074272871, accuracy 0.9973118279569892: 100%|█| 61/61 [00:00<00:

epoch: 2, accuracy: 0.9979525024543698, dev_predicted: 0.9940706215470508





In [43]:
_ = model.eval()

In [44]:
strings = ['kelemahan', 'mengilukan', 'angan-angan']
strings = [f'{s}</s>' for s in strings]
input_ids = tokenizer(strings, padding = True, return_tensors = 'pt')['input_ids']
out = model.greedy_decoder(input_ids)

In [45]:
tokenizer.batch_decode(out.detach().cpu().numpy().argmax(axis = -1), skip_special_tokens = True)

['ke.le.ma.han', 'me.ngi.lu.kan', 'a.ngan.a.ngan']

In [46]:
def calculate_cer(actual, hyp):
    """
    Calculate CER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')
    return Lev.distance(actual, hyp) / len(actual)


def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    b = set(actual.split() + hyp.split())
    word2char = dict(zip(b, range(len(b))))

    w1 = [chr(word2char[w]) for w in actual.split()]
    w2 = [chr(word2char[w]) for w in hyp.split()]

    return Lev.distance(''.join(w1), ''.join(w2)) / len(actual.split())


In [47]:
_ = model.eval()

In [48]:
cers, wers = [], []

for i in tqdm(range(0, len(test_X), 1)):
    x = test_X[i: i + 1]
    strings = [f'{s}</s>' for s in x]
    input_ids = tokenizer(strings, padding = True, return_tensors = 'pt')['input_ids']
    out = model.greedy_decoder(input_ids)
    out = tokenizer.batch_decode(out.detach().cpu().numpy().argmax(axis = -1), skip_special_tokens = True)
    cers.append(calculate_cer(test_Y[i], out[0]))
    wers.append(calculate_wer(test_Y[i], out[0]))

100%|██████████████████████████████████████| 1952/1952 [00:13<00:00, 141.82it/s]


In [49]:
np.mean(cers), np.mean(wers)

(0.011996584781229728, 0.06915983606557377)

In [50]:
torch.save(model.state_dict(), 'lstm-2-384.pt')

In [51]:
tokenizer.push_to_hub('mesolitica/syllable-lstm')

CommitInfo(commit_url='https://huggingface.co/mesolitica/syllable-lstm/commit/963000b0f2ef2921935d9fdf807a34e6b57f7c94', commit_message='Upload tokenizer', commit_description='', oid='963000b0f2ef2921935d9fdf807a34e6b57f7c94', pr_url=None, pr_revision=None, pr_num=None)

In [52]:
from malaya_boilerplate.huggingface import upload_dict

In [53]:
upload_dict('syllable-lstm', {'lstm-2-384.pt': 'model.pt'}, username = 'mesolitica')

409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-6525ebba-5cdf29ba00a1760753c8cc05;e309c471-b9c7-4768-8a91-09478d69747d)

You already created this model repo


lstm-2-384.pt:   0%|          | 0.00/35.2M [00:00<?, ?B/s]