In [1]:
from transformers import AutoTokenizer, ByT5Tokenizer

tokenizer = ByT5Tokenizer()
tokenizer

Using bos_token, but it is not set yet.
Using sep_token, but it is not set yet.
Using cls_token, but it is not set yet.
Using mask_token, but it is not set yet.


ByT5Tokenizer(name_or_path='', vocab_size=256, model_max_length=1000000000000000019884624838656, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'eos_token': '</s>', 'unk_token': '<unk>', 'pad_token': '<pad>', 'additional_special_tokens': ['<extra_id_0>', '<extra_id_1>', '<extra_id_2>', '<extra_id_3>', '<extra_id_4>', '<extra_id_5>', '<extra_id_6>', '<extra_id_7>', '<extra_id_8>', '<extra_id_9>', '<extra_id_10>', '<extra_id_11>', '<extra_id_12>', '<extra_id_13>', '<extra_id_14>', '<extra_id_15>', '<extra_id_16>', '<extra_id_17>', '<extra_id_18>', '<extra_id_19>', '<extra_id_20>', '<extra_id_21>', '<extra_id_22>', '<extra_id_23>', '<extra_id_24>', '<extra_id_25>', '<extra_id_26>', '<extra_id_27>', '<extra_id_28>', '<extra_id_29>', '<extra_id_30>', '<extra_id_31>', '<extra_id_32>', '<extra_id_33>', '<extra_id_34>', '<extra_id_35>', '<extra_id_36>', '<extra_id_37>', '<extra_id_38>', '<extra_id_39>', '<extra_id_40>', '<extra_id_41>', '<extra_id_42>', '<extra_i

In [2]:
import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
import random

In [3]:
SOS_token = 0
EOS_token = 2

class EncoderRNN(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(EncoderRNN, self).__init__()
        self.hidden_size = hidden_size

        self.embedding = nn.Embedding(input_size, hidden_size)
        self.gru = nn.LSTM(hidden_size, hidden_size, num_layers = 2, batch_first=True)
        self.dropout = nn.Dropout(dropout_p)

    def forward(self, input):
        embedded = self.dropout(self.embedding(input))
        output, hidden = self.gru(embedded)
        return output, hidden

class DecoderRNN(nn.Module):
    def __init__(self, hidden_size, output_size):
        super(DecoderRNN, self).__init__()
        self.embedding = nn.Embedding(output_size, hidden_size)
        self.gru = nn.LSTM(hidden_size, hidden_size, num_layers = 2, batch_first=True)
        self.out = nn.Linear(hidden_size, output_size)
        
    @property
    def device(self):
        return next(self.parameters()).device

    def forward(self, encoder_outputs, encoder_hidden, target_tensor=None):
        batch_size = encoder_outputs.size(0)
        decoder_input = torch.empty(batch_size, 1, dtype=torch.long, device=self.device).fill_(SOS_token)
        decoder_hidden = encoder_hidden
        decoder_outputs = []

        if target_tensor is not None:
            maxlen = target_tensor.shape[1]
        else:
            maxlen = encoder_outputs.shape[1] * 2

        for i in range(maxlen):
            decoder_output, decoder_hidden  = self.forward_step(decoder_input, decoder_hidden)
            decoder_outputs.append(decoder_output)

            if target_tensor is not None:
                # Teacher forcing: Feed the target as the next input
                decoder_input = target_tensor[:, i].unsqueeze(1) # Teacher forcing
            else:
                # Without teacher forcing: use its own predictions as the next input
                _, topi = decoder_output.topk(1)
                decoder_input = topi.squeeze(-1).detach()  # detach from history as input

        decoder_outputs = torch.cat(decoder_outputs, dim=1)
        decoder_outputs = F.log_softmax(decoder_outputs, dim=-1)
        return decoder_outputs, decoder_hidden, None # We return `None` for consistency in the training loop

    def forward_step(self, input, hidden):
        output = self.embedding(input)
        output = F.relu(output)
        output, hidden = self.gru(output, hidden)
        output = self.out(output)
        return output, hidden

In [4]:
class Model(nn.Module):
    def __init__(self, input_size, hidden_size, dropout_p=0.1):
        super(Model, self).__init__()
        self.encoder = EncoderRNN(input_size, hidden_size, dropout_p=dropout_p)
        self.decoder = DecoderRNN(hidden_size, input_size)
    
    @property
    def device(self):
        return next(self.parameters()).device
    
    def forward(self, input_tensor, target_tensor=None):
        input_tensor = input_tensor.to(self.device)
        if target_tensor is not None:
            target_tensor = target_tensor.to(self.device)
        encoder_out = self.encoder(input_tensor)
        return self.decoder(*encoder_out, target_tensor)
    
    def greedy_decoder(self, input_tensor):
        input_tensor = input_tensor.to(self.device)
        with torch.no_grad():
            encoder_out = self.encoder(input_tensor)
            decoder_outputs, decoder_hidden, _ = self.decoder(*encoder_out)
            return decoder_outputs

In [5]:
model = Model(tokenizer.vocab_size, 512)
_ = model.cuda()

In [6]:
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
trainer = torch.optim.Adam(trainable_parameters, lr = 5e-4)
criterion = nn.NLLLoss()

In [7]:
import json
from sklearn.utils import shuffle

with open('train_noisy_stem.json') as fopen:
    noisy = json.load(fopen)
    
with open('train_stem.json') as fopen:
    data = json.load(fopen)
    
X = data['X'] + noisy['X']
Y = data['Y'] + noisy['Y']
X, Y = shuffle(X, Y)

In [8]:
with open('test_noisy_stem.json') as fopen:
    noisy = json.load(fopen)
    
with open('test_stem.json') as fopen:
    data = json.load(fopen)
    
test_X = data['X'] + noisy['X']
test_Y = data['Y'] + noisy['Y']
test_X, test_Y = shuffle(test_X, test_Y)

In [17]:
strings = [f'{s}' for s in X[:10]]
targets = [f'{s}' for s in Y[:10]]
input_ids = tokenizer(strings, padding = True, return_tensors = 'pt')['input_ids']
targets = tokenizer(targets, padding = True, return_tensors = 'pt')['input_ids']
labels = torch.clone(targets)
labels[targets==0] = -100
labels = labels.to(model.device)

In [13]:
out = model(input_ids, targets)
decoder_outputs = out[0]

In [14]:
loss = criterion(
        decoder_outputs.view(-1, decoder_outputs.size(-1)),
        labels.view(-1)
    )
loss

tensor(5.5357, device='cuda:0', grad_fn=<NllLossBackward0>)

In [16]:
# model.greedy_decoder(input_ids)

In [37]:
input_ids[0]

tensor([103, 103, 103, 108, 118, 100, 103, 100, 115,   1,   0,   0,   0,   0,
          0])

In [39]:
targets[0]

tensor([118, 100, 103, 100, 115,   1,   0,   0,   0,   0,   0,   0,   0,   0])

In [40]:
from tqdm import tqdm
import numpy as np

batch_size = 32
epoch = 100

best_dev_acc = -np.inf
patient = 2
current_patient = 0

for e in range(epoch):
    pbar = tqdm(range(0, len(X), batch_size))
    losses = []
    for i in pbar:
        trainer.zero_grad()
        x = X[i: i + batch_size]
        y = Y[i: i + batch_size]
        strings = [s for s in x]
        targets = [s for s in y]
        input_ids = tokenizer(strings, padding = True, return_tensors = 'pt')['input_ids']
        targets = tokenizer(targets, padding = True, return_tensors = 'pt')['input_ids']
        labels = torch.clone(targets)
        labels[targets==0] = -100
        labels = labels.to(model.device)
        
        out = model(input_ids, targets)
        decoder_outputs = out[0]
        
        loss = criterion(
                decoder_outputs.view(-1, decoder_outputs.size(-1)),
                labels.view(-1)
            )
        loss.backward()
        trainer.step()
        
        p = decoder_outputs.detach().cpu().numpy().argmax(axis = -1)
        l = labels.detach().cpu().numpy()
        mask = l != -100
        accuracy = ((p == l)[mask]).mean()
        losses.append(accuracy)
        pbar.set_description(f"loss {loss}, accuracy {accuracy}")
    
    pbar = tqdm(range(0, len(test_X[:10000]), batch_size))
    dev_predicted = []
    for i in pbar:
        x = test_X[i: i + batch_size]
        y = test_Y[i: i + batch_size]
        strings = [s for s in x]
        targets = [s for s in y]
        input_ids = tokenizer(strings, padding = True, return_tensors = 'pt')['input_ids']
        targets = tokenizer(targets, padding = True, return_tensors = 'pt')['input_ids']
        labels = torch.clone(targets)
        labels[targets==0] = -100
        labels = labels.to(model.device)
        
        out = model(input_ids, targets)
        decoder_outputs = out[0]
        
        loss = criterion(
                decoder_outputs.view(-1, decoder_outputs.size(-1)),
                labels.view(-1)
            )
        
        p = decoder_outputs.detach().cpu().numpy().argmax(axis = -1)
        l = labels.detach().cpu().numpy()
        mask = l != -100
        accuracy = ((p == l)[mask]).mean()
        dev_predicted.append(accuracy)
        pbar.set_description(f"loss {loss}, accuracy {accuracy}")
    
    dev_predicted = np.mean(dev_predicted)
    
    print(f'epoch: {e}, accuracy: {np.mean(losses)}, dev_predicted: {dev_predicted}')
    
    if dev_predicted >= best_dev_acc:
        best_dev_acc = dev_predicted
        current_patient = 0
    else:
        current_patient += 1
    
    if current_patient >= patient:
        break

loss 8.212082320824265e-05, accuracy 1.0: 100%|█| 91780/91780 [31:59<00:00, 47.8
loss 0.025158653035759926, accuracy 0.984251968503937:  45%|▍| 141/313 [00:01<00IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

loss 0.013289958238601685, accuracy 0.9959349593495935:  27%|▎| 24851/91780 [08:IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)

loss 0.04349181428551674, accuracy 0.987603305785124:  55%|▌| 50227/91780 [17:28IOPub message rate exceeded.


KeyboardInterrupt: 

In [22]:
def calculate_cer(actual, hyp):
    """
    Calculate CER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    actual = actual.replace(' ', '')
    hyp = hyp.replace(' ', '')
    return Lev.distance(actual, hyp) / len(actual)


def calculate_wer(actual, hyp):
    """
    Calculate WER using `python-Levenshtein`.
    """
    import Levenshtein as Lev

    b = set(actual.split() + hyp.split())
    word2char = dict(zip(b, range(len(b))))

    w1 = [chr(word2char[w]) for w in actual.split()]
    w2 = [chr(word2char[w]) for w in hyp.split()]

    return Lev.distance(''.join(w1), ''.join(w2)) / len(actual.split())


In [23]:
from tqdm import tqdm
import numpy as np

In [55]:
cers, wers = [], []

batch_size = 4
for i in tqdm(range(0, len(test_X), batch_size)):
    x = test_X[i: i + batch_size]
    y = test_Y[i: i + batch_size]
    strings = [s for s in x]
    input_ids = tokenizer(strings, padding = True, return_tensors = 'pt')['input_ids']
    out = model.greedy_decoder(input_ids)
    argmax = out.detach().cpu().numpy().argmax(axis = -1)
    results = []
    for a in argmax:
        results_ = []
        for a_ in a:
            if a_ == tokenizer.eos_token_id:
                break
            else:
                results_.append(a_)
        results.append(results_)
    out = tokenizer.batch_decode(results)
    for no, y_ in enumerate(y):
        cers.append(calculate_cer(y_, out[no]))
        wers.append(calculate_wer(y_, out[no]))

100%|██████████████████████████████████| 118204/118204 [14:55<00:00, 132.04it/s]


In [56]:
np.mean(cers), np.mean(wers)

(0.02549779186652238, 0.05448552235248484)

In [59]:
torch.save(model.state_dict(), 'lstm-2-512.pt')

In [58]:
tokenizer.push_to_hub('mesolitica/stem-lstm-512')

CommitInfo(commit_url='https://huggingface.co/mesolitica/stem-lstm-512/commit/bce6de34f8480ee3a6881ec8c285d3714abefed8', commit_message='Upload tokenizer', commit_description='', oid='bce6de34f8480ee3a6881ec8c285d3714abefed8', pr_url=None, pr_revision=None, pr_num=None)

In [60]:
from malaya_boilerplate.huggingface import upload_dict

In [61]:
upload_dict('stem-lstm-512', {'lstm-2-512.pt': 'model.pt'}, username = 'mesolitica')

409 Client Error: Conflict for url: https://huggingface.co/api/repos/create (Request ID: Root=1-65266bf2-17c7641474c6278f225b304e;4bcf94bb-96ce-460f-a3c5-434e4a16242a)

You already created this model repo


lstm-2-512.pt:   0%|          | 0.00/35.2M [00:00<?, ?B/s]