In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
from tqdm import tqdm
import json
import random
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, T5Config
from malaya.torch_model.t5 import T5ForTokenClassification

  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))


In [3]:
with open('prepared.json') as fopen:
    data = json.load(fopen)

In [4]:
import itertools
merged = list(itertools.chain(*(data['train_Y'] + data['test_Y'])))

In [5]:
labels = ['OTHER'] + sorted(set(merged))
labels_tag = {i: no for no, i in enumerate(labels)}
label_list = list(labels_tag.keys())
label_list

['OTHER',
 'ADJ',
 'ADP',
 'ADV',
 'AUX',
 'CCONJ',
 'DET',
 'NOUN',
 'NUM',
 'PART',
 'PRON',
 'PROPN',
 'PUNCT',
 'SCONJ',
 'SYM',
 'VERB',
 'X']

In [6]:
config = T5Config.from_pretrained('mesolitica/translation-t5-tiny-standard-bahasa-cased')
config.num_labels = len(labels_tag)
config.vocab = labels_tag
config.rev_vocab = {v: k for v, k in labels_tag.items()}

In [7]:
model = T5ForTokenClassification.from_pretrained('mesolitica/translation-t5-tiny-standard-bahasa-cased', config = config)
_ = model.cuda()

Some weights of T5ForTokenClassification were not initialized from the model checkpoint at mesolitica/translation-t5-tiny-standard-bahasa-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [8]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/translation-t5-tiny-standard-bahasa-cased', add_prefix_space = True)

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabular

In [9]:
train = []
for i in range(len(data['train_X'][:200000])):
    if len(data['train_X'][i]) != len(data['train_Y'][i]):
        continue
        
    train.append({
        'tokens': data['train_X'][i],
        'ner_tags': [labels_tag[t] for t in data['train_Y'][i]]
    })
    
for i in range(len(data['train_X'][200000:400000])):
    if len(data['train_X'][i]) != len(data['train_Y'][i]):
        continue
    
    train.append({
        'tokens': [t.lower() for t in data['train_X'][i]],
        'ner_tags': [labels_tag[t] for t in data['train_Y'][i]]
    })
    
random.shuffle(train)
train = pd.DataFrame(train).to_dict(orient = 'list')
len(train['tokens'])

400000

In [10]:
test = []
for i in range(len(data['test_X'][:2000])):
    test.append({
        'tokens': data['test_X'][i],
        'ner_tags': [labels_tag[t] for t in data['test_Y'][i]]
    })
    
for i in range(len(data['test_X'][2000:4000])):
    test.append({
        'tokens': [t.lower() for t in data['test_X'][i]],
        'ner_tags': [labels_tag[t] for t in data['test_Y'][i]]
    })
    
random.shuffle(test)
test = pd.DataFrame(test).to_dict(orient = 'list')
len(test['tokens'])

4000

In [11]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [12]:
train = tokenize_and_align_labels(train)
test = tokenize_and_align_labels(test)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [13]:
def padding(x, y):
    padded = tokenizer.pad([{'input_ids': x_} for x_ in x], return_tensors = 'pt')
    sequence_length = padded['input_ids'].shape[1]
    labels = [l + [-100] * (sequence_length - len(l)) for l in y]
    labels = np.array(labels)
    padded['labels'] = torch.from_numpy(labels)
    for k in padded.keys():
        padded[k] = padded[k].cuda()
    return padded

In [14]:
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)

In [15]:
len(train['input_ids'])

400000

In [16]:
i = 0
batch_size = 5
x = test['input_ids'][i: i + batch_size]
y = test['labels'][i: i + batch_size]
padded = padding(x, y)

loss, pred = model(**padded)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [17]:
import evaluate

seqeval = evaluate.load("seqeval")

In [18]:
batch_size = 16
epoch = 100

best_dev_acc = -np.inf
patient = 1
current_patient = 0

for e in range(epoch):
    pbar = tqdm(range(0, len(train['input_ids']), batch_size))
    losses = []
    for i in pbar:
        trainer.zero_grad()
        x = train['input_ids'][i: i + batch_size]
        y = train['labels'][i: i + batch_size]
        padded = padding(x, y)
            
        loss, pred = model(**padded)
        loss.backward()
        
        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)
        trainer.step()
        losses.append(float(loss))
        
    dev_predicted = []
    for i in range(0, len(test['input_ids']), batch_size):
        x = test['input_ids'][i: i + batch_size]
        y = test['labels'][i: i + batch_size]
        padded = padding(x, y)
        
        loss, pred = model(**padded)
        predictions = pred.detach().cpu().numpy().argmax(axis = 2).tolist()
        dev_predicted.extend(predictions)
    
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(dev_predicted, test['labels'])
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(dev_predicted, test['labels'])
    ]
    
    dev_predicted = seqeval.compute(predictions=true_predictions, references=true_labels)['overall_f1']
    
    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_predicted: {dev_predicted}')
    
    if dev_predicted >= best_dev_acc:
        best_dev_acc = dev_predicted
        current_patient = 0
        model.save_pretrained('tiny')
    else:
        current_patient += 1
    
    if current_patient >= patient:
        break

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [09:17<00:00, 44.83it/s]


epoch: 0, loss: 0.1522444359567389, dev_predicted: 0.9406257698940625


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [09:16<00:00, 44.93it/s]


epoch: 1, loss: 0.0923361289499607, dev_predicted: 0.9408393632416786


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████| 25000/25000 [05:58<00:00, 69.79it/s]


epoch: 2, loss: 0.06339012648392701, dev_predicted: 0.9403532155948017


In [19]:
model_ = T5ForTokenClassification.from_pretrained('tiny')
_ = model_.cuda()

In [20]:
dev_predicted = []
for i in range(0, len(test['input_ids']), batch_size):
    x = test['input_ids'][i: i + batch_size]
    y = test['labels'][i: i + batch_size]
    padded = padding(x, y)

    loss, pred = model_(**padded)
    predictions = pred.detach().cpu().numpy().argmax(axis = 2).tolist()
    dev_predicted.extend(predictions)

true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(dev_predicted, test['labels'])
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(dev_predicted, test['labels'])
]

seqeval.compute(predictions=true_predictions, references=true_labels)

{'ART': {'precision': 0.8938547486033519,
  'recall': 0.9411764705882353,
  'f1': 0.9169054441260744,
  'number': 170},
 'CONJ': {'precision': 0.9713905522288756,
  'recall': 0.9785522788203753,
  'f1': 0.974958263772955,
  'number': 1492},
 'DJ': {'precision': 0.9192897497982244,
  'recall': 0.88984375,
  'f1': 0.9043271139341008,
  'number': 1280},
 'DP': {'precision': 0.9770908087220536,
  'recall': 0.9844271412680756,
  'f1': 0.9807452555755645,
  'number': 3596},
 'DV': {'precision': 0.9478672985781991,
  'recall': 0.9523809523809523,
  'f1': 0.9501187648456056,
  'number': 1260},
 'ERB': {'precision': 0.9654357459379616,
  'recall': 0.9662921348314607,
  'f1': 0.9658637505541599,
  'number': 3382},
 'ET': {'precision': 0.9603854389721628,
  'recall': 0.9542553191489361,
  'f1': 0.9573105656350054,
  'number': 940},
 'OUN': {'precision': 0.8789933694996986,
  'recall': 0.8976608187134503,
  'f1': 0.8882290239074159,
  'number': 6498},
 'RON': {'precision': 0.9888991674375578,
  'r

In [21]:
model_.push_to_hub('mesolitica/pos-t5-tiny-standard-bahasa-cased', safe_serialization = True)

model.safetensors:   0%|          | 0.00/84.7M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/pos-t5-tiny-standard-bahasa-cased/commit/510e71534bfd2a185b37f0f139b58122ae4be901', commit_message='Upload T5ForTokenClassification', commit_description='', oid='510e71534bfd2a185b37f0f139b58122ae4be901', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
tokenizer.push_to_hub('mesolitica/pos-t5-tiny-standard-bahasa-cased')

spiece.model:   0%|          | 0.00/803k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/pos-t5-tiny-standard-bahasa-cased/commit/fd551136cfc3317249884e51ae71fa80784d5684', commit_message='Upload tokenizer', commit_description='', oid='fd551136cfc3317249884e51ae71fa80784d5684', pr_url=None, pr_revision=None, pr_num=None)