In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
from tqdm import tqdm
import json
import random
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, T5Config
from malaya.torch_model.t5 import T5ForTokenClassification

2023-09-24 01:32:20.757854: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-24 01:32:20.833507: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-24 01:32:21.281073: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-24 01:32:21.281116: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [3]:
labels_tag = {
    'OTHER': 0,
    'law': 1,
    'location': 2,
    'organization': 3,
    'person': 4,
    'quantity': 5,
    'time': 6,
    'event': 7,
}
label_list = list(labels_tag.keys())
label_list

['OTHER',
 'law',
 'location',
 'organization',
 'person',
 'quantity',
 'time',
 'event']

In [4]:
config = T5Config.from_pretrained('mesolitica/nanot5-base-malaysian-cased')
config.num_labels = len(labels_tag)
config.vocab = labels_tag
config.rev_vocab = {v: k for v, k in labels_tag.items()}

In [5]:
model = T5ForTokenClassification.from_pretrained('mesolitica/nanot5-base-malaysian-cased', config = config)
_ = model.cuda()

Some weights of T5ForTokenClassification were not initialized from the model checkpoint at mesolitica/nanot5-base-malaysian-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/nanot5-base-malaysian-cased', add_prefix_space = True)

In [7]:
with open('prepared.json') as fopen:
    data = json.load(fopen)

In [8]:
train = []
for i in range(len(data['train_X'])):
    if len(data['train_X'][i]) != len(data['train_Y'][i]):
        continue
        
    train.append({
        'tokens': data['train_X'][i],
        'ner_tags': [labels_tag[t] for t in data['train_Y'][i]]
    })
    
    train.append({
        'tokens': [t.lower() for t in data['train_X'][i]],
        'ner_tags': [labels_tag[t] for t in data['train_Y'][i]]
    })
    
random.shuffle(train)
train = pd.DataFrame(train).to_dict(orient = 'list')
len(train['tokens'])

28056

In [9]:
test = []
for i in range(len(data['test_X'])):
    test.append({
        'tokens': data['test_X'][i],
        'ner_tags': [labels_tag[t] for t in data['test_Y'][i]]
    })
    
    test.append({
        'tokens': [t.lower() for t in data['test_X'][i]],
        'ner_tags': [labels_tag[t] for t in data['test_Y'][i]]
    })
    
random.shuffle(test)
test = pd.DataFrame(test).to_dict(orient = 'list')
len(test['tokens'])

6260

In [10]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
def tokenize_and_align_predict(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)
    tags = [[1] * len(t) for t in examples["tokens"]]

    labels = []
    for i, label in enumerate(tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [12]:
train = tokenize_and_align_labels(train)
test = tokenize_and_align_labels(test)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [13]:
train.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [14]:
def padding(x, y):
    padded = tokenizer.pad([{'input_ids': x_} for x_ in x], return_tensors = 'pt')
    sequence_length = padded['input_ids'].shape[1]
    labels = [l + [-100] * (sequence_length - len(l)) for l in y]
    labels = np.array(labels)
    padded['labels'] = torch.from_numpy(labels)
    for k in padded.keys():
        padded[k] = padded[k].cuda()
    return padded

In [15]:
# model(**padded)[0]

In [16]:
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)

In [17]:
import evaluate

seqeval = evaluate.load("seqeval")

In [18]:
batch_size = 16
epoch = 100

best_dev_acc = -np.inf
patient = 1
current_patient = 0

for e in range(epoch):
    pbar = tqdm(range(0, len(train['input_ids']), batch_size))
    losses = []
    for i in pbar:
        trainer.zero_grad()
        x = train['input_ids'][i: i + batch_size]
        y = train['labels'][i: i + batch_size]
        padded = padding(x, y)
            
        loss, pred = model(**padded)
        loss.backward()
        
        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)
        trainer.step()
        losses.append(float(loss))
        
    dev_predicted = []
    for i in range(0, len(test['input_ids']), batch_size):
        x = test['input_ids'][i: i + batch_size]
        y = test['labels'][i: i + batch_size]
        padded = padding(x, y)
        
        loss, pred = model(**padded)
        predictions = pred.detach().cpu().numpy().argmax(axis = 2).tolist()
        dev_predicted.extend(predictions)
    
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(dev_predicted, test['labels'])
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(dev_predicted, test['labels'])
    ]
    
    dev_predicted = seqeval.compute(predictions=true_predictions, references=true_labels)['overall_f1']
    
    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_predicted: {dev_predicted}')
    
    if dev_predicted >= best_dev_acc:
        best_dev_acc = dev_predicted
        current_patient = 0
        model.save_pretrained('base')
    else:
        current_patient += 1
    
    if current_patient >= patient:
        break

  0%|                                                  | 0/1754 [00:00<?, ?it/s]You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.
100%|███████████████████████████████████████| 1754/1754 [03:48<00:00,  7.68it/s]


epoch: 0, loss: 0.04070882041368606, dev_predicted: 0.9623318493273973


100%|███████████████████████████████████████| 1754/1754 [03:49<00:00,  7.63it/s]


epoch: 1, loss: 0.008909668091530201, dev_predicted: 0.9635613902213247


100%|███████████████████████████████████████| 1754/1754 [03:50<00:00,  7.61it/s]


epoch: 2, loss: 0.007211462933079131, dev_predicted: 0.9629450581325083


In [19]:
model_ = T5ForTokenClassification.from_pretrained('base')
_ = model_.cuda()

In [20]:
model_.push_to_hub('mesolitica/ner-nanot5-base-malaysian-cased', safe_serialization = True)

model.safetensors:   0%|          | 0.00/439M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/ner-nanot5-base-malaysian-cased/commit/0121280c58e23dadf9aa05827de60d3ee4501390', commit_message='Upload T5ForTokenClassification', commit_description='', oid='0121280c58e23dadf9aa05827de60d3ee4501390', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
tokenizer.push_to_hub('mesolitica/ner-nanot5-base-malaysian-cased')

CommitInfo(commit_url='https://huggingface.co/mesolitica/ner-nanot5-base-malaysian-cased/commit/c897ceb78f0653b4ebb72a06943f605adb202d93', commit_message='Upload tokenizer', commit_description='', oid='c897ceb78f0653b4ebb72a06943f605adb202d93', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
dev_predicted = []
for i in range(0, len(test['input_ids']), batch_size):
    x = test['input_ids'][i: i + batch_size]
    y = test['labels'][i: i + batch_size]
    padded = padding(x, y)

    loss, pred = model_(**padded)
    predictions = pred.detach().cpu().numpy().argmax(axis = 2).tolist()
    dev_predicted.extend(predictions)

true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(dev_predicted, test['labels'])
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(dev_predicted, test['labels'])
]

seqeval.compute(predictions=true_predictions, references=true_labels)

{'aw': {'precision': 0.9117832388153749,
  'recall': 0.9359637774902976,
  'f1': 0.9237152888605171,
  'number': 1546},
 'erson': {'precision': 0.9695515018515979,
  'recall': 0.9805798307670966,
  'f1': 0.9750344827586206,
  'number': 14418},
 'ime': {'precision': 0.9671925870272978,
  'recall': 0.971327967806841,
  'f1': 0.9692558664826202,
  'number': 3976},
 'ocation': {'precision': 0.9687299207539087,
  'recall': 0.9783690244430023,
  'f1': 0.9735256134309083,
  'number': 9246},
 'rganization': {'precision': 0.9475433183004984,
  'recall': 0.9610014443909485,
  'f1': 0.9542249312776384,
  'number': 8308},
 'uantity': {'precision': 0.966433050534858,
  'recall': 0.9597069597069597,
  'f1': 0.9630582613490167,
  'number': 2730},
 'vent': {'precision': 0.8358974358974359,
  'recall': 0.8504347826086956,
  'f1': 0.8431034482758621,
  'number': 1150},
 'overall_precision': 0.9585715310833114,
 'overall_recall': 0.96860347077875,
 'overall_f1': 0.9635613902213247,
 'overall_accuracy': 0

In [23]:
string = 'husein makan ayam di kfc'
tokens = string.split()

In [24]:
def tokenize_and_align_predict(tokens):
    tokenized_inputs = tokenizer([tokens], truncation=True, is_split_into_words=True)
    tags = [[1] * len(t) for t in [tokens]]

    labels = []
    for i, label in enumerate(tags):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    return tokenized_inputs, labels[0]

In [25]:
padded, indices = tokenize_and_align_predict(tokens)

In [26]:
for k in padded.keys():
    padded[k] = torch.from_numpy(np.array(padded[k])).cuda()

In [27]:
o = model_(**padded)[0]
o = o.detach().cpu().numpy()[0].argmax(axis = 1)
o

array([4, 4, 4, 0, 0, 0, 2, 2])

In [28]:
filtered = [o[i] for i in range(len(o)) if indices[i] != -100]
filtered

[4, 0, 0, 0, 2]

In [29]:
tokens

['husein', 'makan', 'ayam', 'di', 'kfc']