In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
from tqdm import tqdm
import json
import random
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, T5Config
from malaya.torch_model.t5 import T5ForTokenClassification

2023-09-24 01:31:24.079316: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-24 01:31:24.148152: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-24 01:31:24.584520: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-24 01:31:24.584552: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [3]:
labels_tag = {
    'OTHER': 0,
    'law': 1,
    'location': 2,
    'organization': 3,
    'person': 4,
    'quantity': 5,
    'time': 6,
    'event': 7,
}
label_list = list(labels_tag.keys())
label_list

['OTHER',
 'law',
 'location',
 'organization',
 'person',
 'quantity',
 'time',
 'event']

In [4]:
config = T5Config.from_pretrained('mesolitica/nanot5-small-malaysian-cased')
config.num_labels = len(labels_tag)
config.vocab = labels_tag
config.rev_vocab = {v: k for v, k in labels_tag.items()}

In [5]:
model = T5ForTokenClassification.from_pretrained('mesolitica/nanot5-small-malaysian-cased', config = config)
_ = model.cuda()

Some weights of T5ForTokenClassification were not initialized from the model checkpoint at mesolitica/nanot5-small-malaysian-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/nanot5-base-malaysian-cased', add_prefix_space = True)

In [7]:
with open('prepared.json') as fopen:
    data = json.load(fopen)

In [8]:
train = []
for i in range(len(data['train_X'])):
    if len(data['train_X'][i]) != len(data['train_Y'][i]):
        continue
        
    train.append({
        'tokens': data['train_X'][i],
        'ner_tags': [labels_tag[t] for t in data['train_Y'][i]]
    })
    
    train.append({
        'tokens': [t.lower() for t in data['train_X'][i]],
        'ner_tags': [labels_tag[t] for t in data['train_Y'][i]]
    })
    
random.shuffle(train)
train = pd.DataFrame(train).to_dict(orient = 'list')
len(train['tokens'])

28056

In [9]:
test = []
for i in range(len(data['test_X'])):
    test.append({
        'tokens': data['test_X'][i],
        'ner_tags': [labels_tag[t] for t in data['test_Y'][i]]
    })
    
    test.append({
        'tokens': [t.lower() for t in data['test_X'][i]],
        'ner_tags': [labels_tag[t] for t in data['test_Y'][i]]
    })
    
random.shuffle(test)
test = pd.DataFrame(test).to_dict(orient = 'list')
len(test['tokens'])

6260

In [10]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
train = tokenize_and_align_labels(train)
test = tokenize_and_align_labels(test)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
train.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [13]:
def padding(x, y):
    padded = tokenizer.pad([{'input_ids': x_} for x_ in x], return_tensors = 'pt')
    sequence_length = padded['input_ids'].shape[1]
    labels = [l + [-100] * (sequence_length - len(l)) for l in y]
    labels = np.array(labels)
    padded['labels'] = torch.from_numpy(labels)
    for k in padded.keys():
        padded[k] = padded[k].cuda()
    return padded

In [14]:
# model(**padded)[0]

In [15]:
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)

In [16]:
len(train['input_ids'])

28056

In [17]:
i = 0
batch_size = 5
x = test['input_ids'][i: i + batch_size]
y = test['labels'][i: i + batch_size]
padded = padding(x, y)

loss, pred = model(**padded)

You're using a GPT2TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [18]:
import evaluate

seqeval = evaluate.load("seqeval")

In [19]:
batch_size = 16
epoch = 100

best_dev_acc = -np.inf
patient = 1
current_patient = 0

for e in range(epoch):
    pbar = tqdm(range(0, len(train['input_ids']), batch_size))
    losses = []
    for i in pbar:
        trainer.zero_grad()
        x = train['input_ids'][i: i + batch_size]
        y = train['labels'][i: i + batch_size]
        padded = padding(x, y)
            
        loss, pred = model(**padded)
        loss.backward()
        
        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)
        trainer.step()
        losses.append(float(loss))
        
    dev_predicted = []
    for i in range(0, len(test['input_ids']), batch_size):
        x = test['input_ids'][i: i + batch_size]
        y = test['labels'][i: i + batch_size]
        padded = padding(x, y)
        
        loss, pred = model(**padded)
        predictions = pred.detach().cpu().numpy().argmax(axis = 2).tolist()
        dev_predicted.extend(predictions)
    
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(dev_predicted, test['labels'])
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(dev_predicted, test['labels'])
    ]
    
    dev_predicted = seqeval.compute(predictions=true_predictions, references=true_labels)['overall_f1']
    
    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_predicted: {dev_predicted}')
    
    if dev_predicted >= best_dev_acc:
        best_dev_acc = dev_predicted
        current_patient = 0
        model.save_pretrained('small')
    else:
        current_patient += 1
    
    if current_patient >= patient:
        break

100%|███████████████████████████████████████| 1754/1754 [01:31<00:00, 19.10it/s]
  _warn_prf(average, modifier, msg_start, len(result))


epoch: 0, loss: 0.7352756115022787, dev_predicted: 0.25806451612903225


100%|███████████████████████████████████████| 1754/1754 [01:33<00:00, 18.72it/s]


epoch: 1, loss: 0.3179241424785373, dev_predicted: 0.5423064590829741


100%|███████████████████████████████████████| 1754/1754 [01:34<00:00, 18.58it/s]


epoch: 2, loss: 0.18332162160880888, dev_predicted: 0.6445146938297623


100%|███████████████████████████████████████| 1754/1754 [01:33<00:00, 18.78it/s]


epoch: 3, loss: 0.1272910212101829, dev_predicted: 0.7257369136323164


100%|███████████████████████████████████████| 1754/1754 [01:33<00:00, 18.79it/s]


epoch: 4, loss: 0.0907415469509455, dev_predicted: 0.7737432137065501


100%|███████████████████████████████████████| 1754/1754 [01:34<00:00, 18.63it/s]


epoch: 5, loss: 0.06553741172805616, dev_predicted: 0.8108947219870166


100%|███████████████████████████████████████| 1754/1754 [01:33<00:00, 18.80it/s]


epoch: 6, loss: 0.04739325542954416, dev_predicted: 0.8377075522228173


100%|███████████████████████████████████████| 1754/1754 [01:33<00:00, 18.79it/s]


epoch: 7, loss: 0.03475942272303186, dev_predicted: 0.8658748069842024


100%|███████████████████████████████████████| 1754/1754 [01:33<00:00, 18.67it/s]


epoch: 8, loss: 0.02588205385176378, dev_predicted: 0.8808677269014558


100%|███████████████████████████████████████| 1754/1754 [01:34<00:00, 18.57it/s]


epoch: 9, loss: 0.01961082221139456, dev_predicted: 0.8973195679872349


100%|███████████████████████████████████████| 1754/1754 [01:34<00:00, 18.59it/s]


epoch: 10, loss: 0.015691496136939702, dev_predicted: 0.8940330455277904


In [20]:
model_ = T5ForTokenClassification.from_pretrained('small')
_ = model_.cuda()

In [21]:
model_.push_to_hub('mesolitica/ner-nanot5-small-malaysian-cased', safe_serialization = True)

model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/ner-nanot5-small-malaysian-cased/commit/d430da2dd0a7f95ab2e95176a7d63e1aae594294', commit_message='Upload T5ForTokenClassification', commit_description='', oid='d430da2dd0a7f95ab2e95176a7d63e1aae594294', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
tokenizer.push_to_hub('mesolitica/ner-nanot5-small-malaysian-cased')

CommitInfo(commit_url='https://huggingface.co/mesolitica/ner-nanot5-small-malaysian-cased/commit/30e2bd1a73df9f1fcb33c1c25f780ca61702c026', commit_message='Upload tokenizer', commit_description='', oid='30e2bd1a73df9f1fcb33c1c25f780ca61702c026', pr_url=None, pr_revision=None, pr_num=None)

In [23]:
dev_predicted = []
for i in range(0, len(test['input_ids']), batch_size):
    x = test['input_ids'][i: i + batch_size]
    y = test['labels'][i: i + batch_size]
    padded = padding(x, y)

    loss, pred = model_(**padded)
    predictions = pred.detach().cpu().numpy().argmax(axis = 2).tolist()
    dev_predicted.extend(predictions)

true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(dev_predicted, test['labels'])
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(dev_predicted, test['labels'])
]

seqeval.compute(predictions=true_predictions, references=true_labels)

{'aw': {'precision': 0.7930258717660292,
  'recall': 0.9120310478654593,
  'f1': 0.8483754512635379,
  'number': 1546},
 'erson': {'precision': 0.9096938433710725,
  'recall': 0.9418088500485504,
  'f1': 0.9254728233089112,
  'number': 14418},
 'ime': {'precision': 0.9127617148554337,
  'recall': 0.9210261569416499,
  'f1': 0.9168753129694542,
  'number': 3976},
 'ocation': {'precision': 0.8999060640851686,
  'recall': 0.9325113562621674,
  'f1': 0.9159186275030541,
  'number': 9246},
 'rganization': {'precision': 0.8530937160776746,
  'recall': 0.8513480982185845,
  'f1': 0.8522200132538105,
  'number': 8308},
 'uantity': {'precision': 0.9197002141327623,
  'recall': 0.9439560439560439,
  'f1': 0.9316702819956616,
  'number': 2730},
 'vent': {'precision': 0.6219110378912686,
  'recall': 0.6565217391304348,
  'f1': 0.6387478849407783,
  'number': 1150},
 'overall_precision': 0.8843562962093651,
 'overall_recall': 0.9106685357954271,
 'overall_f1': 0.8973195679872349,
 'overall_accuracy