In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [2]:
from tqdm import tqdm
import json
import random
import torch
import numpy as np
import pandas as pd
from transformers import AutoTokenizer, T5Config
from malaya.torch_model.t5 import T5ForTokenClassification

2023-10-06 15:28:04.414909: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-06 15:28:04.495384: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-06 15:28:04.972310: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-10-06 15:28:04.972343: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [3]:
labels_tag = {
    'OTHER': 0,
    'law': 1,
    'location': 2,
    'organization': 3,
    'person': 4,
    'quantity': 5,
    'time': 6,
    'event': 7,
}
label_list = list(labels_tag.keys())
label_list

['OTHER',
 'law',
 'location',
 'organization',
 'person',
 'quantity',
 'time',
 'event']

In [4]:
config = T5Config.from_pretrained('mesolitica/translation-t5-small-standard-bahasa-cased')
config.num_labels = len(labels_tag)
config.vocab = labels_tag
config.rev_vocab = {v: k for v, k in labels_tag.items()}

Downloading (…)lve/main/config.json:   0%|          | 0.00/819 [00:00<?, ?B/s]

In [5]:
model = T5ForTokenClassification.from_pretrained('mesolitica/translation-t5-small-standard-bahasa-cased', config = config)
_ = model.cuda()

Downloading pytorch_model.bin:   0%|          | 0.00/242M [00:00<?, ?B/s]

Some weights of T5ForTokenClassification were not initialized from the model checkpoint at mesolitica/translation-t5-small-standard-bahasa-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/translation-t5-small-standard-bahasa-cased', add_prefix_space = True)

Downloading (…)okenizer_config.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/803k [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/47.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/515 [00:00<?, ?B/s]

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. If you see this, DO NOT PANIC! This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
Special tokens have been added in the vocabular

In [7]:
with open('prepared.json') as fopen:
    data = json.load(fopen)

In [8]:
train = []
for i in range(len(data['train_X'])):
    if len(data['train_X'][i]) != len(data['train_Y'][i]):
        continue
        
    train.append({
        'tokens': data['train_X'][i],
        'ner_tags': [labels_tag[t] for t in data['train_Y'][i]]
    })
    
    train.append({
        'tokens': [t.lower() for t in data['train_X'][i]],
        'ner_tags': [labels_tag[t] for t in data['train_Y'][i]]
    })
    
random.shuffle(train)
train = pd.DataFrame(train).to_dict(orient = 'list')
len(train['tokens'])

28056

In [9]:
test = []
for i in range(len(data['test_X'])):
    test.append({
        'tokens': data['test_X'][i],
        'ner_tags': [labels_tag[t] for t in data['test_Y'][i]]
    })
    
    test.append({
        'tokens': [t.lower() for t in data['test_X'][i]],
        'ner_tags': [labels_tag[t] for t in data['test_Y'][i]]
    })
    
random.shuffle(test)
test = pd.DataFrame(test).to_dict(orient = 'list')
len(test['tokens'])

6260

In [10]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [11]:
train = tokenize_and_align_labels(train)
test = tokenize_and_align_labels(test)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
train.keys()

dict_keys(['input_ids', 'attention_mask', 'labels'])

In [13]:
def padding(x, y):
    padded = tokenizer.pad([{'input_ids': x_} for x_ in x], return_tensors = 'pt')
    sequence_length = padded['input_ids'].shape[1]
    labels = [l + [-100] * (sequence_length - len(l)) for l in y]
    labels = np.array(labels)
    padded['labels'] = torch.from_numpy(labels)
    for k in padded.keys():
        padded[k] = padded[k].cuda()
    return padded

In [14]:
# model(**padded)[0]

In [15]:
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)

In [16]:
len(train['input_ids'])

28056

In [17]:
i = 0
batch_size = 5
x = test['input_ids'][i: i + batch_size]
y = test['labels'][i: i + batch_size]
padded = padding(x, y)

loss, pred = model(**padded)

You're using a T5TokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


In [18]:
import evaluate

seqeval = evaluate.load("seqeval")

Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [19]:
batch_size = 16
epoch = 100

best_dev_acc = -np.inf
patient = 1
current_patient = 0

for e in range(epoch):
    pbar = tqdm(range(0, len(train['input_ids']), batch_size))
    losses = []
    for i in pbar:
        trainer.zero_grad()
        x = train['input_ids'][i: i + batch_size]
        y = train['labels'][i: i + batch_size]
        padded = padding(x, y)
            
        loss, pred = model(**padded)
        loss.backward()
        
        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)
        trainer.step()
        losses.append(float(loss))
        
    dev_predicted = []
    for i in range(0, len(test['input_ids']), batch_size):
        x = test['input_ids'][i: i + batch_size]
        y = test['labels'][i: i + batch_size]
        padded = padding(x, y)
        
        loss, pred = model(**padded)
        predictions = pred.detach().cpu().numpy().argmax(axis = 2).tolist()
        dev_predicted.extend(predictions)
    
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(dev_predicted, test['labels'])
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(dev_predicted, test['labels'])
    ]
    
    dev_predicted = seqeval.compute(predictions=true_predictions, references=true_labels)['overall_f1']
    
    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_predicted: {dev_predicted}')
    
    if dev_predicted >= best_dev_acc:
        best_dev_acc = dev_predicted
        current_patient = 0
        model.save_pretrained('small')
    else:
        current_patient += 1
    
    if current_patient >= patient:
        break

100%|███████████████████████████████████████| 1754/1754 [01:01<00:00, 28.35it/s]


epoch: 0, loss: 0.1218618100518074, dev_predicted: 0.9376437332311232


100%|███████████████████████████████████████| 1754/1754 [01:02<00:00, 28.03it/s]


epoch: 1, loss: 0.006563813444881402, dev_predicted: 0.9584140312045135


100%|███████████████████████████████████████| 1754/1754 [01:03<00:00, 27.70it/s]


epoch: 2, loss: 0.0033261581134998736, dev_predicted: 0.9616726421130144


100%|███████████████████████████████████████| 1754/1754 [01:03<00:00, 27.52it/s]


epoch: 3, loss: 0.0022853419720699655, dev_predicted: 0.9660306446949986


100%|███████████████████████████████████████| 1754/1754 [01:04<00:00, 27.37it/s]


epoch: 4, loss: 0.002028393760076843, dev_predicted: 0.9640563434589293


In [20]:
model_ = T5ForTokenClassification.from_pretrained('small')
_ = model_.cuda()

In [21]:
model_.push_to_hub('mesolitica/ner-t5-small-standard-bahasa-cased', safe_serialization = True)

model.safetensors:   0%|          | 0.00/141M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/ner-t5-small-standard-bahasa-cased/commit/a67dee31efe7aa0a02842649c9a37afd6cb020cd', commit_message='Upload T5ForTokenClassification', commit_description='', oid='a67dee31efe7aa0a02842649c9a37afd6cb020cd', pr_url=None, pr_revision=None, pr_num=None)

In [22]:
tokenizer.push_to_hub('mesolitica/ner-t5-small-standard-bahasa-cased')

spiece.model:   0%|          | 0.00/803k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/ner-t5-small-standard-bahasa-cased/commit/d37635d8064a572063c1967be8119deeac9806f3', commit_message='Upload tokenizer', commit_description='', oid='d37635d8064a572063c1967be8119deeac9806f3', pr_url=None, pr_revision=None, pr_num=None)

In [24]:
dev_predicted = []
for i in range(0, len(test['input_ids']), batch_size):
    x = test['input_ids'][i: i + batch_size]
    y = test['labels'][i: i + batch_size]
    padded = padding(x, y)

    loss, pred = model_(**padded)
    predictions = pred.detach().cpu().numpy().argmax(axis = 2).tolist()
    dev_predicted.extend(predictions)

true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(dev_predicted, test['labels'])
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(dev_predicted, test['labels'])
]

seqeval.compute(predictions=true_predictions, references=true_labels)

{'aw': {'precision': 0.9320327249842668,
  'recall': 0.9579560155239327,
  'f1': 0.9448165869218501,
  'number': 1546},
 'erson': {'precision': 0.9745341614906833,
  'recall': 0.9794007490636704,
  'f1': 0.976961394769614,
  'number': 14418},
 'ime': {'precision': 0.9583539910758553,
  'recall': 0.9723340040241448,
  'f1': 0.9652933832709114,
  'number': 3976},
 'ocation': {'precision': 0.9709677419354839,
  'recall': 0.9766385463984426,
  'f1': 0.9737948883856357,
  'number': 9246},
 'rganization': {'precision': 0.9493625210488333,
  'recall': 0.9500481463649495,
  'f1': 0.9497052099627,
  'number': 8308},
 'uantity': {'precision': 0.9823008849557522,
  'recall': 0.9758241758241758,
  'f1': 0.9790518191841234,
  'number': 2730},
 'vent': {'precision': 0.8669991687448046,
  'recall': 0.9069565217391304,
  'f1': 0.88652783680408,
  'number': 1150},
 'overall_precision': 0.9629220498535133,
 'overall_recall': 0.9691593754531832,
 'overall_f1': 0.9660306446949986,
 'overall_accuracy': 0.9