In [1]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/emotion/emotion-twitter-lexicon.json

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
from tqdm import tqdm
import json
import random
import torch
import numpy as np
from transformers import AutoTokenizer, T5Config
from malaya.torch_model.t5 import T5ForSequenceClassification

2023-10-06 13:58:28.904459: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-06 13:58:28.983530: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-06 13:58:29.396327: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-10-06 13:58:29.396363: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [4]:
emotion_label = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise']

with open('emotion-twitter-lexicon.json') as fopen:
    emotion = json.load(fopen)
    
emotion.keys()

dict_keys(['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'])

In [5]:
texts, labels = [], []

for k, v in emotion.items():
    if len(v) > 30000:
        emotion[k] = random.sample(v, 30000)
    print(k, len(emotion[k]))
    texts.extend(emotion[k])
    labels.extend([emotion_label.index(k)] * len(emotion[k]))

anger 30000
fear 20316
happy 30000
love 20783
sadness 26468
surprise 13107


In [6]:
actual_t, actual_l = [], []

for i in tqdm(range(len(texts))):
    if len(texts[i]) > 2:
        actual_t.append(texts[i])
        actual_l.append(labels[i])

100%|██████████████████████████████| 140674/140674 [00:00<00:00, 4090779.71it/s]


In [7]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(
    actual_t, actual_l, test_size = 0.2
)

In [8]:
len(set(actual_l))

6

In [9]:
config = T5Config.from_pretrained('mesolitica/nanot5-small-malaysian-cased')
config.num_labels = len(set(actual_l))
config.vocab = list(emotion.keys())

Downloading (…)lve/main/config.json:   0%|          | 0.00/790 [00:00<?, ?B/s]

In [10]:
model = T5ForSequenceClassification.from_pretrained('mesolitica/nanot5-small-malaysian-cased', config = config)
_ = model.cuda()

Downloading pytorch_model.bin:   0%|          | 0.00/358M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at mesolitica/nanot5-small-malaysian-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/nanot5-base-malaysian-cased')

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)

In [13]:
batch_size = 16
epoch = 100

best_dev_acc = -np.inf
patient = 1
current_patient = 0

for e in range(epoch):
    pbar = tqdm(range(0, len(train_X), batch_size))
    losses = []
    for i in pbar:
        trainer.zero_grad()
        x = train_X[i: i + batch_size]
        y = np.array(train_Y[i: i + batch_size])
        
        padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')
        padded['labels'] = torch.from_numpy(y)
        for k in padded.keys():
            padded[k] = padded[k].cuda()
            
        loss, pred = model(**padded)
        loss.backward()
        
        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)
        trainer.step()
        losses.append(float(loss))
        
    dev_predicted = []
    for i in range(0, len(test_X), batch_size):
        x = test_X[i: i + batch_size]
        y = np.array(test_Y[i: i + batch_size])
        padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')
        padded['labels'] = torch.from_numpy(y)
        for k in padded.keys():
            padded[k] = padded[k].cuda()
        
        loss, pred = model(**padded)
        dev_predicted.append((pred.argmax(axis = 1).detach().cpu().numpy() == y).mean())
        
    dev_predicted = np.mean(dev_predicted)
    
    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_predicted: {dev_predicted}')
    
    if dev_predicted >= best_dev_acc:
        best_dev_acc = dev_predicted
        current_patient = 0
        model.save_pretrained('small')
    else:
        current_patient += 1
    
    if current_patient >= patient:
        break

100%|███████████████████████████████████████| 7034/7034 [05:40<00:00, 20.68it/s]


epoch: 0, loss: 0.5093705902411132, dev_predicted: 0.9211199545196134


100%|███████████████████████████████████████| 7034/7034 [06:18<00:00, 18.60it/s]


epoch: 1, loss: 0.2539201259537832, dev_predicted: 0.9413018760659465


100%|███████████████████████████████████████| 7034/7034 [06:13<00:00, 18.82it/s]


epoch: 2, loss: 0.2022606245867862, dev_predicted: 0.9502202956225128


100%|███████████████████████████████████████| 7034/7034 [06:18<00:00, 18.60it/s]


epoch: 3, loss: 0.17207835702439594, dev_predicted: 0.9575042637862422


100%|███████████████████████████████████████| 7034/7034 [06:13<00:00, 18.83it/s]


epoch: 4, loss: 0.14899513813516257, dev_predicted: 0.9607021034678794


100%|███████████████████████████████████████| 7034/7034 [05:16<00:00, 22.23it/s]


epoch: 5, loss: 0.13533089537048257, dev_predicted: 0.9631893121091529


100%|███████████████████████████████████████| 7034/7034 [03:27<00:00, 33.84it/s]


epoch: 6, loss: 0.12416590529366782, dev_predicted: 0.9643973848777715


100%|███████████████████████████████████████| 7034/7034 [03:27<00:00, 33.97it/s]


epoch: 7, loss: 0.11610092148560636, dev_predicted: 0.9678794769755543


100%|███████████████████████████████████████| 7034/7034 [03:26<00:00, 34.04it/s]


epoch: 8, loss: 0.10486150778757097, dev_predicted: 0.9689098919840818


100%|███████████████████████████████████████| 7034/7034 [03:26<00:00, 34.09it/s]


epoch: 9, loss: 0.1024017584425029, dev_predicted: 0.969148460976204


100%|███████████████████████████████████████| 7034/7034 [03:25<00:00, 34.18it/s]


epoch: 10, loss: 0.09778238821261064, dev_predicted: 0.9703311540648095


100%|███████████████████████████████████████| 7034/7034 [03:26<00:00, 34.13it/s]


epoch: 11, loss: 0.09108099877250556, dev_predicted: 0.969300739056282


In [14]:
model_ = T5ForSequenceClassification.from_pretrained('small')
_ = model_.cuda()

In [15]:
real_Y = []
for i in tqdm(range(0, len(test_X), batch_size)):
    x = test_X[i: i + batch_size]
    y = np.array(test_Y[i: i + batch_size])
    padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')
    padded['labels'] = torch.from_numpy(y)
    for k in padded.keys():
        padded[k] = padded[k].cuda()

    loss, pred = model(**padded)
    real_Y.extend(pred.argmax(axis = 1).detach().cpu().numpy().tolist())

100%|███████████████████████████████████████| 1759/1759 [00:17<00:00, 98.97it/s]


In [16]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, test_Y, target_names = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'],
        digits = 5
    )
)

              precision    recall  f1-score   support

       anger    0.95647   0.95266   0.95456      6020
        fear    0.95685   0.97154   0.96414      3971
       happy    0.99236   0.98256   0.98744      6079
        love    0.94842   0.96704   0.95764      4126
     sadness    0.98210   0.96899   0.97550      5321
    surprise    0.97263   0.97746   0.97504      2618

    accuracy                        0.96929     28135
   macro avg    0.96814   0.97004   0.96905     28135
weighted avg    0.96945   0.96929   0.96933     28135



In [17]:
model_.push_to_hub('mesolitica/emotion-analysis-nanot5-small-malaysian-cased', safe_serialization = True)

model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/emotion-analysis-nanot5-small-malaysian-cased/commit/5b7e1a6e54ea155127a333c8978684d7bf551f21', commit_message='Upload T5ForSequenceClassification', commit_description='', oid='5b7e1a6e54ea155127a333c8978684d7bf551f21', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
tokenizer.push_to_hub('mesolitica/emotion-analysis-nanot5-small-malaysian-cased', safe_serialization = True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/emotion-analysis-nanot5-small-malaysian-cased/commit/63d2ab0734afbef58b902c0d75a7d9d1ea49b090', commit_message='Upload tokenizer', commit_description='', oid='63d2ab0734afbef58b902c0d75a7d9d1ea49b090', pr_url=None, pr_revision=None, pr_num=None)