In [1]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/emotion/emotion-twitter-lexicon.json

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
from tqdm import tqdm
import json
import random
import torch
import numpy as np
from transformers import AutoTokenizer, T5Config
from malaya.torch_model.t5 import T5ForSequenceClassification

2023-09-23 17:55:29.829841: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-09-23 17:55:29.903210: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-23 17:55:30.344630: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-09-23 17:55:30.344666: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [4]:
emotion_label = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise']

with open('emotion-twitter-lexicon.json') as fopen:
    emotion = json.load(fopen)
    
emotion.keys()

dict_keys(['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'])

In [5]:
texts, labels = [], []

for k, v in emotion.items():
    if len(v) > 30000:
        emotion[k] = random.sample(v, 30000)
    print(k, len(emotion[k]))
    texts.extend(emotion[k])
    labels.extend([emotion_label.index(k)] * len(emotion[k]))

anger 30000
fear 20316
happy 30000
love 20783
sadness 26468
surprise 13107


In [6]:
actual_t, actual_l = [], []

for i in tqdm(range(len(texts))):
    if len(texts[i]) > 2:
        actual_t.append(texts[i])
        actual_l.append(labels[i])

100%|██████████████████████████████| 140674/140674 [00:00<00:00, 3918379.07it/s]


In [7]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(
    actual_t, actual_l, test_size = 0.2
)

In [8]:
len(set(actual_l))

6

In [9]:
config = T5Config.from_pretrained('mesolitica/nanot5-small-malaysian-cased')
config.num_labels = len(set(actual_l))
config.vocab = list(emotion.keys())

In [10]:
model = T5ForSequenceClassification.from_pretrained('mesolitica/nanot5-small-malaysian-cased', config = config)
_ = model.cuda()

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at mesolitica/nanot5-small-malaysian-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/nanot5-base-malaysian-cased')

In [12]:
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)

In [13]:
batch_size = 16
epoch = 100

best_dev_acc = -np.inf
patient = 1
current_patient = 0

for e in range(epoch):
    pbar = tqdm(range(0, len(train_X), batch_size))
    losses = []
    for i in pbar:
        trainer.zero_grad()
        x = train_X[i: i + batch_size]
        y = np.array(train_Y[i: i + batch_size])
        
        padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')
        padded['labels'] = torch.from_numpy(y)
        for k in padded.keys():
            padded[k] = padded[k].cuda()
            
        loss, pred = model(**padded)
        loss.backward()
        
        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)
        trainer.step()
        losses.append(float(loss))
        
    dev_predicted = []
    for i in range(0, len(test_X), batch_size):
        x = test_X[i: i + batch_size]
        y = np.array(test_Y[i: i + batch_size])
        padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')
        padded['labels'] = torch.from_numpy(y)
        for k in padded.keys():
            padded[k] = padded[k].cuda()
        
        loss, pred = model(**padded)
        dev_predicted.append((pred.argmax(axis = 1).detach().cpu().numpy() == y).mean())
        
    dev_predicted = np.mean(dev_predicted)
    
    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_predicted: {dev_predicted}')
    
    if dev_predicted >= best_dev_acc:
        best_dev_acc = dev_predicted
        current_patient = 0
        model.save_pretrained('small')
    else:
        current_patient += 1
    
    if current_patient >= patient:
        break

100%|███████████████████████████████████████| 7034/7034 [03:23<00:00, 34.58it/s]


epoch: 0, loss: 0.5507507522614026, dev_predicted: 0.9220336230000812


100%|███████████████████████████████████████| 7034/7034 [03:24<00:00, 34.40it/s]


epoch: 1, loss: 0.24752154107643692, dev_predicted: 0.9440631852513603


100%|███████████████████████████████████████| 7034/7034 [03:24<00:00, 34.34it/s]


epoch: 2, loss: 0.1818695076146491, dev_predicted: 0.9593519044911881


100%|███████████████████████████████████████| 7034/7034 [03:24<00:00, 34.36it/s]


epoch: 3, loss: 0.15269527590441295, dev_predicted: 0.961910176236498


100%|███████████████████████████████████████| 7034/7034 [03:24<00:00, 34.38it/s]


epoch: 4, loss: 0.13440006111585595, dev_predicted: 0.9654633314383172


100%|███████████████████████████████████████| 7034/7034 [03:24<00:00, 34.37it/s]


epoch: 5, loss: 0.12517107687389384, dev_predicted: 0.969300739056282


100%|███████████████████████████████████████| 7034/7034 [03:24<00:00, 34.39it/s]


epoch: 6, loss: 0.11140553841086107, dev_predicted: 0.9694073337123366


100%|███████████████████████████████████████| 7034/7034 [03:24<00:00, 34.46it/s]


epoch: 7, loss: 0.10560762867888458, dev_predicted: 0.9736000568504832


100%|███████████████████████████████████████| 7034/7034 [03:22<00:00, 34.65it/s]


epoch: 8, loss: 0.09756111322966705, dev_predicted: 0.9733513359863559


In [17]:
model_ = T5ForSequenceClassification.from_pretrained('small')
_ = model_.cuda()

In [18]:
real_Y = []
for i in tqdm(range(0, len(test_X), batch_size)):
    x = test_X[i: i + batch_size]
    y = np.array(test_Y[i: i + batch_size])
    padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')
    padded['labels'] = torch.from_numpy(y)
    for k in padded.keys():
        padded[k] = padded[k].cuda()

    loss, pred = model(**padded)
    real_Y.extend(pred.argmax(axis = 1).detach().cpu().numpy().tolist())

100%|███████████████████████████████████████| 1759/1759 [00:19<00:00, 91.04it/s]


In [19]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, test_Y, target_names = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'],
        digits = 5
    )
)

              precision    recall  f1-score   support

       anger    0.96169   0.95504   0.95835      6072
        fear    0.97648   0.96479   0.97060      4260
       happy    0.99213   0.98992   0.99102      5855
        love    0.96007   0.96789   0.96396      4173
     sadness    0.97005   0.98375   0.97685      5169
    surprise    0.98119   0.98081   0.98100      2606

    accuracy                        0.97334     28135
   macro avg    0.97360   0.97370   0.97363     28135
weighted avg    0.97336   0.97334   0.97333     28135



In [20]:
model_.push_to_hub('mesolitica/emotion-analysis-nanot5-small-malaysian-cased', safe_serialization = True)

model.safetensors:   0%|          | 0.00/167M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/emotion-analysis-nanot5-small-malaysian-cased/commit/e2352f0814d924928bca728cae43d1dc5f6b05b7', commit_message='Upload T5ForSequenceClassification', commit_description='', oid='e2352f0814d924928bca728cae43d1dc5f6b05b7', pr_url=None, pr_revision=None, pr_num=None)

In [21]:
tokenizer.push_to_hub('mesolitica/emotion-analysis-nanot5-small-malaysian-cased', safe_serialization = True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/emotion-analysis-nanot5-small-malaysian-cased/commit/0b1c8911dce8bca325c2caf153b9e4d238012b22', commit_message='Upload tokenizer', commit_description='', oid='0b1c8911dce8bca325c2caf153b9e4d238012b22', pr_url=None, pr_revision=None, pr_num=None)