In [1]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/emotion/emotion-twitter-lexicon.json

In [2]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '1'

In [3]:
from tqdm import tqdm
import json
import random
import torch
import numpy as np
from transformers import AutoTokenizer, T5Config
from malaya.torch_model.t5 import T5ForSequenceClassification

2023-10-06 13:59:17.587988: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-10-06 13:59:17.670756: I tensorflow/core/util/port.cc:104] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-06 13:59:18.096194: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; dlerror: libnvinfer.so.7: cannot open shared object file: No such file or directory
2023-10-06 13:59:18.096252: W tensorflow/compiler/xla/stream_executor/platform/default/dso_loader.cc:64] Could not l

In [4]:
emotion_label = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise']

with open('emotion-twitter-lexicon.json') as fopen:
    emotion = json.load(fopen)
    
emotion.keys()

dict_keys(['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'])

In [5]:
texts, labels = [], []

for k, v in emotion.items():
    if len(v) > 30000:
        emotion[k] = random.sample(v, 30000)
    print(k, len(emotion[k]))
    texts.extend(emotion[k])
    labels.extend([emotion_label.index(k)] * len(emotion[k]))

anger 30000
fear 20316
happy 30000
love 20783
sadness 26468
surprise 13107


In [6]:
actual_t, actual_l = [], []

for i in tqdm(range(len(texts))):
    if len(texts[i]) > 2:
        actual_t.append(texts[i])
        actual_l.append(labels[i])

100%|██████████████████████████████| 140674/140674 [00:00<00:00, 3901356.95it/s]


In [7]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(
    actual_t, actual_l, test_size = 0.2
)

In [8]:
len(set(actual_l))

6

In [9]:
config = T5Config.from_pretrained('mesolitica/nanot5-tiny-malaysian-cased')
config.num_labels = len(set(actual_l))
config.vocab = list(emotion.keys())

In [10]:
model = T5ForSequenceClassification.from_pretrained('mesolitica/nanot5-tiny-malaysian-cased', config = config)
_ = model.cuda()

Downloading pytorch_model.bin:   0%|          | 0.00/205M [00:00<?, ?B/s]

Some weights of T5ForSequenceClassification were not initialized from the model checkpoint at mesolitica/nanot5-tiny-malaysian-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [11]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/nanot5-tiny-malaysian-cased')

Downloading (…)okenizer_config.json:   0%|          | 0.00/754 [00:00<?, ?B/s]

Downloading (…)olve/main/vocab.json:   0%|          | 0.00/517k [00:00<?, ?B/s]

Downloading (…)olve/main/merges.txt:   0%|          | 0.00/303k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/1.38M [00:00<?, ?B/s]

Downloading (…)in/added_tokens.json:   0%|          | 0.00/2.59k [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/15.1k [00:00<?, ?B/s]

Loading the tokenizer from the `special_tokens_map.json` and the `added_tokens.json` will be removed in `transformers 5`,  it is kept for forward compatibility, but it is recommended to update your `tokenizer_config.json` by uploading it again. You will see the new `added_tokens_decoder` attribute that will store the relevant information.
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [12]:
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)

In [13]:
batch_size = 16
epoch = 100

best_dev_acc = -np.inf
patient = 1
current_patient = 0

for e in range(epoch):
    pbar = tqdm(range(0, len(train_X), batch_size))
    losses = []
    for i in pbar:
        trainer.zero_grad()
        x = train_X[i: i + batch_size]
        y = np.array(train_Y[i: i + batch_size])
        
        padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')
        padded['labels'] = torch.from_numpy(y)
        for k in padded.keys():
            padded[k] = padded[k].cuda()
            
        loss, pred = model(**padded)
        loss.backward()
        
        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)
        trainer.step()
        losses.append(float(loss))
        
    dev_predicted = []
    for i in range(0, len(test_X), batch_size):
        x = test_X[i: i + batch_size]
        y = np.array(test_Y[i: i + batch_size])
        padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')
        padded['labels'] = torch.from_numpy(y)
        for k in padded.keys():
            padded[k] = padded[k].cuda()
        
        loss, pred = model(**padded)
        dev_predicted.append((pred.argmax(axis = 1).detach().cpu().numpy() == y).mean())
        
    dev_predicted = np.mean(dev_predicted)
    
    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_predicted: {dev_predicted}')
    
    if dev_predicted >= best_dev_acc:
        best_dev_acc = dev_predicted
        current_patient = 0
        model.save_pretrained('tiny')
    else:
        current_patient += 1
    
    if current_patient >= patient:
        break

100%|███████████████████████████████████████| 7034/7034 [04:11<00:00, 27.93it/s]


epoch: 0, loss: 0.4284086830315294, dev_predicted: 0.9211910176236499


100%|███████████████████████████████████████| 7034/7034 [04:07<00:00, 28.44it/s]


epoch: 1, loss: 0.2390966095176911, dev_predicted: 0.9408754974417283


100%|███████████████████████████████████████| 7034/7034 [04:07<00:00, 28.43it/s]


epoch: 2, loss: 0.2002426634256753, dev_predicted: 0.948230528709494


100%|███████████████████████████████████████| 7034/7034 [04:12<00:00, 27.88it/s]


epoch: 3, loss: 0.18230438527817894, dev_predicted: 0.9508953951108584


100%|███████████████████████████████████████| 7034/7034 [04:07<00:00, 28.43it/s]


epoch: 4, loss: 0.17600824633331674, dev_predicted: 0.951925810119386


100%|███████████████████████████████████████| 7034/7034 [04:07<00:00, 28.47it/s]


epoch: 5, loss: 0.1589466881112779, dev_predicted: 0.9555500284252416


100%|███████████████████████████████████████| 7034/7034 [04:11<00:00, 27.92it/s]


epoch: 6, loss: 0.15255507120910203, dev_predicted: 0.9616614553723707


100%|███████████████████████████████████████| 7034/7034 [04:06<00:00, 28.50it/s]


epoch: 7, loss: 0.14433976132944518, dev_predicted: 0.9613772029562251


In [14]:
model_ = T5ForSequenceClassification.from_pretrained('tiny')
_ = model_.cuda()

In [15]:
real_Y = []
for i in tqdm(range(0, len(test_X), batch_size)):
    x = test_X[i: i + batch_size]
    y = np.array(test_Y[i: i + batch_size])
    padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')
    padded['labels'] = torch.from_numpy(y)
    for k in padded.keys():
        padded[k] = padded[k].cuda()

    loss, pred = model(**padded)
    real_Y.extend(pred.argmax(axis = 1).detach().cpu().numpy().tolist())

100%|███████████████████████████████████████| 1759/1759 [00:24<00:00, 70.81it/s]


In [16]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, test_Y, target_names = ['anger', 'fear', 'happy', 'love', 'sadness', 'surprise'],
        digits = 5
    )
)

              precision    recall  f1-score   support

       anger    0.95502   0.92475   0.93964      6153
        fear    0.95528   0.95953   0.95740      4052
       happy    0.97881   0.98278   0.98079      5923
        love    0.93801   0.96025   0.94900      4176
     sadness    0.96716   0.97755   0.97233      5212
    surprise    0.97211   0.97136   0.97173      2619

    accuracy                        0.96136     28135
   macro avg    0.96107   0.96270   0.96182     28135
weighted avg    0.96138   0.96136   0.96129     28135



In [17]:
model_.push_to_hub('mesolitica/emotion-analysis-nanot5-tiny-malaysian-cased', safe_serialization = True)

model.safetensors:   0%|          | 0.00/93.0M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/emotion-analysis-nanot5-tiny-malaysian-cased/commit/01a4fcb9c6ed739596680c0d9079cc6ece2ca067', commit_message='Upload T5ForSequenceClassification', commit_description='', oid='01a4fcb9c6ed739596680c0d9079cc6ece2ca067', pr_url=None, pr_revision=None, pr_num=None)

In [18]:
tokenizer.push_to_hub('mesolitica/emotion-analysis-nanot5-tiny-malaysian-cased', safe_serialization = True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/emotion-analysis-nanot5-tiny-malaysian-cased/commit/34a5cb911777bc2bbb766f57f007bb9fd2473f40', commit_message='Upload tokenizer', commit_description='', oid='34a5cb911777bc2bbb766f57f007bb9fd2473f40', pr_url=None, pr_revision=None, pr_num=None)