In [22]:
from tqdm import tqdm
import json
import random
import torch
import numpy as np
from transformers import AutoTokenizer, AutoConfig, AutoModel
from classifier import MistralForSequenceClassification
import os

In [23]:
data = []
texts, labels = [], []

labels_sentiment = ['racist', 'religion insult','psychiatric or mental illness', 'sexist','harassment','hate','porn','safe for work']


In [24]:
data = []

with open('malaysian-sfw-dataset.jsonl') as f:
    
    for x in f:
        data.append(json.loads(x))

In [25]:
random.shuffle(data)

In [26]:
for l in data:
    if l['label'] not in labels_sentiment:
        continue
    if len(l['text']) > 5:
        texts.append(l['text'])
        labels.append(labels_sentiment.index(l['label']))
        
len(texts)

209986

In [27]:
from sklearn.model_selection import train_test_split

train_X, test_X, train_Y, test_Y = train_test_split(
    texts, labels, test_size = 0.1
)

In [28]:
list(zip(texts[:10], labels[:10]))

[('Kaka simpati dgn Emilia..  Dia ni mangsa sexual abuse..  Kat thread kenamaan sana  ramai je yg mengata dia kak normah  kata dia kuat drama  berdendam lah  heran betul la  yg mengata tu perempuan kot',
  3),
 ('Aku suka baca confession kat sini yg mostly cerita tentang betapa rendah dirinya mereka kerana gemuk la, hitam la, juling la, macam-macam. Ada yang cerita walaupun fizikal kurang menarik tapi masih ada yang berkenan lantas jatuh cinta hingga ke jinjang pelamin. Ada jugak yang tercari2 erti cinta sejati dek kerana rasa dirinya tidak sesempurna yang lain. Ermm penting sangat ke rupa dan fizikal nih?',
  2),
 ('Bukan sekadar berak busuk, diorang geledah sampah, plastik sampah dekat dapur koyak-koyak, berlemoih woih dapur kitorang. Dengan tulang-tulang ayam diorang ratah sepah-sepah.',
  4),
 ('Kene jackpot siaaa  dapat mamat badan athletic plus big ', 6),
 ("Then he started giving me a b*****b. Takut, confused, semuanyalah. I didn't even c*m mungkin kerana terlalu terkejut. He on

In [29]:
np.unique(labels, return_counts = True)

(array([0, 1, 2, 3, 4, 5, 6, 7]),
 array([18673, 34037, 53324, 15647, 24716, 22924,   667, 39998]))

In [30]:
id2label = {i: label for i, label in enumerate(labels_sentiment)}
label2id = {label: i for i, label in enumerate(labels_sentiment)}

In [31]:
config = AutoConfig.from_pretrained('mesolitica/malaysian-mistral-191M-MLM-512')
config.num_labels = len(set(labels))
config.vocab = labels_sentiment



In [32]:
model = MistralForSequenceClassification.from_pretrained('mesolitica/malaysian-mistral-191M-MLM-512', config = config)
_ = model.cuda()

Some weights of MistralForSequenceClassification were not initialized from the model checkpoint at mesolitica/malaysian-mistral-191M-MLM-512 and are newly initialized: ['score.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [33]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/malaysian-mistral-191M-MLM-512')

In [34]:
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-4)

In [None]:
batch_size = 20
epoch = 100


best_dev_acc = -np.inf
patient = 3
current_patient = 0

for e in range(epoch):
    pbar = tqdm(range(0, len(train_X), batch_size))
    losses = []
    for i in pbar:
        trainer.zero_grad()
        x = train_X[i: i + batch_size]
        y = np.array(train_Y[i: i + batch_size])
        
        padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')

        padded['labels'] = torch.from_numpy(y)
        for k in padded.keys():
            padded[k] = padded[k].cuda()
        
        padded.pop('token_type_ids', None)

        loss, pred = model(**padded, return_dict = False)
        loss.backward()
        
        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 5.0)
        trainer.step()
        losses.append(float(loss))
        
    dev_predicted = []
    for i in range(0, len(test_X), batch_size):
        x = test_X[i: i + batch_size]
        y = np.array(test_Y[i: i + batch_size])
        padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')
        padded['labels'] = torch.from_numpy(y)
        for k in padded.keys():
            padded[k] = padded[k].cuda()
        
        padded.pop('token_type_ids', None)

        loss, pred = model(**padded, return_dict = False)
        dev_predicted.append((pred.argmax(axis = 1).detach().cpu().numpy() == y).mean())
        
    dev_predicted = np.mean(dev_predicted)
    
    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_predicted: {dev_predicted}')
    
    if dev_predicted >= best_dev_acc:
        best_dev_acc = dev_predicted
        current_patient = 0
        model.save_pretrained('porn-mistral-mlm')
    else:
        current_patient += 1
    
    if current_patient >= patient:
        break

100%|██████████| 9450/9450 [36:28<00:00,  4.32it/s] 


epoch: 0, loss: 0.6004894968402134, dev_predicted: 0.8203784461152882
[2024-05-31 12:55:25,516] [INFO] [real_accelerator.py:191:get_accelerator] Setting ds_accelerator to cuda (auto detect)


 61%|██████▏   | 5798/9450 [22:21<13:00,  4.68it/s] 

In [43]:
padded = tokenizer(texts[:10], padding = True, return_tensors = 'pt')
for k in padded.keys():
    padded[k] = padded[k].cuda()

In [44]:
batch_size = 20
epoch = 100

real_Y = []
for i in tqdm(range(0, len(test_X), batch_size)):
    x = test_X[i: i + batch_size]
    y = np.array(test_Y[i: i + batch_size])
    padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')
    padded['labels'] = torch.from_numpy(y)
    for k in padded.keys():
        padded[k] = padded[k].cuda()
    padded.pop('token_type_ids', None)

    loss, pred = model(**padded,return_dict=False)
    real_Y.extend(pred.argmax(axis = 1).detach().cpu().numpy().tolist())

100%|██████████| 4/4 [00:00<00:00, 19.16it/s]


In [45]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, test_Y, target_names = config.vocab,
        digits = 5
    )
)

              precision    recall  f1-score   support

    non porn    0.93939   0.93939   0.93939        33
        porn    0.95556   0.95556   0.95556        45

    accuracy                        0.94872        78
   macro avg    0.94747   0.94747   0.94747        78
weighted avg    0.94872   0.94872   0.94872        78



In [21]:
from transformers import pipeline

pipe = pipeline("text-classification",
                        tokenizer = tokenizer,
                        model=model)

2024-05-17 23:53:57.253255: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [22]:
labels_sentiment

['racist',
 'religion insult',
 'psychiatric or mental illness',
 'sexist',
 'harassment',
 'informative',
 'safe for work',
 'hate']

In [27]:
label2id = {'racist':0,'religion insult':1,'psychiatric or mental illness':2,
            'sexist':3,'harassment':4,'informative':5,'safe for work':6,'hate':7}

In [28]:
id2label = {0:'racist',1:'religion insult',2:'psychiatric or mental illness',
            3:'sexist',4:'harassment',5:'informative',6:'safe for work',7:'hate'}

In [29]:
model.config.label2id = label2id
model.config.id2label = id2label

In [47]:
pipe(['macam bodoh betul la engkau ni'])

[{'label': 'hate', 'score': 0.9171728491783142}]

In [48]:
pipe('melayu memang rasa tertindas padahal yahudi')

[{'label': 'racist', 'score': 0.9956732392311096}]

In [42]:
pipe('')

[{'label': 'safe for work', 'score': 0.5028830170631409}]

In [34]:
pipe('perempuan memang tak pandai macam lelaki')

[{'label': 'sexist', 'score': 0.9981914162635803}]

In [35]:
pipe('lelaki semua sampah')

[{'label': 'hate', 'score': 0.5682838559150696}]

In [36]:
pipe('hidup ni tak bermakna, takde sebab nak hidup lagi')

[{'label': 'hate', 'score': 0.49550363421440125}]

In [37]:
pipe('aku nampak buaya nak makan aku, hantu takda kepala cakap dgn aku, aku tak boleh tido')

[{'label': 'psychiatric or mental illness', 'score': 0.9957693815231323}]

In [38]:
pipe('perempuan gila tu nak suruh semua orang dgr cakap dia')

[{'label': 'harassment', 'score': 0.5153989195823669}]

In [46]:
pipe('saya nak bunuh diri, tak da makna hidup ni')

[{'label': 'psychiatric or mental illness', 'score': 0.9803110361099243}]

In [40]:
pipe('mak cik tu jual nasi lemak sedap')

[{'label': 'informative', 'score': 0.8735892176628113}]

In [64]:
pipe('amat penting untuk semua pelajar cakna akan isu semasa')

[{'label': 'informative', 'score': 0.7604817748069763}]

In [79]:
from huggingface_hub import create_repo
create_repo("malaysia-ai/malaysian-sfw-classifier", repo_type="model")

RepoUrl('https://huggingface.co/malaysia-ai/malaysian-sfw-classifier', endpoint='https://huggingface.co', repo_type='model', repo_id='malaysia-ai/malaysian-sfw-classifier')

In [43]:
model.push_to_hub('malaysia-ai/malaysian-sfw-classifier', safe_serialization = True)

README.md:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/665M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/malaysia-ai/malaysian-sfw-classifier/commit/4196e7859db5d357a2fa57200c205b47fe939a69', commit_message='Upload MistralForSequenceClassification', commit_description='', oid='4196e7859db5d357a2fa57200c205b47fe939a69', pr_url=None, pr_revision=None, pr_num=None)

In [44]:
tokenizer.push_to_hub('malaysia-ai/malaysian-sfw-classifier', safe_serialization = True)

README.md:   0%|          | 0.00/1.30k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/malaysia-ai/malaysian-sfw-classifier/commit/616f734b3ca3ea0e4f779acc961ad7349ccf08eb', commit_message='Upload tokenizer', commit_description='', oid='616f734b3ca3ea0e4f779acc961ad7349ccf08eb', pr_url=None, pr_revision=None, pr_num=None)