In [1]:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = '0'

In [2]:
from tqdm import tqdm
import json
import random
import torch
import numpy as np
from transformers import AutoTokenizer, AutoConfig, DebertaV2ForSequenceClassification

In [3]:
config = AutoConfig.from_pretrained('mesolitica/malaysian-debertav2-base')

In [4]:
config.problem_type = "single_label_classification"
config.label2id = {'contradiction': 0, 'entailment': 1}

In [5]:
model = DebertaV2ForSequenceClassification.from_pretrained('mesolitica/malaysian-debertav2-base', config = config)

Some weights of DebertaV2ForSequenceClassification were not initialized from the model checkpoint at mesolitica/malaysian-debertav2-base and are newly initialized: ['classifier.bias', 'classifier.weight', 'pooler.dense.bias', 'pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
_ = model.cuda()

In [7]:
tokenizer = AutoTokenizer.from_pretrained('mesolitica/malaysian-debertav2-base')

In [8]:
trainable_parameters = [param for param in model.parameters() if param.requires_grad]
trainer = torch.optim.AdamW(trainable_parameters, lr = 2e-5)

In [9]:
train_X, train_Y = [], []
with open('shuffled-train.json') as fopen:
    for l in fopen:
        l = json.loads(l)
        train_X.append(l['src'])
        train_Y.append(l['label'])

In [10]:
test_X, test_Y = [], []
with open('shuffled-test.json') as fopen:
    for l in fopen:
        l = json.loads(l)
        test_X.append(l['src'])
        test_Y.append(l['label'])
        
len(test_X)

16037

In [11]:
batch_size = 8
epoch = 100

In [12]:
best_dev_acc = -np.inf
patient = 1
current_patient = 0

for e in range(epoch):
    pbar = tqdm(range(0, len(train_X), batch_size))
    losses = []
    for i in pbar:
        trainer.zero_grad()
        x = train_X[i: i + batch_size]
        y = np.array(train_Y[i: i + batch_size])
        
        padded = tokenizer(x, truncation = True, padding = True, return_tensors = 'pt', max_length = 1024)
        padded['labels'] = torch.from_numpy(y)
        for k in padded.keys():
            padded[k] = padded[k].cuda()
        
        padded.pop('token_type_ids', None)
            
        loss, pred = model(**padded, return_dict = False)
        loss.backward()
        
        grad_norm = torch.nn.utils.clip_grad_norm_(trainable_parameters, 1.0)
        trainer.step()
        losses.append(float(loss))
        pbar.set_postfix(loss = float(loss))
        
        
    dev_predicted = []
    for i in range(0, len(test_X[:10000]), batch_size):
        x = test_X[i: i + batch_size]
        y = np.array(test_Y[i: i + batch_size])
        padded = tokenizer(x, truncation = True, padding = True, return_tensors = 'pt', max_length = 1024)
        padded['labels'] = torch.from_numpy(y)
        for k in padded.keys():
            padded[k] = padded[k].cuda()
            
        padded.pop('token_type_ids', None)
        
        loss, pred = model(**padded, return_dict = False)
        dev_predicted.append((pred.argmax(axis = 1).detach().cpu().numpy() == y).mean())
        
    dev_predicted = np.mean(dev_predicted)
    
    print(f'epoch: {e}, loss: {np.mean(losses)}, dev_predicted: {dev_predicted}')
    
    if dev_predicted >= best_dev_acc:
        best_dev_acc = dev_predicted
        current_patient = 0
        model.save_pretrained('small')
    else:
        current_patient += 1
    
    if current_patient >= patient:
        break

100%|██████████████████████████████████████████████████████████| 136696/136696 [3:39:06<00:00, 10.40it/s, loss=0.0108]


epoch: 0, loss: 0.3771513450757229, dev_predicted: 0.8642


100%|██████████████████████████████████████████████████████████| 136696/136696 [3:39:28<00:00, 10.38it/s, loss=0.0225]


epoch: 1, loss: 0.32527882952862147, dev_predicted: 0.8572


In [13]:
real_Y = []
for i in tqdm(range(0, len(test_X), batch_size)):
    x = test_X[i: i + batch_size]
    y = np.array(test_Y[i: i + batch_size])
    padded = tokenizer(x, padding = 'longest', return_tensors = 'pt')
    padded['labels'] = torch.from_numpy(y)
    for k in padded.keys():
        padded[k] = padded[k].cuda()

    loss, pred = model(**padded, return_dict = False)
    real_Y.extend(pred.argmax(axis = 1).detach().cpu().numpy().tolist())

100%|█████████████████████████████████████████████████████████████████████████████| 2005/2005 [00:28<00:00, 70.29it/s]


In [14]:
from sklearn import metrics

print(
    metrics.classification_report(
        real_Y, test_Y,
        digits = 5
    )
)

              precision    recall  f1-score   support

           0    0.88042   0.83329   0.85621      8146
           1    0.83692   0.88316   0.85942      7891

    accuracy                        0.85783     16037
   macro avg    0.85867   0.85823   0.85781     16037
weighted avg    0.85901   0.85783   0.85779     16037



In [16]:
padded

{'input_ids': tensor([[ 3657,    20,    29,  1541,   863,   482,  6558,  1523, 11768,    15,
          7705,  1816,    21,    15,   704,  5359,   436, 15246,  3534, 13870,
         28305,   365, 15559,    17,  3657,    21,    29, 15577, 19512,   418,
          5079,  1267, 13904,  6332,   331, 20385,  8406,  8948,    17,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
        [ 3657,    20,    29,  8080, 16965,    17,  3657,    21,    29, 23562,
           954,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
             0],
    

In [17]:
tokenizer.model_input_names = ['input_ids', 'attention_mask']

In [18]:
tokenizer.push_to_hub('mesolitica/finetune-mnli-malaysian-debertav2-base', safe_serialization = True)

CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-mnli-malaysian-debertav2-base/commit/12af7951dbbadfc331b35009457e0c4aca40bb75', commit_message='Upload tokenizer', commit_description='', oid='12af7951dbbadfc331b35009457e0c4aca40bb75', pr_url=None, pr_revision=None, pr_num=None)

In [19]:
model.push_to_hub('mesolitica/finetune-mnli-malaysian-debertav2-base', safe_serialization = True)

README.md:   0%|          | 0.00/5.17k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/443M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/mesolitica/finetune-mnli-malaysian-debertav2-base/commit/4378e43229b01a9bd8de16fa672b0b6a4fae2d42', commit_message='Upload DebertaV2ForSequenceClassification', commit_description='', oid='4378e43229b01a9bd8de16fa672b0b6a4fae2d42', pr_url=None, pr_revision=None, pr_num=None)