In [5]:
import json
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset, random_split
from torch import Generator

In [6]:
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

In [7]:
from os.path import expanduser
home = expanduser("~")
path = home + "/vlp/dat/nli/XNLI-1.0/vi.tok.jsonl"
print(path)

/Users/phuonglh/vlp/dat/nli/XNLI-1.0/vi.tok.jsonl


In [8]:
class VietnameseXNLI(Dataset):
    def __init__(self, jsonlPath):
        self.X = []
        self.y = []
        with open(jsonlPath) as f:
            for line in f:
                sample = json.loads(line)
                self.X.append(sample["sentence1_tokenized"] +
                              " </s> " + sample["sentence2_tokenized"])
                self.y.append(sample["gold_label"])

    def __len__(self):
        return len(self.y)

    def __getitem__(self, i):
        return self.X[i], self.y[i]


In [9]:
dataset = VietnameseXNLI(path)
N = int(0.8*len(dataset))
training, test = random_split(dataset, [N, len(dataset)-N], generator=Generator().manual_seed(12345))

In [10]:
batch_size=32

In [11]:
# for testing only
train_loader = DataLoader(training, batch_size=batch_size)
test_loader = DataLoader(test, batch_size=batch_size)

In [12]:
next(iter(train_loader))

[('Xét về sự phù_hợp , hình_ảnh , đột_biến , tái tổ_hợp và chọn_lọc có_thể cùng đưa các quần_thể phát_triển đạt đến mức phù_hợp cao nhất . </s> Nếu khỏe_mạnh hơn , dân_số có nhiều khả_năng sống_sót hơn .',
  'Và cô ấy ngồi xuống , và , bạn biết đấy , họ vẫn đang trò_chuyện , và họ vẫn có_thể nhìn thấy người này , và người này đang đi rất nhanh . </s> Cô ấy liên_tục nói_chuyện .',
  'Chúng_tôi có_thể nhắm mục_tiêu chính_xác hơn và đạt được mục_tiêu thường_xuyên hơn . </s> chúng_tôi sẽ cố_gắng đạt được mục_tiêu gây quỹ hàng năm .',
  'Đây không phải là để nói rằng kiến \u200b_\u200b trúc tốt chỉ đơn_thuần là tiện_dụng . </s> Kiến_trúc tốt phải bao_gồm vẻ đẹp thẩm_mỹ và chức_năng có tính kinh_tế .',
  'Anh_ấy thể đúng , và anh_ấy có_thể sai . </s> Anh_ta có_thể đúng , và anh_ta có_thể sai .',
  "' Một ví_dụ khác đến từ VIP ' Vasoactive_Intestinal_Poly - peptide </s> VIP là ví_dụ tốt nhất .",
  'Việc đóng_băng tài_sản trên toàn thế_giới chưa được thực_thi đầy_đủ và đã dễ_dàng bị phá vỡ , t

In [13]:
from transformers import AutoModel, AutoTokenizer

In [14]:
bert = AutoModel.from_pretrained("vinai/phobert-base")
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.dense.bias', 'lm_head.dense.weight', 'lm_head.decoder.bias', 'lm_head.layer_norm.weight', 'lm_head.bias', 'lm_head.layer_norm.bias', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [15]:
tokenizer

PreTrainedTokenizer(name_or_path='vinai/phobert-base', vocab_size=64000, model_max_len=256, is_fast=False, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'})

In [16]:
bert

RobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(64001, 768, padding_idx=1)
    (position_embeddings): Embedding(258, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): RobertaEncoder(
    (layer): ModuleList(
      (0): RobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
            (dropout): Drop

In [17]:
label_dict = {"entailment": 0, "neutral": 1, "contradiction": 2}

In [18]:
import transformers
from transformers import (
    BertPreTrainedModel,
    RobertaConfig
)
from transformers.models.roberta.modeling_roberta import (
    RobertaClassificationHead,
    RobertaConfig,
    RobertaModel,
)
import torchtext
from torch.nn import CrossEntropyLoss


In [19]:
class RobertaPhoBERTClassifier(BertPreTrainedModel):
    def __init__(self, config):
        super(RobertaPhoBERTClassifier, self).__init__(config)
        self.num_labels = 3
        self.model = RobertaModel(config)
        self.classifier = RobertaClassificationHead(config)

    def forward(self, input_ids, attention_mask, labels):
        outputs = self.model(input_ids, attention_mask=attention_mask)
        sequence_output = outputs[0]
        logits = self.classifier(sequence_output)

        outputs = (logits,) + outputs[2:]
        loss_fct = CrossEntropyLoss()
        loss = loss_fct(logits.view(-1, self.num_labels), labels.view(-1))
        outputs = (loss,) + outputs
        return outputs  # (loss,) logits, (hidden_states, (attentions))


In [20]:
# https://pchanda.github.io/Roberta-FineTuning-for-Classification/
config_class = RobertaConfig
model_class = RobertaPhoBERTClassifier
config = config_class.from_pretrained("vinai/phobert-base", num_labels=3)
model = model_class.from_pretrained("vinai/phobert-base", config=config)

model

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaPhoBERTClassifier: ['roberta.encoder.layer.3.output.dense.bias', 'roberta.encoder.layer.8.attention.self.query.bias', 'roberta.encoder.layer.5.attention.output.dense.weight', 'roberta.encoder.layer.1.attention.output.dense.weight', 'roberta.encoder.layer.0.attention.self.value.weight', 'roberta.encoder.layer.0.intermediate.dense.weight', 'roberta.encoder.layer.8.attention.self.key.weight', 'roberta.encoder.layer.11.attention.self.query.bias', 'roberta.encoder.layer.6.output.LayerNorm.bias', 'roberta.encoder.layer.10.output.dense.weight', 'roberta.encoder.layer.4.intermediate.dense.bias', 'roberta.encoder.layer.4.output.LayerNorm.bias', 'roberta.encoder.layer.7.attention.output.LayerNorm.weight', 'lm_head.decoder.weight', 'roberta.encoder.layer.0.attention.self.key.weight', 'roberta.encoder.layer.5.output.LayerNorm.weight', 'roberta.encoder.layer.4.attention.self.key.weight', 'roberta.encod

RobertaPhoBERTClassifier(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [21]:
class ClassificationDataset(Dataset):
    def __init__(self, data, tokenizer):
        xs, ys = data
        # this is like calling tokenizer.encode() but has paddings
        self.examples = tokenizer(text=xs, text_pair=None, truncation=True, padding='max_length',
                                  max_length=tokenizer.model_max_length, return_tensors='pt')
        self.labels = torch.tensor([label_dict[y] for y in ys], dtype=torch.long)

    def __len__(self):
        return len(self.examples["input_ids"])

    def __getitem__(self, index):
        return {key: self.examples[key][index] for key in self.examples}, self.labels[index]


In [22]:
# each data is a pair of (xs, ys)
training_data = ([x[0] for x in training], [x[1] for x in training])
test_data = ([x[0] for x in test], [x[1] for x in test])

In [23]:
training_ds = ClassificationDataset(training_data, tokenizer)
test_ds = ClassificationDataset(test_data, tokenizer)

In [24]:
# create data loader
train_loader = DataLoader(training_ds, batch_size=batch_size)
test_loader = DataLoader(test_ds, batch_size=batch_size)

In [25]:
first_test_batch = next(iter(test_loader))

In [26]:
# xs in the first batch
first_test_batch[0]

{'input_ids': tensor([[    0, 14583,   599,  ...,     1,     1,     1],
         [    0,  6177,     4,  ...,     1,     1,     1],
         [    0,    70,  1080,  ...,     1,     1,     1],
         ...,
         [    0,    92,   573,  ...,     1,     1,     1],
         [    0,   146,    69,  ...,     1,     1,     1],
         [    0,   404,    16,  ...,     1,     1,     1]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]])}

In [27]:
# ys in the first batch
first_test_batch[1]

tensor([0, 1, 2, 2, 1, 0, 1, 2, 0, 2, 1, 2, 2, 0, 1, 0, 2, 1, 1, 1, 0, 0, 0, 2,
        0, 0, 2, 1, 2, 1, 1, 0])

In [28]:
def get_inputs_dict(batch):
    inputs = {key: value.squeeze(1).to(DEVICE) for key, value in batch[0].items()}
    inputs["labels"] = batch[1].to(DEVICE)
    return inputs

In [29]:
get_inputs_dict(first_test_batch)

{'input_ids': tensor([[    0, 14583,   599,  ...,     1,     1,     1],
         [    0,  6177,     4,  ...,     1,     1,     1],
         [    0,    70,  1080,  ...,     1,     1,     1],
         ...,
         [    0,    92,   573,  ...,     1,     1,     1],
         [    0,   146,    69,  ...,     1,     1,     1],
         [    0,   404,    16,  ...,     1,     1,     1]]),
 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         ...,
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0],
         [0, 0, 0,  ..., 0, 0, 0]]),
 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         ...,
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0],
         [1, 1, 1,  ..., 0, 0, 0]]),
 'labels': tensor([0, 1, 2, 2, 1, 0, 1, 2, 0, 2, 1, 2, 2, 0, 1, 0, 2, 1, 1, 1, 0, 0, 0, 2,
         0, 0, 2, 1, 2, 1, 1, 0])}

In [30]:
batch = get_inputs_dict(first_test_batch)
first_input_ids = batch['input_ids'].to(DEVICE)
first_attention_mask = batch['attention_mask'].to(DEVICE)
first_labels = batch['labels'].to(DEVICE)

In [31]:
print(batch)

{'input_ids': tensor([[    0, 14583,   599,  ...,     1,     1,     1],
        [    0,  6177,     4,  ...,     1,     1,     1],
        [    0,    70,  1080,  ...,     1,     1,     1],
        ...,
        [    0,    92,   573,  ...,     1,     1,     1],
        [    0,   146,    69,  ...,     1,     1,     1],
        [    0,   404,    16,  ...,     1,     1,     1]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]]), 'labels': tensor([0, 1, 2, 2, 1, 0, 1, 2, 0, 2, 1, 2, 2, 0, 1, 0, 2, 1, 1, 1, 0, 0, 0, 2,
        0, 0, 2, 1, 2, 1, 1, 0])}


In [32]:
model.to(DEVICE)

RobertaPhoBERTClassifier(
  (model): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): Laye

In [33]:
# for testing
model(first_input_ids, first_attention_mask, first_labels)

(tensor(1.1009, grad_fn=<NllLossBackward0>),
 tensor([[ 0.0988,  0.0270,  0.1320],
         [ 0.0805, -0.0810,  0.1058],
         [ 0.0128,  0.0049,  0.2011],
         [ 0.0976, -0.0253,  0.1851],
         [ 0.0664,  0.0472,  0.1684],
         [ 0.0420, -0.0543,  0.1687],
         [ 0.0374,  0.0303,  0.1037],
         [ 0.0787,  0.0358,  0.1532],
         [ 0.0409, -0.0191,  0.1807],
         [ 0.0458,  0.0105,  0.1269],
         [-0.0136,  0.0020,  0.0582],
         [ 0.0299, -0.0007,  0.1985],
         [ 0.0341, -0.0518,  0.1227],
         [ 0.0393,  0.0416,  0.1328],
         [ 0.0436, -0.0435,  0.1576],
         [ 0.0381,  0.0165,  0.1424],
         [ 0.0691, -0.0125,  0.1356],
         [ 0.0234,  0.0403,  0.1408],
         [ 0.0495,  0.0379,  0.1001],
         [ 0.0414, -0.0012,  0.1190],
         [ 0.0259,  0.0264,  0.2386],
         [ 0.0671, -0.0063,  0.1759],
         [ 0.0332,  0.0094,  0.1255],
         [ 0.0418,  0.0596,  0.1291],
         [ 0.0331, -0.0142,  0.1311],
     

In [34]:
import math
import time
import numpy as np
from transformers.optimization import (
    AdamW, 
    get_linear_schedule_with_warmup
)
from torch.nn import CrossEntropyLoss

In [35]:
warmup_ratio = 0.06
weight_decay=0.0
gradient_accumulation_steps = 1
num_train_epochs = 5
learning_rate = 1e-05
adam_epsilon = 1e-08
t_total = len(train_loader) // gradient_accumulation_steps * num_train_epochs
optimizer_grouped_parameters = []
custom_parameter_names = set()
no_decay = ["bias", "LayerNorm.weight"]
optimizer_grouped_parameters.extend(
    [
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if n not in custom_parameter_names and not any(nd in n for nd in no_decay)
            ],
            "weight_decay": weight_decay,
        },
        {
            "params": [
                p
                for n, p in model.named_parameters()
                if n not in custom_parameter_names and any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
)

warmup_steps = math.ceil(t_total * warmup_ratio)
optimizer = AdamW(optimizer_grouped_parameters, lr=learning_rate, eps=adam_epsilon)
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=t_total)

In [36]:
model.zero_grad()

for epoch in range(num_train_epochs):
    model.train()
    epoch_loss = []
    start_time = time.time()
    
    for batch in train_loader:
        batch = get_inputs_dict(batch)
        input_ids = batch['input_ids'].to(DEVICE)
        attention_mask = batch['attention_mask'].to(DEVICE)
        labels = batch['labels'].to(DEVICE)
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        loss = outputs[0]
        loss.backward()
        optimizer.step()
        scheduler.step()
        model.zero_grad()
        epoch_loss.append(loss.item())
        
    # evaluate model on test data at the end of the epoch.
    eval_loss = 0.0
    nb_eval_steps = 0
    n_batches = len(test_loader)
    preds = np.empty((len(test_ds), 3))
    out_label_ids = np.empty((len(test_ds)))
    model.eval()
    
    for i, test_batch in enumerate(test_loader):
        with torch.no_grad():
            test_batch = get_inputs_dict(test_batch)
            input_ids = test_batch['input_ids'].to(DEVICE)
            attention_mask = test_batch['attention_mask'].to(DEVICE)
            labels = test_batch['labels'].to(DEVICE)
            outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
            tmp_eval_loss, logits = outputs[:2]
            eval_loss += tmp_eval_loss.item()
            
        nb_eval_steps += 1
        start_index = batch_size * i
        end_index = start_index + batch_size if i != (n_batches - 1) else len(test_ds)
        preds[start_index:end_index] = logits.detach().cpu().numpy()
        out_label_ids[start_index:end_index] = test_batch["labels"].detach().cpu().numpy()
        
    eval_loss = eval_loss / nb_eval_steps
    model_outputs = preds
    preds = np.argmax(preds, axis=1)
    # compute test accuracy
    assert len(preds) == len(out_label_ids)
    mismatched = out_label_ids != preds
    accuracy = sum(mismatched)/len(out_label_ids)
    
    print('epoch ', epoch, 'Training avg loss', np.mean(epoch_loss))
    print('epoch ', epoch, 'Testing  avg loss', eval_loss)
    print('test accuracy = ', accuracy)
    elapsed = time.time() - start_time
    print(model_outputs[0:5])
    print(f'elapsed time = {elapsed:5.2f} seconds.')
    print('---------------------------------------------------\n')

# save the model after training
torch.save(model, 'model.pth')


KeyboardInterrupt: 