In [21]:
import json
import torch
from torch.utils.data import DataLoader, Dataset, random_split
from torch import Generator

In [9]:
from os.path import expanduser

# Location of the tokenized Vietnamese XNLI file under the user's home directory.
home = expanduser("~")
path = f"{home}/vlp/dat/nli/XNLI-1.0/vi.tok.jsonl"
print(path)

/Users/phuonglh/vlp/dat/nli/XNLI-1.0/vi.tok.jsonl


In [58]:
class VietnameseXNLI(Dataset):
  """Vietnamese XNLI sentence pairs read from a JSON-lines file.

  Each sample is a ``(text, label)`` pair of strings: ``text`` is the tokenized
  first and second sentence joined by ``" </s> "``, and ``label`` is the
  record's raw ``gold_label`` value.
  """

  def __init__(self, jsonlPath):
    """Load every record of ``jsonlPath`` into memory.

    Args:
      jsonlPath: Path to a UTF-8 JSON-lines file whose records carry the keys
        ``sentence1_tokenized``, ``sentence2_tokenized`` and ``gold_label``.

    Raises:
      OSError: If the file cannot be opened.
      json.JSONDecodeError: If a non-blank line is not valid JSON.
      KeyError: If a record lacks one of the expected keys.
    """
    self.X = []
    self.y = []
    # Explicit UTF-8: the platform default encoding may fail on Vietnamese text.
    with open(jsonlPath, encoding="utf-8") as f:
      for line in f:
        # Blank or whitespace-only lines are not records; skip them
        # instead of letting json.loads raise.
        if not line.strip():
          continue
        sample = json.loads(line)
        self.X.append(sample["sentence1_tokenized"] + " </s> " + sample["sentence2_tokenized"])
        self.y.append(sample["gold_label"])

  def __len__(self):
    """Return the number of loaded samples."""
    return len(self.y)

  def __getitem__(self, i):
    """Return the ``(text, label)`` pair stored at index ``i``."""
    return self.X[i], self.y[i]


In [59]:
# Load the corpus, then hold out the last 20% of a seeded random permutation for testing.
dataset = VietnameseXNLI(path)
N = int(0.8 * len(dataset))
training, test = random_split(
    dataset,
    lengths=[N, len(dataset) - N],
    generator=Generator().manual_seed(12345),
)

In [60]:
# One loader per split, both with a batch size of 32 and default (sequential) ordering.
train_loader, test_loader = (
    DataLoader(split, batch_size=32) for split in (training, test)
)

In [61]:
# Display the first batch produced by test_loader.
next(iter(test_loader))

[('ừm không có gì sai với việc một phụ_huynh tặng tất_cả mọi thứ cô ấy có để ừ cho ừ cho một cá_nhân như ừ như bạn </s> Bố_mẹ có_thể cho con_cái nhiều quà_cáp .',
  'Ngày_nay , những người Đức này thậm_chí không ở lại Hoa_Kỳ . </s> Những người Đức này được sử_dụng trên khắp châu Mỹ ngày này .',
  'tôi sợ là , tôi nghĩ rằng tên anh ấy_là Anderson , một quý ông đang tìm_kiếm một chiếc vé độc_lập chống lại Reagan </s> Anderson đánh_bại Reagan .',
  'Nếu có_khi nào tôi viết một cuốn tự_truyện , nó sẽ là từ_điển của những địa_danh và con_người được định_nghĩa theo tầm quan_trọng riêng . </s> Điều quan_trọng với tôi là tự_truyện của tôi có_thể được tiếp_cận dễ_dàng nhất .',
  'Trong xe_ngựa kéo , bạn ghé thăm dân_làng đang thu_hoạch , cắt lông cừu , mài bột trong nhà_máy , dệt , và làmt móng ngựa . </s> Người_dân trong làng ăn_mặc như thời thuộc_địa .',
  'Những người không giao_tiếp với các ngôn_ngữ liên_quan có_lẽ sẽ không có câu trả_lời cho những câu_hỏi tu_từ này , nhưng tôi cảm_thấy chắ

In [63]:
# Display only the first element of the first test batch (the input texts).
next(iter(test_loader))[0]

('ừm không có gì sai với việc một phụ_huynh tặng tất_cả mọi thứ cô ấy có để ừ cho ừ cho một cá_nhân như ừ như bạn </s> Bố_mẹ có_thể cho con_cái nhiều quà_cáp .',
 'Ngày_nay , những người Đức này thậm_chí không ở lại Hoa_Kỳ . </s> Những người Đức này được sử_dụng trên khắp châu Mỹ ngày này .',
 'tôi sợ là , tôi nghĩ rằng tên anh ấy_là Anderson , một quý ông đang tìm_kiếm một chiếc vé độc_lập chống lại Reagan </s> Anderson đánh_bại Reagan .',
 'Nếu có_khi nào tôi viết một cuốn tự_truyện , nó sẽ là từ_điển của những địa_danh và con_người được định_nghĩa theo tầm quan_trọng riêng . </s> Điều quan_trọng với tôi là tự_truyện của tôi có_thể được tiếp_cận dễ_dàng nhất .',
 'Trong xe_ngựa kéo , bạn ghé thăm dân_làng đang thu_hoạch , cắt lông cừu , mài bột trong nhà_máy , dệt , và làmt móng ngựa . </s> Người_dân trong làng ăn_mặc như thời thuộc_địa .',
 'Những người không giao_tiếp với các ngôn_ngữ liên_quan có_lẽ sẽ không có câu trả_lời cho những câu_hỏi tu_từ này , nhưng tôi cảm_thấy chắc_chắn

In [23]:
from transformers import AutoModel, AutoTokenizer

In [24]:
# The encoder and its tokenizer are loaded from the same pretrained checkpoint.
PHOBERT_CHECKPOINT = "vinai/phobert-base"
bert = AutoModel.from_pretrained(PHOBERT_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(PHOBERT_CHECKPOINT)

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaModel: ['lm_head.bias', 'lm_head.decoder.bias', 'lm_head.layer_norm.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.dense.weight', 'lm_head.decoder.weight']
- This IS expected if you are initializing RobertaModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [52]:
# Display the tokenizer's repr to inspect its configuration.
tokenizer

PreTrainedTokenizer(name_or_path='vinai/phobert-base', vocab_size=64000, model_max_len=256, is_fast=False, padding_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '</s>', 'unk_token': '<unk>', 'sep_token': '</s>', 'pad_token': '<pad>', 'cls_token': '<s>', 'mask_token': '<mask>'})

In [25]:
# Map each gold-label string to its integer class index.
labels = {
    name: index
    for index, name in enumerate(("entailment", "neutral", "contradiction"))
}

In [72]:
def tensorify(batch):
    """Convert one batch of raw (text, label) strings into per-sample tensors.

    Args:
        batch: A pair ``(texts, gold_labels)`` of equal-length sequences of
            strings; ``batch[0]`` holds the texts and ``batch[1]`` the labels.

    Returns:
        A tuple ``(xs, ys)`` of lists. ``xs[i]`` is a ``(1, n_i)`` tensor of
        token ids for ``texts[i]`` and ``ys[i]`` is a ``(1,)`` tensor holding
        the class index of ``gold_labels[i]``. Sequences are not padded, so
        ``n_i`` varies between samples.

    Raises:
        KeyError: If a label is not a key of ``labels``.
    """
    xs = []
    ys = []
    for x, y in zip(batch[0], batch[1]):
        # Truncate to the tokenizer's maximum model length so that overly long
        # sentence pairs do not yield id sequences the encoder cannot consume.
        u = torch.tensor([tokenizer.encode(x, truncation=True)])
        v = torch.tensor([labels[y]])
        xs.append(u)
        ys.append(v)
    return (xs, ys)

In [22]:
import torch.nn as nn
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [74]:
# Sanity-check tensorify on the first batch from test_loader.
tensorify(next(iter(test_loader)))

([tensor([[    0, 14583,   599,    17,    10,   148,  1069,    15,    49,    16,
            1635,   806,   392,   207,   129,   106,   241,    10,    24,  6236,
              13,  6236,    13,    16,   435,    42,  6236,    42,    88,     2,
            7026,    62,    13,  2974,    36, 32780,     5,     2]]),
  tensor([[   0, 6177,    4,   21,   18,  357,   23,  644,   17,   25,   44, 1428,
              5,    2,  217,   18,  357,   23,   11,  117,   34, 1270, 2935,   93,
             43,   23,    5,    2]]),
  tensor([[    0,    70,  1080,     8,     4,    70,   487,    87,   221,    83,
           25190,     8, 13641,     4,    16,   882,    46,    52,   885,    16,
             152,   826,  1912,   335,    44, 39336,     2, 13641,  2424, 39336,
               5,     2]]),
  tensor([[    0,   313,  4012,   142,    70,   467,    16,  1088, 13811,     4,
             231,    38,     8, 16231,     7,    21,  7723,     6,   754,    11,
            8409,    63,   748,   331,   465,     