In [1]:
# Report the GPU model, driver and CUDA version available to this runtime.
!nvidia-smi

Sat Jun 11 01:34:36 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   33C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q transformers
!pip install -q datasets

In [3]:
import os
import sys

import pandas as pd
import numpy as np
import datasets
from datasets import Dataset
from nltk.corpus.reader import ConllCorpusReader

from transformers import AutoModel, AutoTokenizer, AutoModelForMaskedLM
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForLanguageModeling

In [4]:
from nltk.corpus.reader import ConllCorpusReader

# Both splits sit in the same folder and share one column layout, so only the
# file name differs between the two readers.
_PHONER_WORD_DIR = '/content/drive/MyDrive/NLP/learning/NLP-K31/dataset/PhoNER_COVID19/data/word/'


def _open_conll_split(conll_file):
    """Return a ConllCorpusReader over a single file of the word-level data folder."""
    return ConllCorpusReader(
        _PHONER_WORD_DIR,
        fileids=[conll_file],
        columntypes=["words", "pos"],
    )


train_corpus_reader = _open_conll_split('train_word.conll')
val_corpus_reader = _open_conll_split('dev_word.conll')

In [5]:
# Tag inventory (B-/I- prefixed entity tags plus 'O'); the position of a tag in
# this list is its integer class id.
labels = [
    'B-PATIENT_ID', 'I-PATIENT_ID',
    'B-NAME', 'I-NAME',
    'B-AGE',
    'B-GENDER',
    'B-JOB', 'I-JOB',
    'B-LOCATION', 'I-LOCATION',
    'B-ORGANIZATION', 'I-ORGANIZATION',
    'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE',
    'B-TRANSPORTATION', 'I-TRANSPORTATION',
    'B-DATE', 'I-DATE',
    'O',
]

# Forward (id -> tag) and reverse (tag -> id) lookups built from the list order.
id2label = dict(enumerate(labels))
label2id = {tag: idx for idx, tag in id2label.items()}
id2label

{0: 'B-PATIENT_ID',
 1: 'I-PATIENT_ID',
 2: 'B-NAME',
 3: 'I-NAME',
 4: 'B-AGE',
 5: 'B-GENDER',
 6: 'B-JOB',
 7: 'I-JOB',
 8: 'B-LOCATION',
 9: 'I-LOCATION',
 10: 'B-ORGANIZATION',
 11: 'I-ORGANIZATION',
 12: 'B-SYMPTOM_AND_DISEASE',
 13: 'I-SYMPTOM_AND_DISEASE',
 14: 'B-TRANSPORTATION',
 15: 'I-TRANSPORTATION',
 16: 'B-DATE',
 17: 'I-DATE',
 18: 'O'}

In [6]:
# Pull the sentence views out of both readers in one step (train first, then dev).
train_sents, val_sents = (
    reader.sents() for reader in (train_corpus_reader, val_corpus_reader)
)

In [7]:
# Load the tokenizer and the masked-LM model from one shared checkpoint id so the
# two cannot drift apart when the checkpoint is changed.
phobert_checkpoint = "vinai/phobert-large"
tokenizer = AutoTokenizer.from_pretrained(phobert_checkpoint)
model = AutoModelForMaskedLM.from_pretrained(phobert_checkpoint)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
# Peek at the first training sentence: a list of word strings.
train_sents[0]

['Đồng_thời',
 ',',
 'bệnh_viện',
 'tiếp_tục',
 'thực_hiện',
 'các',
 'biện_pháp',
 'phòng_chống',
 'dịch_bệnh',
 'COVID',
 '-',
 '19',
 'theo',
 'hướng_dẫn',
 'của',
 'Bộ',
 'Y_tế',
 '.']

In [9]:
# Round-trip the first sentence through encode/decode to see what the tokenizer
# preserves (special tokens are added and unknown words show up as <unk>).
tokenizer.decode(tokenizer.encode(train_sents[0]))

'<s> Đồng_thời, bệnh_viện tiếp_tục thực_hiện các biện_pháp phòng_chống dịch_bệnh <unk> - 19 theo hướng_dẫn của Bộ Y_tế. </s>'

In [10]:
# Raw ids for the same sentence, including the leading and trailing special tokens.
tokenizer.encode(train_sents[0])

[0,
 1248,
 4,
 757,
 194,
 112,
 9,
 717,
 2137,
 3795,
 3,
 31,
 1195,
 63,
 1010,
 7,
 125,
 1059,
 5,
 2]

In [11]:
# Encode every train + dev sentence into the two columns used for MLM training.
# NOTE(review): each `sent` is a list of words handed to `encode` as-is; the sample
# output for the first sentence shows a word such as 'COVID' coming back as <unk>.
# Confirm this is intended rather than letting the tokenizer split such words.
tokenized_dataset_dict = {"input_ids": [], "attention_mask": []}

for sent in (train_sents + val_sents):
    input_ids = tokenizer.encode(sent)
    tokenized_dataset_dict["input_ids"].append(input_ids)
    # Size the mask from the ids actually produced instead of assuming exactly one
    # id per word plus two special tokens, so the two columns can never disagree.
    tokenized_dataset_dict["attention_mask"].append([1] * len(input_ids))

In [12]:
# Wrap the column dict in a datasets.Dataset; the bare name on the last line
# displays its features and row count.
tokenized_datasets = Dataset.from_dict(tokenized_dataset_dict)
tokenized_datasets

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 7027
})

In [13]:
# Inspect the first row to check that input_ids and attention_mask line up.
tokenized_datasets[0]

{'attention_mask': [1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1,
  1],
 'input_ids': [0,
  1248,
  4,
  757,
  194,
  112,
  9,
  717,
  2137,
  3795,
  3,
  31,
  1195,
  63,
  1010,
  7,
  125,
  1059,
  5,
  2]}

In [14]:
# Number of token ids per chunk; read as a global by group_texts.
block_size = 128

In [15]:
def group_texts(examples):
    """Pack a batch of tokenized rows into fixed-size chunks of ``block_size`` ids.

    Every column of ``examples`` is flattened into one long list, the tail that
    does not fill a whole chunk is dropped, and the remainder is sliced into
    consecutive chunks. ``block_size`` is read from the notebook's globals.

    Args:
        examples: Batch mapping of column name -> list of per-row lists. It must
            contain an ``input_ids`` column, and all columns are expected to
            flatten to the same total length.

    Returns:
        dict with the same columns, each a list of ``block_size``-long lists,
        plus a ``labels`` column that is a shallow copy of ``input_ids``.
    """
    from itertools import chain

    # chain.from_iterable flattens in linear time; ``sum(rows, [])`` re-copies the
    # accumulated list for every row and is quadratic in the batch size.
    concatenated_examples = {
        k: list(chain.from_iterable(examples[k])) for k in examples.keys()
    }
    total_length = len(concatenated_examples[list(examples.keys())[0]])
    # Drop the small remainder that does not fill a whole block; padding it instead
    # would be an alternative if that is ever needed.
    total_length = (total_length // block_size) * block_size
    # Split every column into consecutive chunks of block_size.
    result = {
        k: [t[i : i + block_size] for i in range(0, total_length, block_size)]
        for k, t in concatenated_examples.items()
    }
    result["labels"] = result["input_ids"].copy()
    return result

In [16]:
# Pack the per-sentence rows into block_size-long chunks, 1000 rows per call to
# group_texts, spread over 4 worker processes.
# NOTE(review): group_texts drops the incomplete tail of every batch, so a few
# tokens are lost per 1000-row batch rather than once for the whole dataset.
# NOTE(review): the Trainer cell further down is given `tokenized_datasets`, not
# this grouped dataset — confirm which one is meant to be trained on.
lm_datasets = tokenized_datasets.map(
    group_texts,
    batched=True,
    batch_size=1000,
    num_proc=4,
)

        

#0:   0%|          | 0/2 [00:00<?, ?ba/s]

#1:   0%|          | 0/2 [00:00<?, ?ba/s]

#2:   0%|          | 0/2 [00:00<?, ?ba/s]

#3:   0%|          | 0/2 [00:00<?, ?ba/s]

In [17]:
# Decode the second grouped chunk to check that sentences are packed back-to-back,
# each still wrapped in its own <s> ... </s> pair.
tokenizer.decode(lm_datasets[1]["input_ids"])

'<s> " Bệnh_nhân 523 " và chồng là " bệnh_nhân 522 ", 67 tuổi, được Bộ Y_tế ghi_nhận nhiễm <unk> hôm 31/7. </s> <s> Trường_hợp bệnh_nhân 188 <unk>, theo thông_tin từ cơ_quan y_tế địa_phương, bệnh_nhân về nhà ngày 14 - 4 và từ đó chỉ tiếp_xúc với chồng và con, đây không phải là <unk> mà do có_thể virus yếu ở thời_điểm lấy mẫu lần trước, hoặc vị_trí lấy mẫu, thời_điểm lấy mẫu dẫn đến âm_tính giả. </s> <s> Riêng bệnh_nhân 91 là phi_công người Anh ngụ ở quận 2, TP. HCM và có liên_quan ổ dịch quán bar <unk>, thông_tin cập_nhật ngày 10 - 4 cho biết diễn_biến bệnh của bệnh_nhân không xấu hơn nhưng cũng'

In [18]:
# Batch collator for masked-language-model training; 0.15 is passed as the
# token-masking probability.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm_probability=0.15)

In [19]:
training_args = TrainingArguments(
    # Checkpoints and config are written under this Drive folder.
    output_dir='/content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm',
    # Permit writing into output_dir even when an earlier run left content there.
    overwrite_output_dir=True,
    num_train_epochs=10,
    per_device_train_batch_size=16,
    # Log the training loss at step 1 and then every 100 steps.
    logging_first_step=True,
    logging_steps=100,
    # 4400 equals the total optimization steps reported in the training log, so the
    # only checkpoint is written at the very end. NOTE(review): this value is tied
    # to the dataset size, batch size and epoch count — update it if any changes.
    save_steps=4400,
)

In [20]:
# Continue masked-LM training of the loaded model on the PhoNER sentences.
# NOTE(review): train_dataset is the per-sentence `tokenized_datasets` (the log
# reports 7027 examples), not the block-grouped `lm_datasets` built earlier —
# confirm this is intended; if so, the grouping step is unused by training.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)

# Runs the full schedule; the returned TrainOutput is displayed as the cell result.
trainer.train()

***** Running training *****
  Num examples = 7027
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 4400


Step,Training Loss
1,2.6554
100,1.9642
200,1.8434
300,1.7273
400,1.7153
500,1.6123
600,1.5569
700,1.5261
800,1.5417
900,1.5826


Saving model checkpoint to /content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm/checkpoint-4400
Configuration saved in /content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm/checkpoint-4400/config.json
Model weights saved in /content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm/checkpoint-4400/pytorch_model.bin


Training completed. Do not forget to share your model on huggingface.co/models =)




TrainOutput(global_step=4400, training_loss=1.251163853244348, metrics={'train_runtime': 1168.0042, 'train_samples_per_second': 60.162, 'train_steps_per_second': 3.767, 'total_flos': 7356086302797966.0, 'train_loss': 1.251163853244348, 'epoch': 10.0})