In [1]:
!nvidia-smi

Sat Jun 11 01:56:32 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   32C    P0    24W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q transformers
!pip install -q datasets
!pip install -q seqeval

[K     |████████████████████████████████| 4.2 MB 15.4 MB/s 
[K     |████████████████████████████████| 596 kB 66.8 MB/s 
[K     |████████████████████████████████| 6.6 MB 49.8 MB/s 
[K     |████████████████████████████████| 86 kB 6.2 MB/s 
[K     |████████████████████████████████| 346 kB 14.5 MB/s 
[K     |████████████████████████████████| 1.1 MB 66.4 MB/s 
[K     |████████████████████████████████| 140 kB 61.5 MB/s 
[K     |████████████████████████████████| 212 kB 77.8 MB/s 
[K     |████████████████████████████████| 86 kB 7.5 MB/s 
[K     |████████████████████████████████| 127 kB 74.3 MB/s 
[K     |████████████████████████████████| 94 kB 4.2 MB/s 
[K     |████████████████████████████████| 271 kB 78.9 MB/s 
[K     |████████████████████████████████| 144 kB 74.3 MB/s 
[K     |████████████████████████████████| 112 kB 76.8 MB/s 
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the foll

In [3]:
import os
import sys

import pandas as pd
import numpy as np
import datasets
from datasets import Dataset
from tqdm.notebook import tqdm
from nltk.corpus.reader import ConllCorpusReader

import transformers
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification

from datasets import Features, Sequence, ClassLabel, Value, DatasetDict
from datasets import load_metric

In [4]:
labels = ['B-PATIENT_ID', 'I-PATIENT_ID',
          'B-NAME', 'I-NAME',
          'B-AGE', 'I-AGE',
          'B-GENDER',
          'B-JOB', 'I-JOB',
          'B-LOCATION', 'I-LOCATION',
          'B-ORGANIZATION', 'I-ORGANIZATION',
          'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE',
          'B-TRANSPORTATION', 'I-TRANSPORTATION',
          'B-DATE', 'I-DATE',
          'O']

id2label = {v: k for v, k in enumerate(labels)}
label2id = {k: v for v, k in enumerate(labels)}
id2label

{0: 'B-PATIENT_ID',
 1: 'I-PATIENT_ID',
 2: 'B-NAME',
 3: 'I-NAME',
 4: 'B-AGE',
 5: 'I-AGE',
 6: 'B-GENDER',
 7: 'B-JOB',
 8: 'I-JOB',
 9: 'B-LOCATION',
 10: 'I-LOCATION',
 11: 'B-ORGANIZATION',
 12: 'I-ORGANIZATION',
 13: 'B-SYMPTOM_AND_DISEASE',
 14: 'I-SYMPTOM_AND_DISEASE',
 15: 'B-TRANSPORTATION',
 16: 'I-TRANSPORTATION',
 17: 'B-DATE',
 18: 'I-DATE',
 19: 'O'}

In [5]:
len(labels)

20

In [6]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-large")

def get_dict_datasets(tagged_sents):
    dict_datasets = {"texts": [], "input_ids": [], "attention_mask": [], "ner_tags": []}
    
    for tagged_sent in tqdm(tagged_sents):
        input_ids = []
        attention_mask = []
        ner_tags = []
        for tagged_word in tagged_sent:
            tokenized_word_input = tokenizer(tagged_word[0], add_special_tokens=False)
            input_ids.extend(tokenized_word_input["input_ids"])
            attention_mask.extend(tokenized_word_input["attention_mask"])
            ner_tags.extend([label2id[tagged_word[1]] for i in range(len(tokenized_word_input["input_ids"]))])
        
        input_ids = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
        attention_mask = [1] + attention_mask + [1]
        ner_tags = [-100] + ner_tags + [-100]

        assert len(input_ids) == len(attention_mask)
        assert len(input_ids) == len(ner_tags)

        dict_datasets["texts"].append([tagged_word[0] for tagged_word in tagged_sent])
        dict_datasets["input_ids"].append(input_ids)
        dict_datasets["attention_mask"].append(attention_mask)
        dict_datasets["ner_tags"].append(ner_tags)

    return dict_datasets

Downloading:   0%|          | 0.00/558 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/874k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.08M [00:00<?, ?B/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
def get_datasets(root='/content/drive/MyDrive/NLP/learning/NLP-K31/dataset/PhoNER_COVID19/data/word/'):
    
    train_corpus_reader = ConllCorpusReader(root=root, fileids=['train_word.conll'], columntypes=["words", "pos"])
    val_corpus_reader = ConllCorpusReader(root=root, fileids=['dev_word.conll'], columntypes=["words", "pos"])
    test_corpus_reader = ConllCorpusReader(root=root, fileids=['test_word.conll'], columntypes=["words", "pos"])

    train_tagged_sents = train_corpus_reader.tagged_sents()
    val_tagged_sents = val_corpus_reader.tagged_sents()
    test_tagged_sents = test_corpus_reader.tagged_sents()

    dict_train_datasets = get_dict_datasets(train_tagged_sents)
    dict_val_datasets = get_dict_datasets(val_tagged_sents)
    dict_test_datasets = get_dict_datasets(test_tagged_sents)

    train_datasets = Dataset.from_dict(dict_train_datasets)
    val_datasets = Dataset.from_dict(dict_val_datasets)
    test_datasets = Dataset.from_dict(dict_test_datasets)

    datasets = DatasetDict({"train": train_datasets, "val": val_datasets, "test": test_datasets})

    features = Features({
    'input_ids': Sequence(Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'ner_tags': Sequence(ClassLabel(names=labels))
    })

    datasets = datasets.map(remove_columns=['texts'], features=features)
    
    return datasets

In [8]:
datasets = get_datasets()
datasets = datasets.rename_column("ner_tags", "labels")

  0%|          | 0/5027 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/5027 [00:00<?, ?ex/s]

  0%|          | 0/2000 [00:00<?, ?ex/s]

  0%|          | 0/3000 [00:00<?, ?ex/s]

In [9]:
datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5027
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [10]:
label_list = datasets["train"].features["labels"].feature.names
label_list

['B-PATIENT_ID',
 'I-PATIENT_ID',
 'B-NAME',
 'I-NAME',
 'B-AGE',
 'I-AGE',
 'B-GENDER',
 'B-JOB',
 'I-JOB',
 'B-LOCATION',
 'I-LOCATION',
 'B-ORGANIZATION',
 'I-ORGANIZATION',
 'B-SYMPTOM_AND_DISEASE',
 'I-SYMPTOM_AND_DISEASE',
 'B-TRANSPORTATION',
 'I-TRANSPORTATION',
 'B-DATE',
 'I-DATE',
 'O']

In [11]:
print(datasets["train"][0]["input_ids"])

[0, 1248, 4, 757, 194, 112, 9, 717, 2137, 3795, 9089, 6232, 1927, 31, 1195, 63, 1010, 7, 125, 1059, 5, 2]


In [12]:
print(datasets["train"][0]["attention_mask"])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [13]:
print(datasets["train"][0]["labels"])

[-100, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 11, 12, 19, -100]


In [14]:
model = AutoModelForTokenClassification.from_pretrained("/content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm/checkpoint-4400", num_labels=len(label_list))
model

Some weights of the model checkpoint at /content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm/checkpoint-4400 were not used when initializing RobertaForTokenClassification: ['lm_head.dense.weight', 'lm_head.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.dense.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at /content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm/checkpoint-4400 and are new

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 1024, padding_idx=1)
      (position_embeddings): Embedding(258, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
            

In [15]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [16]:
metric = load_metric("seqeval")
metric.compute(predictions=[labels], references=[labels])

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

{'AGE': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'DATE': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'GENDER': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'JOB': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'LOCATION': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'NAME': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'ORGANIZATION': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'PATIENT_ID': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'SYMPTOM_AND_DISEASE': {'f1': 1.0,
  'number': 1,
  'precision': 1.0,
  'recall': 1.0},
 'TRANSPORTATION': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'overall_accuracy': 1.0,
 'overall_f1': 1.0,
 'overall_precision': 1.0,
 'overall_recall': 1.0}

In [17]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

In [18]:
args = TrainingArguments(
    "/content/test",
    overwrite_output_dir=True,
    num_train_epochs=10,
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

In [19]:
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=datasets['train'],
    eval_dataset=datasets['val'],
    compute_metrics=compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 5027
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3150


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.109506,0.897136,0.91471,0.905838,0.972575
2,0.254100,0.079681,0.935944,0.942207,0.939065,0.980177
3,0.254100,0.087205,0.943208,0.943667,0.943438,0.980143
4,0.033700,0.08776,0.945956,0.939165,0.942548,0.980379
5,0.017900,0.09154,0.94897,0.952549,0.950756,0.982014
6,0.017900,0.098439,0.948151,0.954496,0.951313,0.982301
7,0.010700,0.100405,0.946872,0.951941,0.949399,0.981761
8,0.007300,0.099981,0.949945,0.953644,0.951791,0.982351
9,0.007300,0.105951,0.949229,0.950846,0.950036,0.98225
10,0.005000,0.107279,0.948522,0.952792,0.950653,0.982385


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to /content/test/checkpoint-315
Configuration saved in /content/test/checkpoint-315/config.json
Model weights saved in /content/test/checkpoint-315/pytorch_model.bin
tokenizer config file saved in /content/test/checkpoint-315/tokenizer_config.json
Special tokens file saved in /content/test/checkpoint-315/special_tokens_map.json
added tokens file saved in /content/test/checkpoint-315/added_tokens.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to /content/test/checkpoint-630
Configuration saved in /content/test/checkpoint-630/config.json
Model weights saved in /content/test/checkpoint-630/pytorch_model.bin
tokenizer config file saved in /content/test/checkpoint-630/tokenizer_config.json
Special tokens file saved in /content/test/checkpoint-630/special_tokens_map.json
added tokens file saved in /content/test/checkpoint-630/added_tokens.json
**

TrainOutput(global_step=3150, training_loss=0.05244968768150087, metrics={'train_runtime': 1018.6053, 'train_samples_per_second': 49.352, 'train_steps_per_second': 3.092, 'total_flos': 5391012353562864.0, 'train_loss': 0.05244968768150087, 'epoch': 10.0})

In [20]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


{'epoch': 10.0,
 'eval_accuracy': 0.9823514538558786,
 'eval_f1': 0.9517911353976928,
 'eval_loss': 0.09998088330030441,
 'eval_precision': 0.9499454611562235,
 'eval_recall': 0.9536439956199051,
 'eval_runtime': 11.2047,
 'eval_samples_per_second': 178.496,
 'eval_steps_per_second': 11.156}

In [21]:
predictions, labels, _ = trainer.predict(datasets['test'])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results= metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 3000
  Batch size = 16


{'AGE': {'f1': 0.9648068669527897,
  'number': 584,
  'precision': 0.9672977624784854,
  'recall': 0.9623287671232876},
 'DATE': {'f1': 0.9867708959711364,
  'number': 1655,
  'precision': 0.9820466786355476,
  'recall': 0.9915407854984895},
 'GENDER': {'f1': 0.9723643807574207,
  'number': 494,
  'precision': 0.9834368530020704,
  'recall': 0.9615384615384616},
 'JOB': {'f1': 0.8137535816618912,
  'number': 174,
  'precision': 0.8114285714285714,
  'recall': 0.8160919540229885},
 'LOCATION': {'f1': 0.9342379958246346,
  'number': 4788,
  'precision': 0.9338480801335559,
  'recall': 0.9346282372598163},
 'NAME': {'f1': 0.9638752052545156,
  'number': 605,
  'precision': 0.9575856443719413,
  'recall': 0.9702479338842975},
 'ORGANIZATION': {'f1': 0.886945658963148,
  'number': 800,
  'precision': 0.8863920099875156,
  'recall': 0.8875},
 'PATIENT_ID': {'f1': 0.9834231247409865,
  'number': 2401,
  'precision': 0.9785567010309278,
  'recall': 0.9883381924198251},
 'SYMPTOM_AND_DISEASE': 

In [22]:
from seqeval.metrics import classification_report

print(classification_report(true_predictions, true_labels, digits=3))

                     precision    recall  f1-score   support

                AGE      0.962     0.967     0.965       581
               DATE      0.992     0.982     0.987      1671
             GENDER      0.962     0.983     0.972       483
                JOB      0.816     0.811     0.814       175
           LOCATION      0.935     0.934     0.934      4792
               NAME      0.970     0.958     0.964       613
       ORGANIZATION      0.887     0.886     0.887       801
         PATIENT_ID      0.988     0.979     0.983      2425
SYMPTOM_AND_DISEASE      0.858     0.889     0.873      1124
     TRANSPORTATION      0.988     0.979     0.984       433

          micro avg      0.946     0.946     0.946     13098
          macro avg      0.936     0.937     0.936     13098
       weighted avg      0.946     0.946     0.946     13098



In [23]:
trainer.save_model('/content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm_task')

Saving model checkpoint to /content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm_task
Configuration saved in /content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm_task/config.json
Model weights saved in /content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm_task/pytorch_model.bin
tokenizer config file saved in /content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm_task/tokenizer_config.json
Special tokens file saved in /content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm_task/special_tokens_map.json
added tokens file saved in /content/drive/MyDrive/NLP/learning/NLP-K31/pretrained/phoBert_large_mlm_task/added_tokens.json
