In [1]:
!nvidia-smi

Sat Jun  4 06:41:22 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   34C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q transformers
!pip install -q datasets
!pip install -q seqeval

In [3]:
import os
import sys

import pandas as pd
import numpy as np
import datasets
from datasets import Dataset
from tqdm.notebook import tqdm
from nltk.corpus.reader import ConllCorpusReader

import transformers
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification

from datasets import Features, Sequence, ClassLabel, Value, DatasetDict
from datasets import load_metric

In [4]:
labels = ['B-PATIENT_ID', 'I-PATIENT_ID',
          'B-NAME', 'I-NAME',
          'B-AGE', 'I-AGE',
          'B-GENDER',
          'B-JOB', 'I-JOB',
          'B-LOCATION', 'I-LOCATION',
          'B-ORGANIZATION', 'I-ORGANIZATION',
          'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE',
          'B-TRANSPORTATION', 'I-TRANSPORTATION',
          'B-DATE', 'I-DATE',
          'O']

id2label = {v: k for v, k in enumerate(labels)}
label2id = {k: v for v, k in enumerate(labels)}
id2label

{0: 'B-PATIENT_ID',
 1: 'I-PATIENT_ID',
 2: 'B-NAME',
 3: 'I-NAME',
 4: 'B-AGE',
 5: 'I-AGE',
 6: 'B-GENDER',
 7: 'B-JOB',
 8: 'I-JOB',
 9: 'B-LOCATION',
 10: 'I-LOCATION',
 11: 'B-ORGANIZATION',
 12: 'I-ORGANIZATION',
 13: 'B-SYMPTOM_AND_DISEASE',
 14: 'I-SYMPTOM_AND_DISEASE',
 15: 'B-TRANSPORTATION',
 16: 'I-TRANSPORTATION',
 17: 'B-DATE',
 18: 'I-DATE',
 19: 'O'}

In [5]:
len(labels)

20

In [6]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-large")

def get_dict_datasets(tagged_sents):
    dict_datasets = {"texts": [], "input_ids": [], "attention_mask": [], "ner_tags": []}
    
    for tagged_sent in tqdm(tagged_sents):
        input_ids = []
        attention_mask = []
        ner_tags = []
        for tagged_word in tagged_sent:
            tokenized_word_input = tokenizer(tagged_word[0], add_special_tokens=False)
            input_ids.extend(tokenized_word_input["input_ids"])
            attention_mask.extend(tokenized_word_input["attention_mask"])
            ner_tags.extend([label2id[tagged_word[1]] for i in range(len(tokenized_word_input["input_ids"]))])
        
        input_ids = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
        attention_mask = [1] + attention_mask + [1]
        ner_tags = [-100] + ner_tags + [-100]

        assert len(input_ids) == len(attention_mask)
        assert len(input_ids) == len(ner_tags)

        dict_datasets["texts"].append([tagged_word[0] for tagged_word in tagged_sent])
        dict_datasets["input_ids"].append(input_ids)
        dict_datasets["attention_mask"].append(attention_mask)
        dict_datasets["ner_tags"].append(ner_tags)

    return dict_datasets

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
def get_datasets(root='/content/drive/MyDrive/NLP/learning/NLP-K31/dataset/PhoNER_COVID19/data/word/'):
    
    train_corpus_reader = ConllCorpusReader(root=root, fileids=['train_word.conll'], columntypes=["words", "pos"])
    val_corpus_reader = ConllCorpusReader(root=root, fileids=['dev_word.conll'], columntypes=["words", "pos"])
    test_corpus_reader = ConllCorpusReader(root=root, fileids=['test_word.conll'], columntypes=["words", "pos"])

    train_tagged_sents = train_corpus_reader.tagged_sents()
    val_tagged_sents = val_corpus_reader.tagged_sents()
    test_tagged_sents = test_corpus_reader.tagged_sents()

    dict_train_datasets = get_dict_datasets(train_tagged_sents)
    dict_val_datasets = get_dict_datasets(val_tagged_sents)
    dict_test_datasets = get_dict_datasets(test_tagged_sents)

    train_datasets = Dataset.from_dict(dict_train_datasets)
    val_datasets = Dataset.from_dict(dict_val_datasets)
    test_datasets = Dataset.from_dict(dict_test_datasets)

    datasets = DatasetDict({"train": train_datasets, "val": val_datasets, "test": test_datasets})

    features = Features({
    'input_ids': Sequence(Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'ner_tags': Sequence(ClassLabel(names=labels))
    })

    datasets = datasets.map(remove_columns=['texts'], features=features)
    
    return datasets

In [8]:
datasets = get_datasets()
datasets = datasets.rename_column("ner_tags", "labels")

  0%|          | 0/5027 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/5027 [00:00<?, ?ex/s]

  0%|          | 0/2000 [00:00<?, ?ex/s]

  0%|          | 0/3000 [00:00<?, ?ex/s]

In [9]:
datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5027
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [10]:
label_list = datasets["train"].features["labels"].feature.names
label_list

['B-PATIENT_ID',
 'I-PATIENT_ID',
 'B-NAME',
 'I-NAME',
 'B-AGE',
 'I-AGE',
 'B-GENDER',
 'B-JOB',
 'I-JOB',
 'B-LOCATION',
 'I-LOCATION',
 'B-ORGANIZATION',
 'I-ORGANIZATION',
 'B-SYMPTOM_AND_DISEASE',
 'I-SYMPTOM_AND_DISEASE',
 'B-TRANSPORTATION',
 'I-TRANSPORTATION',
 'B-DATE',
 'I-DATE',
 'O']

In [11]:
print(datasets["train"][0]["input_ids"])

[0, 1248, 4, 757, 194, 112, 9, 717, 2137, 3795, 9089, 6232, 1927, 31, 1195, 63, 1010, 7, 125, 1059, 5, 2]


In [12]:
print(datasets["train"][0]["attention_mask"])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [13]:
print(datasets["train"][0]["labels"])

[-100, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 11, 12, 19, -100]


In [14]:
model = AutoModelForTokenClassification.from_pretrained("vinai/phobert-large", num_labels=len(label_list))
model

Downloading:   0%|          | 0.00/1.38G [00:00<?, ?B/s]

Some weights of the model checkpoint at vinai/phobert-large were not used when initializing RobertaForTokenClassification: ['lm_head.decoder.weight', 'lm_head.dense.bias', 'lm_head.decoder.bias', 'lm_head.bias', 'lm_head.dense.weight', 'lm_head.layer_norm.bias', 'lm_head.layer_norm.weight']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-large and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this 

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 1024, padding_idx=1)
      (position_embeddings): Embedding(258, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=True)
            

In [15]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [16]:
metric = load_metric("seqeval")
metric.compute(predictions=[labels], references=[labels])

Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

{'AGE': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'DATE': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'GENDER': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'JOB': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'LOCATION': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'NAME': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'ORGANIZATION': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'PATIENT_ID': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'SYMPTOM_AND_DISEASE': {'f1': 1.0,
  'number': 1,
  'precision': 1.0,
  'recall': 1.0},
 'TRANSPORTATION': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'overall_accuracy': 1.0,
 'overall_f1': 1.0,
 'overall_precision': 1.0,
 'overall_recall': 1.0}

In [17]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

In [18]:
args = TrainingArguments(
    "/content/test",
    overwrite_output_dir=True,
    num_train_epochs=10,
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    weight_decay=0.01,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

In [19]:
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=datasets['train'],
    eval_dataset=datasets['val'],
    compute_metrics=compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 5027
  Num Epochs = 10
  Instantaneous batch size per device = 16
  Total train batch size (w. parallel, distributed & accumulation) = 16
  Gradient Accumulation steps = 1
  Total optimization steps = 3150


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.100958,0.888902,0.921888,0.905095,0.973367
2,0.233400,0.080537,0.934247,0.945614,0.939896,0.980733
3,0.233400,0.074852,0.947643,0.949142,0.948392,0.98166
4,0.033200,0.086462,0.950846,0.943789,0.947304,0.981323
5,0.018000,0.083993,0.946242,0.957294,0.951736,0.982183
6,0.018000,0.091611,0.950176,0.953644,0.951907,0.981795
7,0.011600,0.098956,0.952154,0.949142,0.950646,0.981913
8,0.007400,0.10491,0.95187,0.950481,0.951175,0.981627
9,0.007400,0.105044,0.952566,0.955347,0.953955,0.98252
10,0.004600,0.106456,0.952554,0.955104,0.953827,0.982419


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to /content/test/checkpoint-315
Configuration saved in /content/test/checkpoint-315/config.json
Model weights saved in /content/test/checkpoint-315/pytorch_model.bin
tokenizer config file saved in /content/test/checkpoint-315/tokenizer_config.json
Special tokens file saved in /content/test/checkpoint-315/special_tokens_map.json
added tokens file saved in /content/test/checkpoint-315/added_tokens.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16
Saving model checkpoint to /content/test/checkpoint-630
Configuration saved in /content/test/checkpoint-630/config.json
Model weights saved in /content/test/checkpoint-630/pytorch_model.bin
tokenizer config file saved in /content/test/checkpoint-630/tokenizer_config.json
Special tokens file saved in /content/test/checkpoint-630/special_tokens_map.json
added tokens file saved in /content/test/checkpoint-630/added_tokens.json
**

TrainOutput(global_step=3150, training_loss=0.04916271870098417, metrics={'train_runtime': 1034.0315, 'train_samples_per_second': 48.616, 'train_steps_per_second': 3.046, 'total_flos': 5391012353562864.0, 'train_loss': 0.04916271870098417, 'epoch': 10.0})

In [20]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 16


{'epoch': 10.0,
 'eval_accuracy': 0.9825200168563001,
 'eval_f1': 0.9539545620216255,
 'eval_loss': 0.10504396259784698,
 'eval_precision': 0.9525658134174451,
 'eval_recall': 0.9553473658595937,
 'eval_runtime': 9.8535,
 'eval_samples_per_second': 202.974,
 'eval_steps_per_second': 12.686}

In [21]:
predictions, labels, _ = trainer.predict(datasets['test'])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results= metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 3000
  Batch size = 16


{'AGE': {'f1': 0.9613733905579399,
  'number': 584,
  'precision': 0.963855421686747,
  'recall': 0.958904109589041},
 'DATE': {'f1': 0.9888520638746611,
  'number': 1655,
  'precision': 0.9861778846153846,
  'recall': 0.9915407854984895},
 'GENDER': {'f1': 0.9807497467071936,
  'number': 494,
  'precision': 0.9817444219066938,
  'recall': 0.979757085020243},
 'JOB': {'f1': 0.7999999999999999,
  'number': 174,
  'precision': 0.7845303867403315,
  'recall': 0.8160919540229885},
 'LOCATION': {'f1': 0.9405776023270309,
  'number': 4788,
  'precision': 0.9357172385283175,
  'recall': 0.9454887218045113},
 'NAME': {'f1': 0.9579554822753503,
  'number': 605,
  'precision': 0.9555921052631579,
  'recall': 0.9603305785123967},
 'ORGANIZATION': {'f1': 0.8930817610062892,
  'number': 800,
  'precision': 0.8987341772151899,
  'recall': 0.8875},
 'PATIENT_ID': {'f1': 0.9821798590965602,
  'number': 2401,
  'precision': 0.977319587628866,
  'recall': 0.9870887130362349},
 'SYMPTOM_AND_DISEASE': {'f

In [22]:
from seqeval.metrics import classification_report

print(classification_report(true_predictions, true_labels, digits=3))

                     precision    recall  f1-score   support

                AGE      0.959     0.964     0.961       581
               DATE      0.992     0.986     0.989      1664
             GENDER      0.980     0.982     0.981       493
                JOB      0.816     0.785     0.800       181
           LOCATION      0.945     0.936     0.941      4838
               NAME      0.960     0.956     0.958       608
       ORGANIZATION      0.887     0.899     0.893       790
         PATIENT_ID      0.987     0.977     0.982      2425
SYMPTOM_AND_DISEASE      0.856     0.877     0.866      1136
     TRANSPORTATION      0.984     0.977     0.980       432

          micro avg      0.950     0.946     0.948     13148
          macro avg      0.937     0.934     0.935     13148
       weighted avg      0.950     0.946     0.948     13148

