In [1]:
!nvidia-smi

Sat Jun  4 07:47:15 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 460.32.03    Driver Version: 460.32.03    CUDA Version: 11.2     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla V100-SXM2...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   35C    P0    23W / 300W |      0MiB / 16160MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Proces

In [2]:
!pip install -q transformers
!pip install -q datasets
!pip install -q seqeval

In [3]:
import os
import sys

import pandas as pd
import numpy as np
import datasets
from datasets import Dataset
from tqdm.notebook import tqdm
from nltk.corpus.reader import ConllCorpusReader

import transformers
from transformers import AutoModelForTokenClassification, AutoTokenizer
from transformers import Trainer, TrainingArguments
from transformers import DataCollatorForTokenClassification

from datasets import Features, Sequence, ClassLabel, Value, DatasetDict
from datasets import load_metric

In [4]:
labels = ['B-PATIENT_ID', 'I-PATIENT_ID',
          'B-NAME', 'I-NAME',
          'B-AGE', 'I-AGE',
          'B-GENDER',
          'B-JOB', 'I-JOB',
          'B-LOCATION', 'I-LOCATION',
          'B-ORGANIZATION', 'I-ORGANIZATION',
          'B-SYMPTOM_AND_DISEASE', 'I-SYMPTOM_AND_DISEASE',
          'B-TRANSPORTATION', 'I-TRANSPORTATION',
          'B-DATE', 'I-DATE',
          'O']

id2label = {v: k for v, k in enumerate(labels)}
label2id = {k: v for v, k in enumerate(labels)}
id2label

{0: 'B-PATIENT_ID',
 1: 'I-PATIENT_ID',
 2: 'B-NAME',
 3: 'I-NAME',
 4: 'B-AGE',
 5: 'I-AGE',
 6: 'B-GENDER',
 7: 'B-JOB',
 8: 'I-JOB',
 9: 'B-LOCATION',
 10: 'I-LOCATION',
 11: 'B-ORGANIZATION',
 12: 'I-ORGANIZATION',
 13: 'B-SYMPTOM_AND_DISEASE',
 14: 'I-SYMPTOM_AND_DISEASE',
 15: 'B-TRANSPORTATION',
 16: 'I-TRANSPORTATION',
 17: 'B-DATE',
 18: 'I-DATE',
 19: 'O'}

In [5]:
len(labels)

20

In [6]:
tokenizer = AutoTokenizer.from_pretrained("vinai/phobert-base")

def get_dict_datasets(tagged_sents):
    dict_datasets = {"texts": [], "input_ids": [], "attention_mask": [], "ner_tags": []}
    
    for tagged_sent in tqdm(tagged_sents):
        input_ids = []
        attention_mask = []
        ner_tags = []
        for tagged_word in tagged_sent:
            tokenized_word_input = tokenizer(tagged_word[0], add_special_tokens=False)
            input_ids.extend(tokenized_word_input["input_ids"])
            attention_mask.extend(tokenized_word_input["attention_mask"])
            ner_tags.extend([label2id[tagged_word[1]] for i in range(len(tokenized_word_input["input_ids"]))])
        
        input_ids = [tokenizer.bos_token_id] + input_ids + [tokenizer.eos_token_id]
        attention_mask = [1] + attention_mask + [1]
        ner_tags = [-100] + ner_tags + [-100]

        assert len(input_ids) == len(attention_mask)
        assert len(input_ids) == len(ner_tags)

        dict_datasets["texts"].append([tagged_word[0] for tagged_word in tagged_sent])
        dict_datasets["input_ids"].append(input_ids)
        dict_datasets["attention_mask"].append(attention_mask)
        dict_datasets["ner_tags"].append(ner_tags)

    return dict_datasets

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [7]:
def get_datasets(root='/content/drive/MyDrive/NLP/learning/NLP-K31/dataset/PhoNER_COVID19/data/word/'):
    
    train_corpus_reader = ConllCorpusReader(root=root, fileids=['train_word.conll'], columntypes=["words", "pos"])
    val_corpus_reader = ConllCorpusReader(root=root, fileids=['dev_word.conll'], columntypes=["words", "pos"])
    test_corpus_reader = ConllCorpusReader(root=root, fileids=['test_word.conll'], columntypes=["words", "pos"])

    train_tagged_sents = train_corpus_reader.tagged_sents()
    val_tagged_sents = val_corpus_reader.tagged_sents()
    test_tagged_sents = test_corpus_reader.tagged_sents()

    dict_train_datasets = get_dict_datasets(train_tagged_sents)
    dict_val_datasets = get_dict_datasets(val_tagged_sents)
    dict_test_datasets = get_dict_datasets(test_tagged_sents)

    train_datasets = Dataset.from_dict(dict_train_datasets)
    val_datasets = Dataset.from_dict(dict_val_datasets)
    test_datasets = Dataset.from_dict(dict_test_datasets)

    datasets = DatasetDict({"train": train_datasets, "val": val_datasets, "test": test_datasets})

    features = Features({
    'input_ids': Sequence(Value(dtype='int64')),
    'attention_mask': Sequence(Value(dtype='int64')),
    'ner_tags': Sequence(ClassLabel(names=labels))
    })

    datasets = datasets.map(remove_columns=['texts'], features=features)
    
    return datasets

In [8]:
datasets = get_datasets()
datasets = datasets.rename_column("ner_tags", "labels")

  0%|          | 0/5027 [00:00<?, ?it/s]

  0%|          | 0/2000 [00:00<?, ?it/s]

  0%|          | 0/3000 [00:00<?, ?it/s]

  0%|          | 0/5027 [00:00<?, ?ex/s]

  0%|          | 0/2000 [00:00<?, ?ex/s]

  0%|          | 0/3000 [00:00<?, ?ex/s]

In [9]:
datasets

DatasetDict({
    train: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 5027
    })
    val: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 2000
    })
    test: Dataset({
        features: ['input_ids', 'attention_mask', 'labels'],
        num_rows: 3000
    })
})

In [10]:
label_list = datasets["train"].features["labels"].feature.names
label_list

['B-PATIENT_ID',
 'I-PATIENT_ID',
 'B-NAME',
 'I-NAME',
 'B-AGE',
 'I-AGE',
 'B-GENDER',
 'B-JOB',
 'I-JOB',
 'B-LOCATION',
 'I-LOCATION',
 'B-ORGANIZATION',
 'I-ORGANIZATION',
 'B-SYMPTOM_AND_DISEASE',
 'I-SYMPTOM_AND_DISEASE',
 'B-TRANSPORTATION',
 'I-TRANSPORTATION',
 'B-DATE',
 'I-DATE',
 'O']

In [11]:
print(datasets["train"][0]["input_ids"])

[0, 1248, 4, 757, 194, 112, 9, 717, 2137, 3795, 9089, 6232, 1927, 31, 1195, 63, 1010, 7, 125, 1059, 5, 2]


In [12]:
print(datasets["train"][0]["attention_mask"])

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]


In [13]:
print(datasets["train"][0]["labels"])

[-100, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 19, 11, 12, 19, -100]


In [14]:
model = AutoModelForTokenClassification.from_pretrained("vinai/phobert-base", num_labels=len(label_list))
model

Some weights of the model checkpoint at vinai/phobert-base were not used when initializing RobertaForTokenClassification: ['lm_head.layer_norm.weight', 'lm_head.decoder.weight', 'lm_head.dense.weight', 'lm_head.bias', 'lm_head.decoder.bias', 'lm_head.dense.bias', 'lm_head.layer_norm.bias']
- This IS expected if you are initializing RobertaForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of RobertaForTokenClassification were not initialized from the model checkpoint at vinai/phobert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this mo

RobertaForTokenClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(64001, 768, padding_idx=1)
      (position_embeddings): Embedding(258, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0): RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm

In [15]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [16]:
metric = load_metric("seqeval")
metric.compute(predictions=[labels], references=[labels])

{'AGE': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'DATE': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'GENDER': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'JOB': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'LOCATION': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'NAME': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'ORGANIZATION': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'PATIENT_ID': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'SYMPTOM_AND_DISEASE': {'f1': 1.0,
  'number': 1,
  'precision': 1.0,
  'recall': 1.0},
 'TRANSPORTATION': {'f1': 1.0, 'number': 1, 'precision': 1.0, 'recall': 1.0},
 'overall_accuracy': 1.0,
 'overall_f1': 1.0,
 'overall_precision': 1.0,
 'overall_recall': 1.0}

In [17]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    # Remove ignored index (special tokens)
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"]
    }

In [18]:
args = TrainingArguments(
    "/content/test",
    overwrite_output_dir=True,
    num_train_epochs=15,
    evaluation_strategy = "epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    weight_decay=0.01,
    logging_steps=100,
    metric_for_best_model="f1",
    load_best_model_at_end=True,
)

In [19]:
trainer = Trainer(
    model=model,
    args=args,
    data_collator=data_collator,
    tokenizer=tokenizer,
    train_dataset=datasets['train'],
    eval_dataset=datasets['val'],
    compute_metrics=compute_metrics
)

trainer.train()

***** Running training *****
  Num examples = 5027
  Num Epochs = 15
  Instantaneous batch size per device = 64
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 1
  Total optimization steps = 1185


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,No log,0.174977,0.874686,0.888308,0.881444,0.966085
2,0.481600,0.109232,0.917346,0.930405,0.92383,0.976738
3,0.080400,0.09037,0.924887,0.942329,0.933526,0.979199
4,0.048800,0.087201,0.929068,0.943424,0.936191,0.980666
5,0.048800,0.089661,0.941987,0.948291,0.945128,0.981441
6,0.034300,0.089706,0.943471,0.950359,0.946903,0.98134
7,0.024300,0.087466,0.94667,0.954617,0.950627,0.982166
8,0.020200,0.087515,0.94564,0.956686,0.951131,0.983144
9,0.016100,0.094141,0.946083,0.952184,0.949124,0.982149
10,0.016100,0.088245,0.948823,0.956442,0.952618,0.983059


***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
  _warn_prf(average, modifier, msg_start, len(result))
Saving model checkpoint to /content/test/checkpoint-79
Configuration saved in /content/test/checkpoint-79/config.json
Model weights saved in /content/test/checkpoint-79/pytorch_model.bin
tokenizer config file saved in /content/test/checkpoint-79/tokenizer_config.json
Special tokens file saved in /content/test/checkpoint-79/special_tokens_map.json
added tokens file saved in /content/test/checkpoint-79/added_tokens.json
***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64
Saving model checkpoint to /content/test/checkpoint-158
Configuration saved in /content/test/checkpoint-158/config.json
Model weights saved in /content/test/checkpoint-158/pytorch_model.bin
tokenizer config file saved in /content/test/checkpoint-158/tokenizer_config.json
Special tokens file saved in /content/test/checkpoint-158/special_tokens_map.json
added tokens file saved in 

TrainOutput(global_step=1185, training_loss=0.06427608698229247, metrics={'train_runtime': 495.0326, 'train_samples_per_second': 152.323, 'train_steps_per_second': 2.394, 'total_flos': 2893491843534840.0, 'train_loss': 0.06427608698229247, 'epoch': 15.0})

In [20]:
trainer.evaluate()

***** Running Evaluation *****
  Num examples = 2000
  Batch size = 64


{'epoch': 15.0,
 'eval_accuracy': 0.9830594184576485,
 'eval_f1': 0.95261754726127,
 'eval_loss': 0.08824455738067627,
 'eval_precision': 0.9488231744115873,
 'eval_recall': 0.9564423895851076,
 'eval_runtime': 5.193,
 'eval_samples_per_second': 385.134,
 'eval_steps_per_second': 6.162}

In [21]:
predictions, labels, _ = trainer.predict(datasets['test'])
predictions = np.argmax(predictions, axis=2)

# Remove ignored index (special tokens)
true_predictions = [
    [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label_list[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results= metric.compute(predictions=true_predictions, references=true_labels)
results

***** Running Prediction *****
  Num examples = 3000
  Batch size = 64


{'AGE': {'f1': 0.9631533847472151,
  'number': 584,
  'precision': 0.9639794168096055,
  'recall': 0.9623287671232876},
 'DATE': {'f1': 0.9849759615384616,
  'number': 1655,
  'precision': 0.9796772265391512,
  'recall': 0.9903323262839879},
 'GENDER': {'f1': 0.9727547931382443,
  'number': 494,
  'precision': 0.9698189134808853,
  'recall': 0.9757085020242915},
 'JOB': {'f1': 0.7976878612716762,
  'number': 174,
  'precision': 0.8023255813953488,
  'recall': 0.7931034482758621},
 'LOCATION': {'f1': 0.9332082942586226,
  'number': 4788,
  'precision': 0.9311707215637347,
  'recall': 0.9352548036758563},
 'NAME': {'f1': 0.9432387312186978,
  'number': 605,
  'precision': 0.9527824620573356,
  'recall': 0.9338842975206612},
 'ORGANIZATION': {'f1': 0.886687306501548,
  'number': 800,
  'precision': 0.8785276073619632,
  'recall': 0.895},
 'PATIENT_ID': {'f1': 0.9825798423890503,
  'number': 2401,
  'precision': 0.9785212722015696,
  'recall': 0.9866722199083715},
 'SYMPTOM_AND_DISEASE': {

In [22]:
from seqeval.metrics import classification_report

print(classification_report(true_predictions, true_labels, digits=3))

                     precision    recall  f1-score   support

                AGE      0.962     0.964     0.963       583
               DATE      0.990     0.980     0.985      1673
             GENDER      0.976     0.970     0.973       497
                JOB      0.793     0.802     0.798       172
           LOCATION      0.935     0.931     0.933      4809
               NAME      0.934     0.953     0.943       593
       ORGANIZATION      0.895     0.879     0.887       815
         PATIENT_ID      0.987     0.979     0.983      2421
SYMPTOM_AND_DISEASE      0.878     0.874     0.876      1169
     TRANSPORTATION      0.977     0.957     0.967       438

          micro avg      0.946     0.941     0.943     13170
          macro avg      0.933     0.929     0.931     13170
       weighted avg      0.946     0.941     0.944     13170

