# Ex05 NER

In [1]:
%pip install --upgrade accelerate



### !! NOTE: Please restart the runtime (kernel) now.  
After `accelerate` has been upgraded, the already-running Colab kernel does not pick up the new version, so the kernel must be restarted once. (Only restart it; please do not disconnect the runtime.)

In [2]:
import copy
import dataclasses

import nltk
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F

In [3]:
%pip install datasets
import datasets



## DATA

In [4]:
from datasets import load_dataset

# Load the first 6,000 training sentences of the German ("de") configuration.
dataset = load_dataset(
    path="polyglot_ner",
    name="de",
    split="train[:6000]",
)

>  Choose one language to work with from that dataset. The following conditions need to hold for the language

Chosen language: German (`de`, about 548k sentences); only the first 6,000 training sentences are loaded above.

In [302]:
# Load model directly
from transformers import AutoTokenizer, BertForTokenClassification, BertTokenizerFast

# Tokenizer and token-classification model come from the same German BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path="bert-base-german-cased",
)
# num_labels=5: the classification head produces five scores per token.
model = BertForTokenClassification.from_pretrained(
    pretrained_model_name_or_path="bert-base-german-cased",
    num_labels=5,
)


Some weights of the model checkpoint at bert-base-german-cased were not used when initializing BertForTokenClassification: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-b

In [6]:
# check unique labels
# Flatten every sentence's tag list once and deduplicate with a single
# np.unique call, instead of invoking np.unique separately for each token.
r = [tag for sentence_tags in dataset['ner'] for tag in sentence_tags]
print(np.unique(r))

['LOC' 'O' 'ORG' 'PER']


In [7]:
# Label vocabulary stored on the model config. Index 0 ('PAD') is an extra
# label that is used to pad the label sequences.
id2label = {0: 'PAD', 1: 'LOC', 2: 'O', 3: 'ORG', 4: 'PER'}
# Reverse lookup: label name -> integer id.
label2id = {label_name: label_id for label_id, label_name in id2label.items()}
model.config.id2label = id2label
model.config.label2id = label2id
id2label, label2id, model.config.id2label, model.config.label2id

({0: 'PAD', 1: 'LOC', 2: 'O', 3: 'ORG', 4: 'PER'},
 {'PAD': 0, 'LOC': 1, 'O': 2, 'ORG': 3, 'PER': 4},
 {0: 'PAD', 1: 'LOC', 2: 'O', 3: 'ORG', 4: 'PER'},
 {'PAD': 0, 'LOC': 1, 'O': 2, 'ORG': 3, 'PER': 4})

In [8]:
# Encode the string tags of every sentence as integer ids and store them in a
# new 'label' column.
dataset = dataset.add_column(
    name='label',
    column=[
        [label2id[tag] for tag in sentence_tags]
        for sentence_tags in dataset['ner']
    ],
)

In [9]:
# convert the pre-split words into model inputs (input_ids, attention_mask)
def tokenize(data):
    """Run the module-level ``tokenizer`` on pre-split sentences.

    Args:
        data: mapping with a ``'words'`` entry; it is passed to the tokenizer
            as already-split words (``is_split_into_words=True``).

    Returns:
        Whatever ``tokenizer`` returns for the given words, with every
        sequence truncated or padded to 128 positions.

    NOTE(review): only ``data['words']`` is read here; the word-level
    ``'label'`` column is not re-aligned to the tokenizer output. Confirm that
    label positions still line up with token positions downstream.
    """
    encode_options = dict(
        is_split_into_words=True,
        truncation=True,
        padding='max_length',
        max_length=128,
        return_attention_mask=True,
        return_tensors='pt',
    )
    return tokenizer(data['words'], **encode_options)
# Tokenize the whole dataset in batches.
dataset = dataset.map(function=tokenize, batched=True)

In [11]:
# adjust label encodings to the same length
def pad_labels(data, max_length=128, pad_id=0):
    """Truncate or right-pad ``data['label']`` to exactly ``max_length`` ids.

    Args:
        data: mapping with a ``'label'`` sequence of integer label ids. It is
            updated in place.
        max_length: target length; the default of 128 matches the
            ``max_length`` used in ``tokenize``.
        pad_id: id appended to short sequences; the default 0 is the 'PAD'
            label.

    Returns:
        The same ``data`` mapping, with ``'label'`` replaced by a list of
        length ``max_length``.
    """
    # list(...) so that tuples or arrays of ids are accepted as well as lists.
    labels = list(data['label'][:max_length])
    data['label'] = labels + [pad_id] * (max_length - len(labels))
    return data

# Apply pad_labels to every example, one row at a time.
dataset = dataset.map(function=pad_labels)

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

In [12]:
# make sure all data samples have the same length:
# each array should contain exactly one distinct length.
(
    np.unique(list(map(len, dataset['input_ids']))),
    np.unique(list(map(len, dataset['label']))),
)

(array([128]), array([128]))

## Part 1

In [13]:
from datasets import Dataset
# Splits used by the three experiments below:
# 1,000 training sentences, 3,000 training sentences, 2,000 evaluation sentences.
train_1k = Dataset.from_dict(dataset[:1000])
train_3k = Dataset.from_dict(dataset[1000:4000])
eval_2k = Dataset.from_dict(dataset[4000:6000])

### Fine-tuned with 1,000 sentences

In [15]:
# Run name; it is interpolated into output_dir when training_args is built.
task = 'Version-1000'

In [16]:

from sklearn.metrics import f1_score, accuracy_score


In [17]:
# define the f1_micro, f1_macro and accuracy metrics
def compute_metrics(eval_pred):
    """Score token-level predictions while ignoring padded label positions.

    Args:
        eval_pred: pair ``(logits, labels)``. ``logits`` carries one score per
            class along its last axis; ``labels`` holds the integer label ids
            and has the shape of ``logits`` without that last axis.

    Returns:
        dict with the keys ``'f1_micro'``, ``'f1_macro'`` and ``'accuracy'``,
        computed over all positions whose label is not a padding value.
    """
    logits, labels = eval_pred
    labels = np.asarray(labels)
    predictions = np.argmax(logits, axis=-1)

    # Select positions with a boolean mask so that each prediction is paired
    # with the label at the same position. Slicing the first len(y_true)
    # predictions is only correct when every padded label is trailing.
    # 0 is the 'PAD' label; -100 is the conventional ignore index, excluded
    # too in case labels are ever masked that way.
    keep = (labels != 0) & (labels != -100)
    y_true = labels[keep]
    y_pred = predictions[keep]

    return {
        'f1_micro': f1_score(y_true, y_pred, average='micro'),
        'f1_macro': f1_score(y_true, y_pred, average='macro'),
        'accuracy': accuracy_score(y_true, y_pred),
    }

In [18]:
# choose device
# torch.cuda.is_available() checks for a usable GPU at runtime, whereas
# torch.backends.cuda.is_built() only says PyTorch was compiled with CUDA
# support, so set_device could fail on a machine without a GPU.
if torch.cuda.is_available():
    device = "cuda:0"
    torch.cuda.set_device(device)
else:
    device = "cpu"


In [19]:
from transformers import TrainingArguments, Trainer, DataCollatorForTokenClassification

# Training configuration shared by the fine-tuning runs below.
training_args = TrainingArguments(
    # --- run mode and output location ---
    output_dir=f"./results/{task}",
    overwrite_output_dir=True,
    do_train=True,
    do_eval=True,
    # --- optimisation ---
    num_train_epochs=3,
    per_device_train_batch_size=8,
    learning_rate=5e-5,
    # --- evaluate, log and checkpoint on the same 50-step cadence ---
    evaluation_strategy="steps",
    eval_steps=50,
    logging_steps=50,
    save_steps=50,
    save_total_limit=5,
    load_best_model_at_end=True,
    # --- reporting ---
    logging_dir="./logs",
    report_to="tensorboard",
)

In [20]:
# Train a deep copy so the shared initial `model` object is left untouched
# and can be reused as the starting point of the other runs.
trainer = Trainer(
    model=copy.deepcopy(model),
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_1k,
    eval_dataset=eval_2k,
    compute_metrics=compute_metrics,
)

In [21]:
# Fine-tune on train_1k (the trainer was built with eval_2k as its evaluation set).
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,Accuracy
50,0.195,0.070929,0.893904,0.189072,0.893904
100,0.0736,0.06348,0.893318,0.259725,0.893318
150,0.0524,0.057637,0.896331,0.308275,0.896331
200,0.0486,0.056499,0.915999,0.304018,0.915999
250,0.0419,0.050377,0.909332,0.274644,0.909332
300,0.0327,0.052407,0.909778,0.3203,0.909778
350,0.0268,0.052344,0.90869,0.335643,0.90869


TrainOutput(global_step=375, training_loss=0.06464618968963623, metrics={'train_runtime': 235.4371, 'train_samples_per_second': 12.742, 'train_steps_per_second': 1.593, 'total_flos': 195977882880000.0, 'train_loss': 0.06464618968963623, 'epoch': 3.0})

In [22]:
# Run the trained model over the evaluation split and keep the raw outputs.
outputs = trainer.predict(test_dataset=eval_2k)

In [23]:
# Final scores of the 1,000-sentence run on eval_2k.
compute_metrics((outputs.predictions, outputs.label_ids))

{'f1_micro': 0.9093318454456689,
 'f1_macro': 0.2746443834543513,
 'accuracy': 0.9093318454456688}

### Fine-tuned with 3,000 sentences

In [None]:
task = 'Version-3000'
# `training_args` was created while `task` still held the first run's name, so
# reassigning `task` alone does not change its output_dir. Rebuild the
# arguments so this run writes its checkpoints to its own directory.
training_args = dataclasses.replace(training_args, output_dir=f"./results/{task}")

In [24]:
# Train a fresh deep copy of the initial `model`, this time on train_3k.
trainer = Trainer(
    model=copy.deepcopy(model),
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_3k,
    eval_dataset=eval_2k,
    compute_metrics=compute_metrics,
)

In [25]:
# Fine-tune on train_3k (the trainer was built with eval_2k as its evaluation set).
trainer.train()

Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,Accuracy
50,0.1943,0.074621,0.916585,0.191407,0.916585
100,0.0711,0.062298,0.89608,0.214839,0.89608
150,0.0638,0.055763,0.908858,0.213396,0.908858
200,0.0517,0.050621,0.909332,0.240253,0.909332
250,0.0593,0.052458,0.918482,0.259189,0.918482
300,0.0473,0.050202,0.915637,0.332634,0.915637
350,0.0466,0.045403,0.905036,0.295236,0.905036
400,0.0402,0.050094,0.881239,0.344633,0.881239
450,0.0377,0.045806,0.918817,0.382363,0.918817
500,0.0369,0.042476,0.914019,0.346649,0.914019


TrainOutput(global_step=1125, training_loss=0.04410023824373881, metrics={'train_runtime': 683.1194, 'train_samples_per_second': 13.175, 'train_steps_per_second': 1.647, 'total_flos': 587933648640000.0, 'train_loss': 0.04410023824373881, 'epoch': 3.0})

In [26]:
# Run the trained model over the evaluation split and keep the raw outputs.
outputs = trainer.predict(test_dataset=eval_2k)

In [27]:
# Final scores of the 3,000-sentence run on eval_2k.
compute_metrics((outputs.predictions, outputs.label_ids))

{'f1_micro': 0.9201004324173525,
 'f1_macro': 0.41321798818924504,
 'accuracy': 0.9201004324173525}

### Fine-tuned with 3,000 sentences and frozen embeddings

In [None]:
task = 'Version-3000-frozen'
# `training_args` was created before this reassignment, so its output_dir does
# not follow `task` automatically. Rebuild the arguments so this run writes
# its checkpoints to its own directory.
training_args = dataclasses.replace(training_args, output_dir=f"./results/{task}")

In [28]:
# Start from a fresh deep copy of the initial model.
frozen_model = copy.deepcopy(model)

# Freeze the embeddings: turn off gradients for every parameter of the
# base model's embedding module.
embedding_weights = frozen_model.base_model.embeddings.parameters()
for weight in embedding_weights:
    weight.requires_grad = False

In [29]:
# Train the copy with frozen embeddings on train_3k.
trainer = Trainer(
    model=frozen_model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=train_3k,
    eval_dataset=eval_2k,
    compute_metrics=compute_metrics,
)

In [30]:
# Fine-tune the frozen-embedding model on train_3k (evaluation set: eval_2k).
trainer.train()

Step,Training Loss,Validation Loss,F1 Micro,F1 Macro,Accuracy
50,0.1947,0.071764,0.914074,0.191555,0.914074
100,0.0705,0.061466,0.889748,0.220857,0.889748
150,0.0628,0.055021,0.905008,0.223296,0.905008
200,0.0519,0.050029,0.914577,0.234236,0.914577
250,0.0609,0.051193,0.918064,0.260204,0.918064
300,0.0448,0.048844,0.903417,0.304763,0.903417
350,0.0459,0.044736,0.908049,0.28399,0.908049
400,0.0399,0.046062,0.897057,0.337447,0.897057
450,0.0368,0.042502,0.911982,0.372316,0.911982
500,0.0365,0.044496,0.910922,0.337136,0.910922


TrainOutput(global_step=1125, training_loss=0.04414169340663486, metrics={'train_runtime': 658.0318, 'train_samples_per_second': 13.677, 'train_steps_per_second': 1.71, 'total_flos': 587933648640000.0, 'train_loss': 0.04414169340663486, 'epoch': 3.0})

In [31]:
# Run the trained model over the evaluation split and keep the raw outputs.
outputs = trainer.predict(test_dataset=eval_2k)

In [32]:
# Final scores of the frozen-embedding run on eval_2k.
compute_metrics((outputs.predictions, outputs.label_ids))

{'f1_micro': 0.9203236155670247,
 'f1_macro': 0.4123807323168641,
 'accuracy': 0.9203236155670247}