In [1]:
from typing import List
import torch
import torch.nn as nn
import numpy as np

## 1. Data Preparation

In [2]:
from datasets import load_dataset

dataset = load_dataset("conll2003")

In [3]:
dataset = dataset.remove_columns(["id", "pos_tags", "chunk_tags"])
dataset

DatasetDict({
    train: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['tokens', 'ner_tags'],
        num_rows: 3453
    })
})

In [4]:
dataset_train = dataset["train"]
dataset_val = dataset["validation"]
dataset_test = dataset["test"]

In [5]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
MAX_LEN = 113

In [6]:
from torch.utils.data import Dataset

class NER_Dataset(Dataset):
    def __init__(self, dataset, tokenizer):
        super().__init__()
        self.tokens = dataset["tokens"]
        self.labels = dataset["ner_tags"]
        self.tokenizer = tokenizer
        self.max_len = MAX_LEN 

    def __len__(self):
        return len(self.tokens)
    
    def __getitem__(self, idx):
        input_token = self.tokens[idx]
        label_token = self.labels[idx]

        input_token = self.tokenizer.convert_tokens_to_ids(input_token)
        attention_mask = [1] * len(input_token)

        input_ids = self.pad_and_truncate(input_token, pad_id= self.tokenizer.pad_token_id)
        labels = self.pad_and_truncate(label_token, pad_id=0)
        attention_mask =  self.pad_and_truncate(attention_mask, pad_id=0)

        return {
            "input_ids": torch.as_tensor(input_ids), 
            "labels": torch.as_tensor(labels),
            "attention_mask": torch.as_tensor(attention_mask)
            }
    
    def pad_and_truncate(self, inputs: List[int], pad_id: int): 
        if len(inputs) < self.max_len:
            padded_inputs = inputs + [pad_id] * (self.max_len - len(inputs))
        else:
            padded_inputs = inputs[:self.max_len]
        return padded_inputs

In [7]:
train_set = NER_Dataset(dataset_train, tokenizer)
val_set = NER_Dataset(dataset_val, tokenizer)
test_set = NER_Dataset(dataset_test, tokenizer)

## 2. Model

In [8]:
from transformers.models.bert.configuration_bert import BertConfig

label2id = {
    'O': 0, 
    'B-PER': 1, 
    'I-PER': 2, 
    'B-ORG': 3, 
    'I-ORG': 4, 
    'B-LOC': 5, 
    'I-LOC': 6, 
    'B-MISC': 7, 
    'I-MISC': 8,
}
id2label = {v:k for k, v in label2id.items()}

config = BertConfig(label2id=label2id, id2label=id2label)
config

BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "classifier_dropout": null,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "id2label": {
    "0": "O",
    "1": "B-PER",
    "2": "I-PER",
    "3": "B-ORG",
    "4": "I-ORG",
    "5": "B-LOC",
    "6": "I-LOC",
    "7": "B-MISC",
    "8": "I-MISC"
  },
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "B-LOC": 5,
    "B-MISC": 7,
    "B-ORG": 3,
    "B-PER": 1,
    "I-LOC": 6,
    "I-MISC": 8,
    "I-ORG": 4,
    "I-PER": 2,
    "O": 0
  },
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.36.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [9]:
config.num_labels

9

In [10]:
from transformers.models.bert import BertForTokenClassification

model = BertForTokenClassification(config)
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, el

## 3. Training

In [11]:
import evaluate

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    mask = labels != 0
    predictions = np.argmax(predictions, axis=-1)
    return accuracy.compute(predictions=predictions[mask], references=labels[mask])

In [12]:
from transformers import TrainingArguments, Trainer

training_args = TrainingArguments(
    output_dir="out_dir",
    learning_rate=1e-4,
    per_device_train_batch_size=96,
    per_device_eval_batch_size=96,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    optim="adamw_torch"
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_set,
    eval_dataset=val_set,
    tokenizer = tokenizer,
    compute_metrics=compute_metrics,
)

trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.062119,0.2262
2,No log,0.053621,0.347204
3,No log,0.049447,0.404975
4,0.056300,0.047207,0.465884
5,0.056300,0.044351,0.461351
6,0.056300,0.046207,0.507265
7,0.030700,0.048774,0.538998
8,0.030700,0.047174,0.530978
9,0.030700,0.050366,0.547948
10,0.030700,0.051116,0.546786


TrainOutput(global_step=1470, training_loss=0.03625525260458187, metrics={'train_runtime': 1533.6302, 'train_samples_per_second': 91.554, 'train_steps_per_second': 0.959, 'total_flos': 8097818252099580.0, 'train_loss': 0.03625525260458187, 'epoch': 10.0})

## 4. Testing

In [13]:
trainer.evaluate(test_set)

{'eval_loss': 0.04072681441903114,
 'eval_accuracy': 0.4680562407498767,
 'eval_runtime': 11.8965,
 'eval_samples_per_second': 290.254,
 'eval_steps_per_second': 3.026,
 'epoch': 10.0}

## 5. Test sample

In [14]:
# test_sentence = "[UNK] rejects [UNK] call to boycott [UNK] lamb."
test_sentence = "France won the World Cup in Russia in 2018"
inputs = tokenizer(test_sentence, return_tensors="pt", add_special_tokens=False)  # Use the function as training data
inputs

{'input_ids': tensor([[2605, 2180, 1996, 2088, 2452, 1999, 3607, 1999, 2760]]), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1]])}

In [16]:
for key, value in inputs.items():
    inputs[key] = inputs[key].to("cuda")

In [17]:
outputs = model(**inputs)
outputs.logits.shape

torch.Size([1, 9, 9])

In [18]:
_, preds = torch.max(outputs.logits, -1)
preds = preds[0].cpu().numpy()
preds

array([0, 0, 0, 0, 0, 0, 0, 0, 0])

In [19]:
pred_tags = ""

for pred in preds:
    pred_tags += id2label[pred] + " "

pred_tags

'O O O O O O O O O '