In [9]:
from typing import List
import numpy as np
import torch
import evaluate
from sklearn.model_selection import train_test_split
import nltk

nltk.download("treebank")

[nltk_data] Downloading package treebank to /home/hongong/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [53]:
# tokenization
from transformers import (
    AutoTokenizer,
    AutoModelForTokenClassification,
    Trainer,
    TrainingArguments,
)
from torch.utils.data import Dataset

# Load dataset

In [10]:
# load tree bank dataset
tagged_sentences = nltk.corpus.treebank.tagged_sents()
print("Number of samples: ", len(tagged_sentences))

Number of samples:  3914


In [2]:
tagged_sentences[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [8]:
# save sentences and tags
sentences, sentence_tags = [], []
for tagged_sentence in tagged_sentences:
    sentence, tags = zip(*tagged_sentence)
    sentences.append([word.lower() for word in sentence])
    sentence_tags.append([tag for tag in tags])

# Preprocessing

In [11]:
train_sentences, test_sentences, train_tags, test_tags = train_test_split(
    sentences, sentence_tags, test_size=0.3
)
valid_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    test_sentences, test_tags, test_size=0.5
)

In [13]:
# build dataset
model_name = "QCRI/bert-base-multilingual-cased-pos-english"

tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
MAX_LEN = 256

In [40]:
class PosTaggingDataset(Dataset):
    def __init__(
        self,
        sentences: List[List[str]],
        tags: List[List[str]],
        tokenizer,
        label2id: dict,
        max_len=MAX_LEN,
    ):
        self.sentences = sentences
        self.tags = tags
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.label2id = label2id

    def __len__(self):
        return len(self.sentences)

    def __getitem__(self, idx):
        input_token = self.sentences[idx]
        label_token = self.tags[idx]

        input_token = self.tokenizer.convert_tokens_to_ids(input_token)
        attention_mask = [1] * len(input_token)
        labels = [self.label2id[token] for token in label_token]

        return {
            "input_ids": self.pad_and_truncate(
                input_token, pad_id=self.tokenizer.pad_token_id
            ),
            "labels": self.pad_and_truncate(labels, pad_id=self.label2id["0"]),
            "attention_mask": self.pad_and_truncate(attention_mask, pad_id=0),
        }

    def pad_and_truncate(self, inputs: List[int], pad_id: int):
        if len(inputs) < self.max_len:
            padded_inputs = inputs + [pad_id] * (self.max_len - len(inputs))
        else:
            padded_inputs = inputs[: self.max_len]
        return torch.as_tensor(padded_inputs)

In [41]:
def build_label2id(tokens: List[List[str]]):
    label2id = {}
    id_counter = 0
    for token in [token for sublist in tokens for token in sublist]:
        if token not in label2id:
            label2id[token] = id_counter
            id_counter += 1
    return label2id

In [42]:
label2id = build_label2id(sentence_tags)
print(len(label2id))
label2id

46


{'NNP': 0,
 ',': 1,
 'CD': 2,
 'NNS': 3,
 'JJ': 4,
 'MD': 5,
 'VB': 6,
 'DT': 7,
 'NN': 8,
 'IN': 9,
 '.': 10,
 'VBZ': 11,
 'VBG': 12,
 'CC': 13,
 'VBD': 14,
 'VBN': 15,
 '-NONE-': 16,
 'RB': 17,
 'TO': 18,
 'PRP': 19,
 'RBR': 20,
 'WDT': 21,
 'VBP': 22,
 'RP': 23,
 'PRP$': 24,
 'JJS': 25,
 'POS': 26,
 '``': 27,
 'EX': 28,
 "''": 29,
 'WP': 30,
 ':': 31,
 'JJR': 32,
 'WRB': 33,
 '$': 34,
 'NNPS': 35,
 'WP$': 36,
 '-LRB-': 37,
 '-RRB-': 38,
 'PDT': 39,
 'RBS': 40,
 'FW': 41,
 'UH': 42,
 'SYM': 43,
 'LS': 44,
 '#': 45}

In [43]:
label2id["0"] = -100

In [56]:
id2label = {v: k for k, v in label2id.items()}
id2label

{0: 'NNP',
 1: ',',
 2: 'CD',
 3: 'NNS',
 4: 'JJ',
 5: 'MD',
 6: 'VB',
 7: 'DT',
 8: 'NN',
 9: 'IN',
 10: '.',
 11: 'VBZ',
 12: 'VBG',
 13: 'CC',
 14: 'VBD',
 15: 'VBN',
 16: '-NONE-',
 17: 'RB',
 18: 'TO',
 19: 'PRP',
 20: 'RBR',
 21: 'WDT',
 22: 'VBP',
 23: 'RP',
 24: 'PRP$',
 25: 'JJS',
 26: 'POS',
 27: '``',
 28: 'EX',
 29: "''",
 30: 'WP',
 31: ':',
 32: 'JJR',
 33: 'WRB',
 34: '$',
 35: 'NNPS',
 36: 'WP$',
 37: '-LRB-',
 38: '-RRB-',
 39: 'PDT',
 40: 'RBS',
 41: 'FW',
 42: 'UH',
 43: 'SYM',
 44: 'LS',
 45: '#',
 -100: '0'}

In [44]:
train_dataset = PosTaggingDataset(train_sentences, train_tags, tokenizer, label2id)
valid_dataset = PosTaggingDataset(valid_sentences, valid_tags, tokenizer, label2id)
test_dataset = PosTaggingDataset(test_sentences, test_tags, tokenizer, label2id)

In [45]:
sample_data = next(iter(train_dataset))
sample_data.keys()

dict_keys(['input_ids', 'labels', 'attention_mask'])

In [47]:
sample_data["input_ids"]

tensor([10261, 10379,   100, 10189, 45809, 26686, 10309,   100,   100,   100,
          119,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0, 

In [48]:
sample_data["labels"]

tensor([  19,   17,   14,    9,    4,    3,   14,   17,   15,   16,   10, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100, -100,
        -100, -100, -100, -100, -100, -1

In [49]:
sample_data["attention_mask"]

tensor([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])

# Modeling

In [51]:
model = AutoModelForTokenClassification.from_pretrained(model_name)
model

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-1

# Metric

In [52]:
accuracy = evaluate.load("accuracy")
ignore_label = len(label2id)


def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    mask = labels != ignore_label
    predictions = np.argmax(predictions, axis=-1)
    return accuracy.compute(predictions=predictions[mask], references=labels[mask])

# Training

In [54]:
training_args = TrainingArguments(
    output_dir="out_dir",
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

  trainer = Trainer(


In [55]:
%%time
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.614863,0.081559
2,No log,0.446493,0.086204
3,0.983500,0.393634,0.087475
4,0.983500,0.369075,0.088167
5,0.983500,0.35722,0.088606
6,0.323600,0.350754,0.088666
7,0.323600,0.346659,0.088899
8,0.323600,0.343067,0.089145
9,0.256800,0.344454,0.089025
10,0.256800,0.343828,0.089072


CPU times: user 19min 30s, sys: 13.2 s, total: 19min 43s
Wall time: 19min 44s


TrainOutput(global_step=1720, training_loss=0.4843847385672636, metrics={'train_runtime': 1184.2223, 'train_samples_per_second': 23.129, 'train_steps_per_second': 1.452, 'total_flos': 3579882599208960.0, 'train_loss': 0.4843847385672636, 'epoch': 10.0})

# Inference

In [57]:
# tokenization
test_sentence = "We are exploring the topic of deep learning"
input = torch.as_tensor([tokenizer.convert_tokens_to_ids(test_sentence.split())])
input = input.to("cuda")

# prediction
outputs = model(input)
_, preds = torch.max(outputs.logits, -1)
preds = preds[0].cpu().numpy()

# decode
pred_tags = ""
for pred in preds:
    pred_tags += id2label[pred] + " "
print(pred_tags)

NNS VBP RB DT NN IN JJ NN 
