In [None]:
# Use the %pip magic so the packages are installed into the environment of the running kernel.
%pip install -q -U evaluate accelerate

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m3.1 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m270.9/270.9 kB[0m [31m9.7 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m507.1/507.1 kB[0m [31m13.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m14.0 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.5 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from typing import List
import numpy as np
import torch
import evaluate
from sklearn.model_selection import train_test_split

In [None]:
import nltk
nltk.download('treebank')

[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Unzipping corpora/treebank.zip.


True

## 1. Load pre-trained checkpoints

In [None]:
from transformers import AutoTokenizer, AutoModelForTokenClassification

# Identifier of the pre-trained checkpoint to start from.
model_name = "QCRI/bert-base-multilingual-cased-pos-english"

# Load the tokenizer and the token-classification model stored under that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

Some weights of the model checkpoint at QCRI/bert-base-multilingual-cased-pos-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
model

BertForTokenClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(119547, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, e

## 2. Data Preparation

### 2.1 Load data from nltk corpus

In [None]:
#BERT => POS En => Domain

In [None]:
# Tagged sentences of the NLTK treebank corpus; each sentence is a sequence of (word, tag) pairs.
tagged_sentences = nltk.corpus.treebank.tagged_sents()

print("Number of samples:", len(tagged_sentences))

Number of samples: 3914


In [None]:
tagged_sentences[0]

[('Pierre', 'NNP'),
 ('Vinken', 'NNP'),
 (',', ','),
 ('61', 'CD'),
 ('years', 'NNS'),
 ('old', 'JJ'),
 (',', ','),
 ('will', 'MD'),
 ('join', 'VB'),
 ('the', 'DT'),
 ('board', 'NN'),
 ('as', 'IN'),
 ('a', 'DT'),
 ('nonexecutive', 'JJ'),
 ('director', 'NN'),
 ('Nov.', 'NNP'),
 ('29', 'CD'),
 ('.', '.')]

In [None]:
# Unzip every tagged sentence into a (words, tags) pair of tuples, then
# collect the words and the tags into two parallel lists.
unzipped = [tuple(zip(*pairs)) for pairs in tagged_sentences]
sentences = [words for words, _ in unzipped]
sentence_tags = [tags for _, tags in unzipped]

In [None]:
sentences[0], sentence_tags[0]

(('Pierre',
  'Vinken',
  ',',
  '61',
  'years',
  'old',
  ',',
  'will',
  'join',
  'the',
  'board',
  'as',
  'a',
  'nonexecutive',
  'director',
  'Nov.',
  '29',
  '.'),
 ('NNP',
  'NNP',
  ',',
  'CD',
  'NNS',
  'JJ',
  ',',
  'MD',
  'VB',
  'DT',
  'NN',
  'IN',
  'DT',
  'JJ',
  'NN',
  'NNP',
  'CD',
  '.'))

### 2.2 Load dictionary to encode label from model config

In [None]:
from collections import defaultdict

In [None]:
model.config.label2id

{'#': 7,
 '$': 6,
 "''": 5,
 ',': 2,
 '-LRB-': 17,
 '-RRB-': 32,
 '.': 4,
 ':': 3,
 'CC': 8,
 'CD': 9,
 'DT': 10,
 'EX': 11,
 'FW': 12,
 'IN': 13,
 'JJ': 14,
 'JJR': 15,
 'JJS': 16,
 'LS': 18,
 'MD': 19,
 'NN': 20,
 'NNP': 21,
 'NNPS': 22,
 'NNS': 23,
 'O': 0,
 'PDT': 24,
 'POS': 25,
 'PRP': 26,
 'PRP$': 27,
 'RB': 28,
 'RBR': 29,
 'RBS': 30,
 'RP': 31,
 'SYM': 33,
 'TO': 34,
 'UH': 35,
 'VB': 36,
 'VBD': 37,
 'VBG': 38,
 'VBN': 39,
 'VBP': 40,
 'VBZ': 41,
 'WDT': 42,
 'WP': 43,
 'WP$': 44,
 'WRB': 45,
 '``': 1}

In [None]:
# Start from the checkpoint's tag -> id map. Because this is a defaultdict(int),
# looking up a tag that is not in the map returns 0 (the id of 'O' in the
# config) and also inserts that tag as a new key.
label2id = defaultdict(int)
label2id.update(model.config.label2id)

# Reverse map (id -> tag), built from the entries present right now.
id2label = dict(zip(label2id.values(), label2id.keys()))

In [None]:
id2label

{7: '#',
 6: '$',
 5: "''",
 2: ',',
 17: '-LRB-',
 32: '-RRB-',
 4: '.',
 3: ':',
 8: 'CC',
 9: 'CD',
 10: 'DT',
 11: 'EX',
 12: 'FW',
 13: 'IN',
 14: 'JJ',
 15: 'JJR',
 16: 'JJS',
 18: 'LS',
 19: 'MD',
 20: 'NN',
 21: 'NNP',
 22: 'NNPS',
 23: 'NNS',
 0: 'O',
 24: 'PDT',
 25: 'POS',
 26: 'PRP',
 27: 'PRP$',
 28: 'RB',
 29: 'RBR',
 30: 'RBS',
 31: 'RP',
 33: 'SYM',
 34: 'TO',
 35: 'UH',
 36: 'VB',
 37: 'VBD',
 38: 'VBG',
 39: 'VBN',
 40: 'VBP',
 41: 'VBZ',
 42: 'WDT',
 43: 'WP',
 44: 'WP$',
 45: 'WRB',
 1: '``'}

### 2.3 Split dataset into train, val and test sets

In [None]:
# Keep 70% of the sentences for training; the remaining 30% is held out and
# divided into validation and test sets in the next step.
# A fixed random_state makes the split identical on every Restart & Run All.
train_sentences, test_sentences, train_tags, test_tags = train_test_split(
    sentences,
    sentence_tags,
    test_size=0.3,
    random_state=42,
)

In [None]:
# Divide the held-out portion evenly into validation and test sets.
# A fixed random_state makes the split identical on every Restart & Run All.
# NOTE: this overwrites test_sentences / test_tags, so re-running this cell on
# its own halves the held-out data again; re-run the previous split first.
valid_sentences, test_sentences, valid_tags, test_tags = train_test_split(
    test_sentences,
    test_tags,
    test_size=0.5,
    random_state=42,
)

### 2.4 Build dataset

In [None]:
from torch.utils.data import Dataset

MAX_LEN = max([len(sentence) for sentence in train_sentences])

class PosTagging_Dataset(Dataset):
    """Fixed-length POS-tagging samples built from pre-split sentences.

    Every word is looked up directly in the tokenizer vocabulary with
    ``convert_tokens_to_ids`` (word-based tokenization: no sub-word splitting
    and no special tokens are added), so each word keeps exactly one tag.

    Args:
        sentences: One sequence of words per sample.
        tags: One sequence of tag strings per sample, aligned with ``sentences``.
        tokenizer: Object providing ``convert_tokens_to_ids`` and ``pad_token_id``.
        label2id: Mapping from tag string to integer label id.
        max_len: Length every returned tensor is padded or truncated to.
        label_pad_id: Label id written at padded positions. Defaults to
            ``label2id["O"]``. Pass ``-100`` (the default ``ignore_index`` of
            PyTorch's cross-entropy loss) to keep padding out of the loss.
    """

    def __init__(self,
                 sentences: List[List[str]],
                 tags: List[List[str]],
                 tokenizer,
                 label2id,
                 max_len=MAX_LEN,
                 label_pad_id=None
                ):
        super().__init__()
        self.sentences = sentences
        self.tags = tags
        self.max_len = max_len
        self.tokenizer = tokenizer
        self.label2id = label2id
        # Resolve the padding label once, using the letter "O" tag (not the digit "0").
        self.label_pad_id = label2id["O"] if label_pad_id is None else label_pad_id

    def __len__(self):
        """Return the number of sentences in the dataset."""
        return len(self.sentences)

    def __getitem__(self, idx):
        """Return ``input_ids``, ``attention_mask`` and ``labels`` tensors of length ``max_len``."""
        words = self.sentences[idx]
        tags = self.tags[idx]

        # Word-based tokenization: one vocabulary id per word.
        input_ids = self.tokenizer.convert_tokens_to_ids(list(words))
        attention_mask = [1] * len(input_ids)
        # Encode the gold tags; these (not the attention mask) are the training targets.
        labels = [self.label2id[tag] for tag in tags]

        return {
            "input_ids": self.pad_and_truncate(input_ids, pad_id=self.tokenizer.pad_token_id),
            "attention_mask": self.pad_and_truncate(attention_mask, pad_id=0),
            "labels": self.pad_and_truncate(labels, pad_id=self.label_pad_id)
        }

    def pad_and_truncate(self, inputs: List[int], pad_id: int):
        """Right-pad ``inputs`` with ``pad_id`` or cut it so that it has exactly ``max_len`` items.

        Returns:
            A tensor created from the resulting list with ``torch.as_tensor``.
        """
        if len(inputs) < self.max_len:
            pad_inputs = inputs + [pad_id]*(self.max_len - len(inputs))
        else:
            pad_inputs = inputs[:self.max_len]
        return torch.as_tensor(pad_inputs)

In [None]:
sentences[0]

('Pierre',
 'Vinken',
 ',',
 '61',
 'years',
 'old',
 ',',
 'will',
 'join',
 'the',
 'board',
 'as',
 'a',
 'nonexecutive',
 'director',
 'Nov.',
 '29',
 '.')

### 2.5 Dataset loader

In [None]:
# Wrap each split in a dataset; all three share the same tokenizer and label map
# and use the dataset's default max_len.
train_dataset = PosTagging_Dataset(train_sentences, train_tags, tokenizer, label2id)
val_dataset = PosTagging_Dataset(valid_sentences, valid_tags, tokenizer, label2id)
test_dataset = PosTagging_Dataset(test_sentences, test_tags, tokenizer, label2id)

## 3. Train model

In [None]:
accuracy = evaluate.load("accuracy")
# Sentinel label id excluded from the accuracy computation.
# NOTE: label2id is a defaultdict, so looking up unknown tags adds keys and
# changes its length; this value is the size of the map when this cell runs.
ignore_label = len(label2id)

def compute_metrics(eval_pred):
    """Compute token-level accuracy for one evaluation pass.

    Args:
        eval_pred: Pair ``(logits, labels)``. The class scores are expected on
            the last axis of ``logits``; ``labels`` holds the gold label ids
            with one entry per position.

    Returns:
        The result of ``accuracy.compute`` over all positions whose label is
        neither ``ignore_label`` nor ``-100``.
    """
    logits, labels = eval_pred
    # The predicted label is the highest-scoring class on the last axis.
    predictions = np.argmax(logits, axis=-1)
    # Drop positions marked as ignored: the notebook's own sentinel and -100,
    # the default ignore index of PyTorch's cross-entropy loss.
    mask = (labels != ignore_label) & (labels != -100)
    return accuracy.compute(predictions=predictions[mask], references=labels[mask])

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

In [None]:
from transformers import TrainingArguments, Trainer

# Fine-tuning hyper-parameters. Evaluation and checkpointing both happen once
# per epoch, and the best checkpoint is reloaded when training finishes.
training_args = TrainingArguments(
    output_dir="out_dir",  # directory the Trainer writes its outputs to
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True
)

# Trainer wiring: trains on train_dataset, evaluates on val_dataset and reports
# the accuracy returned by compute_metrics at each evaluation.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    tokenizer = tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.044003,0.987358
2,No log,0.035222,0.989684
3,0.145200,0.0316,0.990709
4,0.145200,0.029464,0.991331
5,0.145200,0.028457,0.991608
6,0.029700,0.02755,0.991859
7,0.029700,0.026936,0.992067
8,0.029700,0.026884,0.992142
9,0.023700,0.02668,0.992155
10,0.023700,0.026557,0.992224


TrainOutput(global_step=1720, training_loss=0.060516599167224974, metrics={'train_runtime': 1775.7326, 'train_samples_per_second': 15.425, 'train_steps_per_second': 0.969, 'total_flos': 3789641345256360.0, 'train_loss': 0.060516599167224974, 'epoch': 10.0})

In [None]:
# NOTE(review): this repeats the previous training cell on the same `trainer`.
# If both cells run in one session, this call keeps fine-tuning the already
# trained `model` for another 10 epochs — confirm whether that is intended.
trainer.train()

You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.051634,0.985287
2,No log,0.04128,0.987875
3,0.150500,0.037712,0.988907
4,0.150500,0.03571,0.989532
5,0.150500,0.034171,0.989985
6,0.030800,0.03348,0.990071
7,0.030800,0.033291,0.990371
8,0.030800,0.032909,0.990437
9,0.024600,0.032835,0.990484
10,0.024600,0.032753,0.990457


TrainOutput(global_step=1720, training_loss=0.06279984684877618, metrics={'train_runtime': 807.5753, 'train_samples_per_second': 33.916, 'train_steps_per_second': 2.13, 'total_flos': 3579882599208960.0, 'train_loss': 0.06279984684877618, 'epoch': 10.0})

## 4. Test model

In [None]:
trainer.evaluate(test_dataset)

{'eval_loss': 0.0303813386708498,
 'eval_accuracy': 0.9913397093154604,
 'eval_runtime': 11.421,
 'eval_samples_per_second': 51.484,
 'eval_steps_per_second': 3.24,
 'epoch': 10.0}

## 5. Inference

In [None]:
test_sentence = "We are exploring the topic of deep learning"

In [None]:
test_sentence.split()

['We', 'are', 'exploring', 'the', 'topic', 'of', 'deep', 'learning']

In [None]:
tokenizer.tokenize("we are")

['we', 'are']

In [None]:
tokenizer(["We are exploring the topic of deep learning"])

{'input_ids': [[101, 12865, 10301, 11419, 79893, 10105, 57680, 10108, 26591, 26901, 102]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [None]:
# Encode the sentence the same way the dataset does: split on whitespace and look
# each word up in the vocabulary, wrapped in a list to form a batch of one.
# NOTE: `input` shadows the Python builtin of the same name; it is kept because
# the following cells reference this variable.
input = torch.as_tensor([tokenizer.convert_tokens_to_ids(test_sentence.split())])

In [None]:
input

tensor([[12865, 10301,   100, 10105, 57680, 10108, 26591, 26901]])

In [None]:
# Inference only: switch off dropout, skip gradient tracking, and move the ids to
# the device the model lives on (the Trainer may have moved it to a GPU).
model.eval()
with torch.no_grad():
    output = model(input.to(model.device))

In [None]:
output.logits.shape

torch.Size([1, 8, 46])

In [None]:
# torch.max over the last (label) dimension returns (max values, indices);
# only the indices are kept, as the predicted label id for each word.
_, preds = torch.max(output.logits, -1)

In [None]:
preds

tensor([[ 4, 40,  1, 10, 20, 13, 14, 20]])

In [None]:
id2label[4]

'.'