In [1]:
! pip install -q transformers datasets evaluate seqeval


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
! pip install -q accelerate -U


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.1.2[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [3]:
from datasets import load_dataset
import evaluate
from transformers import DataCollatorForTokenClassification, Trainer, TrainingArguments
from transformers import AutoTokenizer, AutoModelForTokenClassification
import numpy as np
import itertools
import collections
import math

### Helper Functions

In [4]:
## Function to tokenize and re-align tokens with labels due to the model tokenizer special tokens added
def tokenize_align_labels(data):
    # tokenize inputs
    tokenized_inputs = tokenizer(data["tokens"], is_split_into_words=True, truncation=True)

    # For storing newly aligned label sequences for each tokenized input
    labels = []

    # iterate on each set of labels
    for i, label in enumerate(data["ner_tags"]):
        # map tokens to their words respectively
        word_ids = tokenized_inputs.word_ids(batch_index=i)

        # keep track of word boundaries
        previous_word_index = None

        # empty list to store new labels for the current sequence
        current_label_ids = []

        for word_index in word_ids:
            # if special token
            if word_index is None:
                current_label_ids.append(-100)
            # new word
            elif word_index != previous_word_index:
                current_label_ids.append(label[word_index])
            # sub token for the same word
            else:
                current_label_ids.append(-100)
            previous_word_index = word_index
        labels.append(current_label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [5]:
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    # (batch size, sequence length, labels percentage prediction)
    predictions = np.argmax(predictions, axis=2)
    #print(predictions)

    true_predictions = [
        [id2label[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [id2label[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }


## Loading Dataset

In [6]:
con_dataset = load_dataset("conll2003")
con_dataset

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 14041
    })
    validation: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3250
    })
    test: Dataset({
        features: ['id', 'tokens', 'pos_tags', 'chunk_tags', 'ner_tags'],
        num_rows: 3453
    })
})

### Exploring dataset

In [7]:
# Select ner_tags feature for analysis
ner_features = con_dataset['train']['ner_tags']
ner_features[0]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [8]:
# Create dictionary for getting tags for analysis while negkecting the tags with I-
tags = {'O': 0, 'B-PER': 1, 'xx': 2, 'B-ORG': 3, 'yy': 4, 'B-LOC': 5, 'zz': 6, 'B-MISC': 7, 'vv': 8}
tags = dict(zip(tags.values(), tags.keys()))

In [9]:
ner_feature_names = [[tags[index] for index in feature] for feature in ner_features]

In [10]:
ner_feature_names[0]

['B-ORG', 'O', 'B-MISC', 'O', 'O', 'O', 'B-MISC', 'O', 'O']

In [11]:
# Flatten feature names for count
dataset_names = itertools.chain.from_iterable(ner_feature_names)

# get count of names
collections.Counter(dataset_names)

Counter({'O': 169578,
         'B-LOC': 7140,
         'B-PER': 6600,
         'B-ORG': 6321,
         'xx': 4528,
         'yy': 3704,
         'B-MISC': 3438,
         'zz': 1157,
         'vv': 1155})

In [12]:
# Check for text lengths
text_length = []
text_feature = con_dataset['train']['tokens']
[text_length.append(len(text_tokens)) for text_tokens in text_feature]
print('maximum length of text', max(text_length))
print('minimum length of text', min(text_length))

maximum length of text 113
minimum length of text 1


### We have the following counts in dataset
#### location        : 7140
#### person          : 6600
#### organization    : 6321
#### miscellaneous   : 3438
#### maximum text len: 113
#### minimum text len: 1

## Model selection and preprocessing

In [13]:
model_name = 'distilbert/distilbert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

### Data preparation

In [14]:
# Since input is tokenzied we set split into words to true
tokenized_input = tokenizer(con_dataset["train"]["tokens"][0], is_split_into_words=True)
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])
tokens

['[CLS]',
 'eu',
 'rejects',
 'german',
 'call',
 'to',
 'boycott',
 'british',
 'lamb',
 '.',
 '[SEP]']

In [15]:
# tokenize and re-align tokens with labels due to the model tokenizer special tokens added
con_dataset = con_dataset.map(tokenize_align_labels, batched=True)

In [16]:
con_dataset["train"]["ner_tags"][0]

[3, 0, 7, 0, 0, 0, 7, 0, 0]

In [17]:
con_dataset["train"]["input_ids"][0]

[101, 7327, 19164, 2446, 2655, 2000, 17757, 2329, 12559, 1012, 102]

In [18]:
con_dataset["train"]["labels"][0]

[-100, 3, 0, 7, 0, 0, 0, 7, 0, 0, -100]

In [19]:
# initialize data collator for padding data
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [20]:
label2id = {'O': 0, 'B-PER': 1, 'I-PER': 2, 'B-ORG': 3, 'I-ORG': 4, 'B-LOC': 5, 'I-LOC': 6, 'B-MISC': 7, 'I-MISC': 8}
id2label = dict(zip(label2id.values(), label2id.keys()))

### Model Preparation

In [21]:
model = AutoModelForTokenClassification.from_pretrained(model_name, num_labels=len(label2id),
                                                        id2label=id2label, label2id=label2id)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [22]:
EPOCHS = 10
BATCH_SIZE = 16
warm_up = int(EPOCHS * len(con_dataset["train"]) * 0.1)
logging_steps = math.ceil(len(con_dataset["train"])/BATCH_SIZE)

In [23]:
training_args = TrainingArguments(output_dir="./distil_bert_ner",
                                  num_train_epochs=EPOCHS,
                                  warmup_steps=warm_up,
                                  per_device_train_batch_size=BATCH_SIZE,
                                  per_device_eval_batch_size=BATCH_SIZE,
                                  eval_strategy="epoch",
                                  save_strategy="epoch",
                                  logging_steps=logging_steps,
                                  weight_decay = 0.01,
                                  learning_rate=2e-5,
                                  metric_for_best_model="eval_loss",
                                  save_total_limit = 1,
                                  load_best_model_at_end = True,
                                )

In [24]:
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    train_dataset=con_dataset["train"],
    eval_dataset=con_dataset["validation"],
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [27]:
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,1.233,0.417926,0.445253,0.463312,0.454103,0.912172
2,0.2438,0.138686,0.77114,0.821104,0.795338,0.967077
3,0.1043,0.074919,0.885289,0.90138,0.893262,0.980141
4,0.0684,0.060287,0.902548,0.918041,0.910229,0.983198
5,0.0495,0.055209,0.917137,0.929485,0.92327,0.984853
6,0.0368,0.050989,0.921223,0.932851,0.927001,0.98604
7,0.027,0.053557,0.924466,0.939246,0.931797,0.986488
8,0.0207,0.054168,0.930023,0.939414,0.934695,0.986916
9,0.0152,0.059191,0.930569,0.940592,0.935554,0.987442
10,0.0119,0.064534,0.928892,0.940929,0.934872,0.987053


TrainOutput(global_step=8780, training_loss=0.18105949432268773, metrics={'train_runtime': 501.8521, 'train_samples_per_second': 279.784, 'train_steps_per_second': 17.495, 'total_flos': 1703092151653326.0, 'train_loss': 0.18105949432268773, 'epoch': 10.0})

In [26]:
#! rm -r distil_bert_ner

In [28]:
trainer.evaluate(con_dataset["test"])

{'eval_loss': 0.11326012760400772,
 'eval_precision': 0.8776753088567949,
 'eval_recall': 0.8930594900849859,
 'eval_f1': 0.8853005704256254,
 'eval_accuracy': 0.9773446753526435,
 'eval_runtime': 2.6168,
 'eval_samples_per_second': 1319.528,
 'eval_steps_per_second': 82.542,
 'epoch': 10.0}