In [None]:
!pip install evaluate seqeval
!pip install datasets==2.15.0
!pip install accelerate -U



In [None]:
from datasets import load_dataset
from transformers import (AutoTokenizer,
                          AutoModelForTokenClassification,
                          Trainer,
                          TrainingArguments,
                          DataCollatorForTokenClassification,
                         )
from collections import defaultdict

from tqdm import tqdm
from transformers.pipelines.pt_utils import KeyDataset
import torch
import os
import numpy as np
import evaluate

# Label scheme to train; the notebook branches on this value ('A' or 'B') below.
system = 'B'

# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# seqeval metric object used by the evaluation helpers, and the raw dataset.
metric = evaluate.load("seqeval")
dataset = load_dataset("Babelscape/multinerd")

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/20 [00:00<?, ?it/s]

In [None]:
def _is_english(example):
    """Return True for rows whose 'lang' field equals 'en'."""
    return example['lang'] == 'en'


# Print the language codes in the train split before and after keeping only 'en' rows.
print(np.unique(dataset['train']['lang']))
dataset = dataset.filter(_is_english)
print(np.unique(dataset['train']['lang']))

['de' 'en' 'es' 'fr' 'it' 'nl' 'pl' 'pt' 'ru' 'zh']
['en']


In [None]:
# Tag name -> integer id for the 'ner_tags' column (BIO scheme).
# "O" is 0; every entity type then takes two consecutive ids:
# B-<type> (odd id) immediately followed by I-<type> (even id).
ner_tags_dict = {
    "O": 0,
    "B-PER": 1, "I-PER": 2,
    "B-ORG": 3, "I-ORG": 4,
    "B-LOC": 5, "I-LOC": 6,
    "B-ANIM": 7, "I-ANIM": 8,
    "B-BIO": 9, "I-BIO": 10,
    "B-CEL": 11, "I-CEL": 12,
    "B-DIS": 13, "I-DIS": 14,
    "B-EVE": 15, "I-EVE": 16,
    "B-FOOD": 17, "I-FOOD": 18,
    "B-INST": 19, "I-INST": 20,
    "B-MEDIA": 21, "I-MEDIA": 22,
    "B-MYTH": 23, "I-MYTH": 24,
    "B-PLANT": 25, "I-PLANT": 26,
    "B-TIME": 27, "I-TIME": 28,
    "B-VEHI": 29, "I-VEHI": 30,
}

In [None]:
# Build the label vocabulary (label_list, id2label, label2id) for the selected
# system and, for system 'B', rewrite the dataset's 'ner_tags' to match it.
label_list = list(ner_tags_dict)
if system == 'A':
  # System A keeps every tag with its original id.
  id2label = dict(enumerate(label_list))
  label2id = {label: idx for idx, label in id2label.items()}
elif system == 'B':
  # System B keeps only these tags; every other tag is collapsed into 'O'.
  allowed_tags = ['O', 'B-PER', 'I-PER', 'B-ORG', 'I-ORG',
                        'B-LOC', 'I-LOC', 'B-ANIM', 'I-ANIM', 'B-DIS', 'I-DIS']

  # Original ids of the kept tags, taken in allowed_tags order so that the new
  # contiguous ids assigned below line up with label_list (which is also built
  # from allowed_tags) regardless of the ordering of ner_tags_dict.
  allowed_values = [ner_tags_dict[tag] for tag in allowed_tags]
  # original id -> new contiguous id (0 .. len(allowed_tags) - 1)
  tags_values = {old_id: new_id for new_id, old_id in enumerate(allowed_values)}
  # New id that tags outside allowed_tags are mapped to (the id of 'O').
  outside_id = tags_values[ner_tags_dict['O']]

  def replace_values(example):
    """Remap one example's 'ner_tags' from original ids to system-B ids.

    Ids of tags not listed in allowed_tags become the id of 'O'.
    Returns the same example dict with 'ner_tags' replaced.
    """
    example['ner_tags'] = [tags_values.get(val, outside_id) for val in example['ner_tags']]
    return example

  # NOTE: this reassigns `dataset` with remapped ids, so the cell is NOT
  # idempotent: running it a second time would treat the already-remapped ids
  # as original ids and corrupt the labels. Re-run from the dataset-loading
  # cell if this cell has to be executed again.
  dataset = dataset.map(replace_values)
  label_list = list(allowed_tags)
  id2label = dict(enumerate(label_list))
  label2id = {label: idx for idx, label in id2label.items()}
else:
  # Fail here rather than with a NameError on id2label/label2id further down.
  raise ValueError(f"Unknown system {system!r}; expected 'A' or 'B'")

Map:   0%|          | 0/262560 [00:00<?, ? examples/s]

Map:   0%|          | 0/32820 [00:00<?, ? examples/s]

Map:   0%|          | 0/32908 [00:00<?, ? examples/s]

In [None]:
# The same pretrained checkpoint name is used for both the model and the tokenizer.
model_name_or_path = tokenizer_name_or_path = 'distilbert-base-cased'

tokenizer = AutoTokenizer.from_pretrained(tokenizer_name_or_path)

# num_labels follows the label vocabulary; the id<->label maps are passed along
# so the model carries the tag names.
model = AutoModelForTokenClassification.from_pretrained(
    model_name_or_path,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)
# Last expression of the cell: moves the model to `device` and displays its repr.
model.to(device)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForTokenClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
    

In [None]:
def tokenize_and_align_labels(examples):
    """Tokenize a batch of pre-split sentences and align word tags to sub-tokens.

    Args:
        examples: batch with a "tokens" entry (one list of words per sentence)
            and a "ner_tags" entry (one list of tag ids per sentence).

    Returns:
        The tokenizer output for the batch with an added "labels" entry. For
        each sentence, the first sub-token of every word carries that word's
        tag id; special tokens and the remaining sub-tokens of a word get -100.
    """
    encoded = tokenizer(examples["tokens"], truncation=True, is_split_into_words=True)

    aligned = []
    for batch_idx, word_tags in enumerate(examples["ner_tags"]):
        token_labels = []
        last_word = None
        # word_ids() gives, per sub-token, the index of its source word
        # (None for special tokens).
        for current_word in encoded.word_ids(batch_index=batch_idx):
            if current_word is None or current_word == last_word:
                # Special token, or a continuation piece of the previous word.
                token_labels.append(-100)
            else:
                # First sub-token of a new word: take that word's tag.
                token_labels.append(word_tags[current_word])
            last_word = current_word
        aligned.append(token_labels)

    encoded["labels"] = aligned
    return encoded


def compute_metrics(p):
    """Compute overall seqeval scores for a batch of predictions.

    Args:
        p: a pair (predictions, labels); predictions are per-token scores that
           are arg-maxed over axis 2, labels are label ids with -100 marking
           positions to skip.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's overall_* results.
    """
    raw_scores, gold_rows = p
    pred_rows = np.argmax(raw_scores, axis=2)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_rows, gold_rows):
        # Keep only positions whose gold label is not the -100 marker.
        kept = [(pred_id, gold_id) for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
        true_predictions.append([label_list[pred_id] for pred_id, _ in kept])
        true_labels.append([label_list[gold_id] for _, gold_id in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Tokenize every split in batches. All original columns are dropped so that
# only what tokenize_and_align_labels returns remains.
original_columns = dataset["train"].column_names
tokenized_datasets = dataset.map(
    tokenize_and_align_labels, batched=True, remove_columns=original_columns
)

# Collator handed to the Trainer for building token-classification batches.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

Map:   0%|          | 0/262560 [00:00<?, ? examples/s]

Map:   0%|          | 0/32820 [00:00<?, ? examples/s]

Map:   0%|          | 0/32908 [00:00<?, ? examples/s]

In [None]:
# One output directory per system so the two runs do not overwrite each other.
if system == 'A':
    output_dir = "./distilbert-base-cased-system-A"
else:
    output_dir = "./distilbert-base-cased-system-B"

# Evaluate and save once per epoch; a single epoch is trained.
args = TrainingArguments(
    output_dir,
    num_train_epochs=1,
    learning_rate=2e-5,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    push_to_hub=False,
)

trainer = Trainer(
    model=model,
    args=args,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    compute_metrics=compute_metrics,
)
# Last expression of the cell: runs training and displays the TrainOutput.
trainer.train()

You're using a DistilBertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.0118,0.020739,0.949196,0.953687,0.951436,0.993341


TrainOutput(global_step=32820, training_loss=0.020948419783044195, metrics={'train_runtime': 2035.6516, 'train_samples_per_second': 128.981, 'train_steps_per_second': 16.123, 'total_flos': 3333200773424832.0, 'train_loss': 0.020948419783044195, 'epoch': 1.0})

In [None]:
def compute_inference_metrics(label, pred):
    """Compute overall seqeval scores for a single sequence.

    Args:
        label: gold label ids for one sequence; -100 marks positions to skip.
        pred: predicted label ids for the same sequence.

    Returns:
        Dict with "precision", "recall", "f1" and "accuracy", taken from the
        metric's overall_* results for this one sequence.
    """
    kept_predictions = []
    kept_labels = []
    # zip stops at the shorter of the two sequences.
    for pred_id, gold_id in zip(pred, label):
        if gold_id == -100:
            continue
        kept_predictions.append(label_list[pred_id])
        kept_labels.append(label_list[gold_id])

    # The metric expects a list of sequences, hence the single-element wrappers.
    results = metric.compute(predictions=[kept_predictions], references=[kept_labels])
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Reload the tokenizer and fine-tuned model from the most recent checkpoint.
# os.listdir() returns entries in arbitrary order and output_dir can contain
# non-checkpoint entries, so select the "checkpoint-<step>" directory with the
# highest step number explicitly instead of taking the last listed entry.
checkpoint_names = [
    name for name in os.listdir(output_dir)
    if name.startswith("checkpoint-") and name.rsplit("-", 1)[-1].isdigit()
]
if not checkpoint_names:
    raise FileNotFoundError(f"No checkpoint-<step> directory found in {output_dir}")
latest_checkpoint = os.path.join(
    output_dir, max(checkpoint_names, key=lambda name: int(name.rsplit("-", 1)[-1]))
)

tokenizer = AutoTokenizer.from_pretrained(latest_checkpoint)
model = AutoModelForTokenClassification.from_pretrained(latest_checkpoint).to(device)
scores = []
model = model.eval()

# Score the test split one example at a time (batch of size 1); the
# per-example metric dicts are collected in `scores`.
# NOTE(review): averaging per-example seqeval scores is not the same quantity
# as the corpus-level scores compute_metrics reports during training.
for inputs in tqdm(tokenized_datasets['test']):
  label = inputs['labels']
  with torch.no_grad():
      inputs = {'input_ids': torch.Tensor([inputs['input_ids']]).long().to(device),
                'attention_mask': torch.Tensor([inputs['attention_mask']]).long().to(device)}
      logits = model(**inputs).logits
      # Arg-max over the label axis, then drop the batch dimension.
      pred = np.argmax(logits.cpu().numpy(), axis = 2)[0]
  score = compute_inference_metrics(label, pred)
  scores.append(score)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  avg = a.mean(axis, **keepdims_kw)
  ret = ret.dtype.type(ret / rcount)
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
100%|██████████| 32908/32908 [06:17<00:00, 87.19it/s]


In [None]:
# Sum each metric over all per-example score dicts.
summed_values = defaultdict(int)

for d in scores:
    for key, value in d.items():
        summed_values[key] += value

# Convert the summed_values defaultdict to a regular dictionary
summed_dict = dict(summed_values)

# Mean of each metric over the examples that were actually scored. Dividing by
# len(scores) keeps the divisor tied to what was summed and avoids loading the
# whole 'labels' column of the test split just to count rows.
final_score = {key: total / len(scores) for key, total in summed_dict.items()}
final_score

{'precision': 0.8639391675923112,
 'recall': 0.8675558978722345,
 'f1': 0.8638551031245556,
 'accuracy': 0.9942459420054489}