In [None]:
#Uncomment this cell if you have not already installed these libraries.
#!pip install -q seqeval
#!pip install -q transformers
#!pip install -q datasets
#!pip install -U accelerate
#!pip install -U transformers
#!pip install torch torchvision torchaudio
#!pip install torchinfo
# For GPU runs, install transformers together with its torch extra:
#!pip install transformers[torch]

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [2]:
def read_file(file_path):
    """Parse a blank-line-delimited token file into nested lists.

    Every non-blank line is split on whitespace into a list of column
    strings (one token row). Consecutive token rows form one sentence; any
    blank or whitespace-only line ends the current sentence.

    Unlike a plain ``split("\\n\\n")``, this never yields empty token rows or
    empty sentences, so an empty file, repeated blank lines, or a stray
    whitespace-only line cannot produce rows that break later column
    indexing (e.g. ``row[2]``).

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of sentences, each a list of token rows, each a list of
        column strings. Returns ``[]`` when the file has no token lines.
    """
    data = []
    token_data = []
    with open(file_path, "r", encoding="utf8") as f:
        # Stream line by line instead of loading the whole file at once.
        for line in f:
            columns = line.split()
            if columns:
                token_data.append(columns)
            elif token_data:
                # Blank line: close the sentence collected so far.
                data.append(token_data)
                token_data = []
    # The last sentence may not be followed by a blank line.
    if token_data:
        data.append(token_data)
    return data

# Load the training split and report how many sentences it contains.
train_data = read_file("train_en.tsv")
print(len(train_data))

# Collect the distinct values found in the third column of every token row,
# sorted so that each label always receives the same integer id.
label_list = sorted({row[2] for sent in train_data for row in sent})
label_map = dict(zip(label_list, range(len(label_list))))
label_list

131280


['B-ANIM',
 'B-BIO',
 'B-CEL',
 'B-DIS',
 'B-EVE',
 'B-FOOD',
 'B-INST',
 'B-LOC',
 'B-MEDIA',
 'B-MYTH',
 'B-ORG',
 'B-PER',
 'B-PLANT',
 'B-TIME',
 'B-VEHI',
 'I-ANIM',
 'I-BIO',
 'I-CEL',
 'I-DIS',
 'I-EVE',
 'I-FOOD',
 'I-INST',
 'I-LOC',
 'I-MEDIA',
 'I-MYTH',
 'I-ORG',
 'I-PER',
 'I-PLANT',
 'I-TIME',
 'I-VEHI',
 'O']

# Tokenizer and Model

In [3]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align word-level tags to sub-tokens.

    Uses the module-level ``tokenizer``. For each example, the first
    sub-token of every word receives that word's tag; special tokens,
    padding, and the remaining sub-tokens of a word receive ``-100``.

    Args:
        examples: Batch mapping with ``"tokens"`` (lists of words) and
            ``"ner_tags"`` (per-word tags, indexable by word position).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one
        aligned label list per example.
    """
    encoded = tokenizer(
        examples["tokens"],
        is_split_into_words=True,
        truncation=True,
        padding=True,
    )

    def _align(batch_idx, word_tags):
        # Walk the sub-tokens of one example, tagging only word starts.
        aligned = []
        last_word = None
        for word_id in encoded.word_ids(batch_index=batch_idx):
            starts_word = word_id is not None and word_id != last_word
            aligned.append(word_tags[word_id] if starts_word else -100)
            last_word = word_id
        return aligned

    encoded["labels"] = [
        _align(idx, tags) for idx, tags in enumerate(examples["ner_tags"])
    ]
    return encoded

In [4]:
# torchinfo is only used for the parameter summary at the end of this cell.
from torchinfo import summary

# Directory of the saved checkpoint, relative to the notebook's working directory.
model_name = 'Models/Best model distillbert after 10 epochs - SystemA'

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
)

# Sanity check: show the layer tree and parameter counts of the loaded model.
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForTokenClassification                        --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              22,268,928
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Dropout: 1-2                                          --
├─Linear: 1-3                                           23,839
Total params: 65,214,751
Trainable params: 65,214,751
Non-trainable params: 0

In [7]:
from transformers import pipeline

# Sample sentence for a quick qualitative check of the loaded model.
# The "\t" after "His" and the trailing space are kept exactly as in the
# original sample string.
sentence = "His\tfather was a surveyor and tavern owner who became close friends with William Henry Harrison while the two served together in the War of 1812. "

nlp = pipeline('ner', model=model, tokenizer=tokenizer)
res = nlp(sentence)

# Resolve each predicted label string to its class index through the model
# config instead of parsing the placeholder "LABEL_<n>" text: that parsing
# raises IndexError as soon as the checkpoint stores real label names.
for item in res:
    entity_id = int(model.config.label2id[item['entity']])
    tag = label_list[entity_id]
    print(f"{item['word']: <10}  :  {tag: <10}")


His         :  O         
father      :  O         
was         :  O         
a           :  O         
surveyor    :  O         
and         :  O         
tavern      :  O         
owner       :  O         
who         :  O         
became      :  O         
close       :  O         
friends     :  O         
with        :  O         
William     :  B-PER     
Henry       :  I-PER     
Harrison    :  I-PER     
while       :  O         
the         :  O         
two         :  O         
served      :  O         
together    :  O         
in          :  O         
the         :  O         
War         :  B-EVE     
of          :  I-EVE     
1812        :  O         
.           :  O         
