In [1]:
#Uncomment this cell if you have not already installed these libraries.
#!pip install -q seqeval
#!pip install -q transformers
#!pip install -q datasets
#!pip install -U accelerate
#!pip install -U transformers
#pip install torch torchvision torchaudio
#!pip install torchinfo
#!pip install transformers[torch]  # needed for running on GPU

In [13]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [14]:
def read_file(file_path):
    """Read a CoNLL-style, blank-line-separated token file.

    Each non-blank line is one token row and is split on whitespace into its
    fields. A blank (or whitespace-only) line ends the current sentence;
    consecutive blank lines and leading/trailing blank lines are ignored, so
    no empty sentences or empty token rows are ever produced.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of sentences. Each sentence is a list of token rows, and each
        token row is the list of whitespace-separated fields of one line.
        An empty file yields an empty list.
    """
    data = []
    token_data = []
    with open(file_path, "r", encoding="utf8") as f:
        # Stream the file line by line instead of loading it as one string.
        for line in f:
            fields = line.split()
            if fields:
                token_data.append(fields)
            elif token_data:
                # Blank line: close the sentence collected so far.
                data.append(token_data)
                token_data = []
    # The last sentence may not be followed by a blank line.
    if token_data:
        data.append(token_data)
    return data

In [15]:
# Parse the English training split into a list of sentences of token rows.
train_data = read_file(file_path="train_en.tsv")
#validation_data = read_file("dev_en.tsv")
#test_data = read_file("test_en.tsv")
# Note: the test data has only 30 labels; it is missing 'I-BIO' compared to
# the training and validation sets.

In [16]:
# Number of sentences parsed from the training file.
n_sentences = len(train_data)
print(n_sentences)


131280


In [17]:
# Keep only five types of tags; every other entity tag is collapsed to 'O'.
List_New = ['PER', 'ORG', 'LOC', 'DIS', 'ANIM']
def Data_Processing(data, keep_types=None):
    """Collapse tags of unwanted entity types to 'O', in place.

    The tag is read from index 2 of every token row and split on '-'. When
    the split yields at least two parts and the second part (the entity
    type) is not one of the kept types, the tag is overwritten with 'O'.
    Tags without a '-' (such as 'O') are left untouched.

    Args:
        data: List of sentences; each sentence is a list of mutable token
            rows whose element at index 2 is the tag string.
        keep_types: Optional iterable of entity-type names to keep. Defaults
            to the module-level ``List_New``.

    Returns:
        The same ``data`` object that was passed in. The rows are modified
        in place, so the caller's original list reflects the changes too.

    Raises:
        IndexError: If a token row has fewer than three fields.
    """
    allowed = set(List_New if keep_types is None else keep_types)
    for sentence in data:
        for row in sentence:
            parts = row[2].split('-')
            if len(parts) >= 2 and parts[1] not in allowed:
                row[2] = 'O'
    return data

# Data_Processing rewrites the tags in place and returns its argument, so
# train_dataset and train_data refer to the same (filtered) list.
train_dataset = Data_Processing(data=train_data)


In [18]:
# Sentence count after tag filtering (only tags are rewritten, so it is unchanged).
n_sentences = len(train_data)
print(n_sentences)


131280


In [19]:
def convert_to_dataset(data, label_map):
    """Build a ``Dataset`` with ``id``, ``tokens`` and ``ner_tags`` columns.

    Args:
        data: Iterable of sentences; each sentence is a sequence of token
            rows where index 1 holds the token text and index 2 the tag.
        label_map: Mapping from tag string to its integer id.

    Returns:
        The result of ``Dataset.from_dict`` with one record per sentence:
        a running id starting at 0, the list of token strings, and the list
        of tag ids looked up in ``label_map``.
    """
    sentence_ids = []
    token_column = []
    tag_column = []
    for sentence_id, sentence in enumerate(data):
        words = [row[1] for row in sentence]
        tag_ids = [label_map[row[2]] for row in sentence]
        sentence_ids.append(sentence_id)
        token_column.append(words)
        tag_column.append(tag_ids)
    columns = {"id": sentence_ids, "tokens": token_column, "ner_tags": tag_column}
    return Dataset.from_dict(columns)

In [20]:
# Distinct tags (index 2 of every token row) found in the training data,
# in sorted order; label_map assigns each tag its position in that order.
label_list = sorted({row[2] for sent in train_data for row in sent})
label_map = dict(zip(label_list, range(len(label_list))))
label_list

['B-ANIM',
 'B-DIS',
 'B-LOC',
 'B-ORG',
 'B-PER',
 'I-ANIM',
 'I-DIS',
 'I-LOC',
 'I-ORG',
 'I-PER',
 'O']

# Tokenizer and Model

In [21]:
from torchinfo import summary

# Checkpoint to load (presumably a local directory of a fine-tuned model).
model_name = 'Models/Best model distillbert after 10 epochs - SystemB'
tokenizer = AutoTokenizer.from_pretrained(model_name)
# The classification head size must match the number of tags in label_list.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
)
# Show the layer / parameter overview to double-check the loaded model.
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForTokenClassification                        --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              22,268,928
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Dropout: 1-2                                          --
├─Linear: 1-3                                           8,459
Total params: 65,199,371
Trainable params: 65,199,371
Non-trainable params: 0

In [22]:
from transformers import pipeline

# Example input. The "\t" after "His" is the tab character that the string
# already contained; it is written as an escape so it is visible.
sentence = "His\tfather was a surveyor and tavern owner who became close friends with William Henry Harrison while the two served together in the War of 1812. "
nlp = pipeline('ner', model=model, tokenizer=tokenizer)
res = nlp(sentence)
for item in res:
    word = item['word']
    entity = item['entity']
    # A checkpoint without a custom id2label mapping reports generic names of
    # the form "LABEL_<index>"; translate those through label_list. Any other
    # value is assumed to already be the tag name and is printed unchanged.
    if entity.startswith('LABEL_'):
        entity = label_list[int(entity.split('_')[1])]
    # Both columns are left-aligned and padded to 10 characters.
    print(f"{word: <10}  :  {entity: <10}")


His         :  O         
father      :  O         
was         :  O         
a           :  O         
surveyor    :  O         
and         :  O         
tavern      :  O         
owner       :  O         
who         :  O         
became      :  O         
close       :  O         
friends     :  O         
with        :  O         
William     :  B-PER     
Henry       :  I-PER     
Harrison    :  I-PER     
while       :  O         
the         :  O         
two         :  O         
served      :  O         
together    :  O         
in          :  O         
the         :  O         
War         :  O         
of          :  O         
1812        :  O         
.           :  O         
