In [1]:
#Uncomment this cell if you have not already installed these libraries.
#!pip install -q seqeval
#!pip install -q transformers
#!pip install -q datasets
#!pip install -U accelerate
#!pip install -U transformers
#!pip install torchinfo
#!pip install "transformers[torch]"  # for running on GPU

In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [3]:
def read_file(file_path):
    """Read a token-per-line file into a list of sentences.

    Each non-blank line is one token whose fields are separated by
    whitespace; sentences are separated by one or more blank
    (empty or whitespace-only) lines.

    Args:
        file_path: Path of the UTF-8 encoded file to read.

    Returns:
        A list of sentences. Each sentence is a list of tokens and each
        token is the list of whitespace-separated fields found on its line.
        An empty or whitespace-only file yields an empty list.
    """
    data = []
    token_data = []
    with open(file_path, "r", encoding="utf8") as f:
        # Stream the file line by line so it never has to be held in memory
        # as one big string.
        for line in f:
            fields = line.split()
            if fields:
                token_data.append(fields)
            elif token_data:
                # A blank line closes the current sentence. Consecutive blank
                # lines are ignored so they never produce empty sentences or
                # empty tokens (which would break later `token[2]` lookups).
                data.append(token_data)
                token_data = []
    # The last sentence is not necessarily followed by a blank line.
    if token_data:
        data.append(token_data)
    return data

In [16]:
# Load the training, validation (dev) and test splits with read_file.
# Note: the test split contains only 30 labels; 'I-BIO' is missing
# compared with the training and validation splits.
train_data = read_file(file_path="train_en.tsv")
validation_data = read_file(file_path="dev_en.tsv")
test_data = read_file(file_path="test_en.tsv")

In [17]:
# Number of sentences in the training, validation and test splits, one per line.
print(len(train_data), len(validation_data), len(test_data), sep="\n")

131280
16410
16454


# Statistics for all data

In [18]:
# Label vocabulary: every distinct label (third field of a token) found in
# the training split, in sorted order.
label_list = sorted({token_data[2] for sentence in train_data for token_data in sentence})

# Occurrence count of each label per split, plus the total over all splits.
Counting_Train = dict.fromkeys(label_list, 0)
Counting_Validations = dict.fromkeys(label_list, 0)
Counting_Test = dict.fromkeys(label_list, 0)
Counting_Sum = dict.fromkeys(label_list, 0)

for split_data, split_counts in (
    (train_data, Counting_Train),
    (validation_data, Counting_Validations),
    (test_data, Counting_Test),
):
    for sentence in split_data:
        for token_data in sentence:
            label = token_data[2]
            if label not in Counting_Sum:
                # The label never occurs in the training split, so it is not
                # in label_list. Register it with a zero count in every
                # table instead of failing with KeyError.
                for counts in (Counting_Train, Counting_Validations, Counting_Test, Counting_Sum):
                    counts[label] = 0
            split_counts[label] += 1
            Counting_Sum[label] += 1

# Share of each label's occurrences that falls into each split.
# Every key of Counting_Sum was counted at least once, so the divisor is never 0.
print('Ratio of each Label in \n\t\t training\t\t:\t\tValidation\t\t:\t\tTesting')
for i in sorted(Counting_Sum):
    print('{message: <10}'.format(message=i), ' \t',
          '{message: <16}'.format(message=Counting_Train[i]/Counting_Sum[i]), '\t\t',
          '{message: <16}'.format(message=Counting_Validations[i]/Counting_Sum[i]), '\t\t\t',
          '{message: <16}'.format(message=Counting_Test[i]/Counting_Sum[i]))

# Raw per-split counts.
print(Counting_Train)
print(Counting_Validations)
print(Counting_Test)

Ratio of each Label in 
		 training		:		Validation		:		Testing
B-ANIM      	 0.823058032829262 		 0.0732842186894145 			 0.10365774848132352
B-BIO       	 0.8641975308641975 		 0.08641975308641975 			 0.04938271604938271
B-CEL       	 0.9521276595744681 		 0.03333333333333333 			 0.014539007092198582
B-DIS       	 0.7800286841161707 		 0.15193617784152025 			 0.06803513804230907
B-EVE       	 0.7950251889168766 		 0.09414357682619648 			 0.11083123425692695
B-FOOD      	 0.7550387596899225 		 0.19334245326037391 			 0.0516187870497036
B-INST      	 0.8917647058823529 		 0.08             			 0.02823529411764706
B-LOC       	 0.7469537427265435 		 0.0999503431416239 			 0.1530959141318326
B-MEDIA     	 0.8153660498793243 		 0.1232233842853312 			 0.0614105658353446
B-MYTH      	 0.8660578386605784 		 0.0852359208523592 			 0.0487062404870624
B-ORG       	 0.8205242378365541 		 0.0812479591533826 			 0.09822780301006323
B-PER       	 0.8314127694399346 		 0.09909053709790255 			 0.0694966

# Statistics for a small random sample


In [19]:
# Down-sample every split because of time and memory limits:
# 2.5% of the training sentences, 15% of the test and validation sentences.
import random


def _seeded_sample(sentences, fraction, seed):
    """Seed the global `random` generator with `seed`, then draw
    int(fraction * len(sentences)) sentences without replacement."""
    random.seed(seed)
    return random.sample(sentences, int(fraction*len(sentences)))


# Seeds were picked so that each sample keeps the same number of labels as
# the full split: 31 for training, 30 for test, 31 for validation.
train_data = _seeded_sample(train_data, 0.025, 1234)
test_data = _seeded_sample(test_data, 0.15, 30)
validation_data = _seeded_sample(validation_data, 0.15, 2)

# Sizes of the sampled training, validation and test splits, one per line.
print(len(train_data), len(validation_data), len(test_data), sep="\n")
# The resulting ratio is roughly 4:3:3, which is not a good split ratio, but
# given the machine and time limits it is good enough to learn and practice.

3282
2461
2468


In [20]:
# Entity types that are kept; every other entity type is collapsed to 'O'.
List_New = ['PER', 'ORG', 'LOC', 'DIS', 'ANIM']
def convert_to_dataset2(data):
    """Collapse labels of entity types outside List_New to 'O', in place.

    The label is the third field of each token. It is split on '-'; when the
    split yields at least two parts and the second part is not in List_New,
    the token's label is overwritten with 'O'. Labels without a '-' are left
    untouched.

    Args:
        data: List of sentences, each a list of tokens, each token a mutable
            sequence whose third element is the label.

    Returns:
        The same `data` object that was passed in, after modification.
    """
    for sentence in data:
        for token_data in sentence:
            label_parts = token_data[2].split('-')
            if len(label_parts) < 2:
                continue
            if label_parts[1] in List_New:
                continue
            token_data[2] = 'O'
    return data

# Label vocabulary of the sampled data: every distinct label (third field of
# a token) found in the training split, in sorted order.
label_list = sorted({token_data[2] for sentence in train_data for token_data in sentence})

# Occurrence count of each label per split, plus the total over all splits.
Counting_Train = dict.fromkeys(label_list, 0)
Counting_Validations = dict.fromkeys(label_list, 0)
Counting_Test = dict.fromkeys(label_list, 0)
Counting_Sum = dict.fromkeys(label_list, 0)

for split_data, split_counts in (
    (train_data, Counting_Train),
    (validation_data, Counting_Validations),
    (test_data, Counting_Test),
):
    for sentence in split_data:
        for token_data in sentence:
            label = token_data[2]
            if label not in Counting_Sum:
                # A random subsample of the training split can miss a rare
                # label that still shows up in validation/test. Register it
                # with a zero count in every table instead of failing with
                # KeyError.
                for counts in (Counting_Train, Counting_Validations, Counting_Test, Counting_Sum):
                    counts[label] = 0
            split_counts[label] += 1
            Counting_Sum[label] += 1

# Share of each label's occurrences that falls into each split.
# Every key of Counting_Sum was counted at least once, so the divisor is never 0.
print('Ratio of each Label in \n\t\t training\t\t:\t\tValidation\t\t:\t\tTesting')
for i in sorted(Counting_Sum):
    print('{message: <10}'.format(message=i), ' \t',
          '{message: <16}'.format(message=Counting_Train[i]/Counting_Sum[i]), '\t\t',
          '{message: <16}'.format(message=Counting_Validations[i]/Counting_Sum[i]), '\t\t\t',
          '{message: <16}'.format(message=Counting_Test[i]/Counting_Sum[i]))

# Raw per-split counts.
print(Counting_Train)
print(Counting_Validations)
print(Counting_Test)

Ratio of each Label in 
		 training		:		Validation		:		Testing
B-ANIM      	 0.4748677248677249 		 0.19973544973544974 			 0.3253968253968254
B-BIO       	 0.5              		 0.4              			 0.1             
B-CEL       	 0.6716417910447762 		 0.2835820895522388 			 0.04477611940298507
B-DIS       	 0.37748344370860926 		 0.4139072847682119 			 0.20860927152317882
B-EVE       	 0.38125          		 0.30625          			 0.3125          
B-FOOD      	 0.3438985736925515 		 0.5213946117274167 			 0.1347068145800317
B-INST      	 0.5789473684210527 		 0.2631578947368421 			 0.15789473684210525
B-LOC       	 0.32884834663626 		 0.2725199543899658 			 0.3986316989737742
B-MEDIA     	 0.42178770949720673 		 0.39664804469273746 			 0.18156424581005587
B-MYTH      	 0.41935483870967744 		 0.2903225806451613 			 0.2903225806451613
B-ORG       	 0.4222078760490639 		 0.26081342801807617 			 0.31697869593285993
B-PER       	 0.4355951696377228 		 0.32259919493962047 			 0.2418056354226567
B-P