In [1]:
#Uncomment this cell if you have not already installed these libraries.
#!pip install -q seqeval
#!pip install -q transformers
#!pip install -q datasets
#!pip install -U accelerate
#!pip install -U transformers
#pip install torch torchvision torchaudio
#!pip install torchinfo
#!pip install transformers[torch]  # for running on a GPU

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [2]:
# Report whether PyTorch can see a CUDA device in this environment.
print("CUDA available:", torch.cuda.is_available())


CUDA available: False


In [3]:
def read_file(file_path):
    """Read a blank-line-delimited token file into nested lists.

    Every non-blank line is split on whitespace into a list of fields (one
    token row). Consecutive rows form a sentence; one or more blank
    (empty or whitespace-only) lines end the current sentence.

    Args:
        file_path: Path of a UTF-8 encoded text file.

    Returns:
        A list of sentences, each a non-empty list of token rows, each a
        non-empty list of string fields. An empty file yields ``[]``.
    """
    data = []
    token_data = []
    with open(file_path, "r", encoding="utf8") as f:
        for line in f:
            fields = line.split()
            if fields:
                token_data.append(fields)
            elif token_data:
                # Blank line: close the sentence collected so far. Repeated
                # blank lines are ignored so no empty sentence/row is emitted.
                data.append(token_data)
                token_data = []
    # The last sentence may not be followed by a blank line.
    if token_data:
        data.append(token_data)
    return data

In [4]:
# Load the three English splits. Each result is a list of sentences, and each
# sentence is a list of whitespace-split token rows (see read_file).
train_data = read_file("train_en.tsv")
validation_data = read_file("dev_en.tsv")
test_data = read_file("test_en.tsv")
# Note: the test split has only 30 distinct labels; compared with the training
# and validation splits it is missing 'I-BIO'.

In [5]:
# Number of sentences loaded for each split: train, validation, test.
for split in (train_data, validation_data, test_data):
    print(len(split))

131280
16410
16454


In [6]:
# Keep only five entity types; every other entity tag is collapsed to 'O'.
List_New = ['PER', 'ORG', 'LOC', 'DIS', 'ANIM']


def Data_Processing(data, keep_types=None):
    """Collapse tags of unwanted entity types to 'O', modifying ``data`` in place.

    The tag is read from index 2 of every token row and split on '-'. When the
    split yields at least two parts and the second part is not an allowed
    entity type, the tag is overwritten with 'O'. Tags without a '-' are left
    untouched.

    Args:
        data: List of sentences; each sentence is a list of token rows whose
            index 2 holds the tag string.
        keep_types: Optional iterable of entity types to keep. Defaults to
            the module-level ``List_New``.

    Returns:
        The same ``data`` object that was passed in (not a copy).

    Raises:
        IndexError: If a token row has fewer than three fields.
    """
    allowed = frozenset(List_New if keep_types is None else keep_types)
    for sentence_idx, sentence in enumerate(data):
        for token_idx, row in enumerate(sentence):
            if len(row) < 3:
                # Same exception type as plain row[2] would raise, but with
                # enough context to locate the malformed line.
                raise IndexError(
                    "sentence {}, token {}: expected at least 3 fields, got {!r}".format(
                        sentence_idx, token_idx, row
                    )
                )
            tag_parts = row[2].split('-')
            if len(tag_parts) >= 2 and tag_parts[1] not in allowed:
                row[2] = 'O'
    return data

# Data_Processing rewrites tags in place and returns the list it was given, so
# each *_dataset name below refers to the same list object as its *_data input.
train_dataset = Data_Processing(train_data)
validation_dataset = Data_Processing(validation_data)
test_dataset = Data_Processing(test_data)

In [7]:
# Subsample every split because of time and memory limits:
# 2.5% of the training sentences, 15% of the test and 15% of the validation sentences.
# Each split is re-seeded right before sampling, so the draw is reproducible.
import random
random.seed(1234) # seed picked by the author so the training sample keeps the full label set (original note: 31 labels) - NOTE(review): confirm the count after tag filtering
train_data = random.sample(train_dataset, int(0.025*len(train_dataset)))

random.seed(30) # seed picked by the author so the test sample keeps the label set of the full test data (original note: 30 labels) - NOTE(review): confirm
test_data = random.sample(test_dataset, int(0.15*len(test_dataset)))
random.seed(2) # seed picked by the author so the validation sample keeps the full label set (original note: 31 labels) - NOTE(review): confirm
validation_data = random.sample(validation_dataset, int(0.15*len(validation_dataset)))

In [8]:
# Sentence counts after subsampling: roughly a 4:3:3 train/validation/test ratio.
# That is not a good ratio, but given the machine and time limits it is
# acceptable for learning and practice.
for split in (train_data, validation_data, test_data):
    print(len(split))

3282
2461
2468


In [9]:
def convert_to_dataset(data, label_map):
    """Build a ``Dataset`` with ``id``, ``tokens`` and ``ner_tags`` columns.

    Args:
        data: List of sentences; each sentence is a list of token rows where
            index 1 is the token text and index 2 is the tag string.
        label_map: Mapping from tag string to integer id. A tag missing from
            the mapping raises ``KeyError``.

    Returns:
        The result of ``Dataset.from_dict`` with one row per sentence; ``id``
        is the sentence's position in ``data``.
    """
    columns = {"id": [], "tokens": [], "ner_tags": []}
    for sentence_id, sentence in enumerate(data):
        words = [row[1] for row in sentence]
        tag_ids = [label_map[row[2]] for row in sentence]
        columns["id"].append(sentence_id)
        columns["tokens"].append(words)
        columns["ner_tags"].append(tag_ids)
    return Dataset.from_dict(columns)

In [10]:
# Distinct tags seen in the sampled training data, in sorted order, and the
# tag -> integer id mapping derived from that order.
label_list = sorted({row[2] for sentence in train_data for row in sentence})
label_map = dict(zip(label_list, range(len(label_list))))
label_list

['B-ANIM',
 'B-DIS',
 'B-LOC',
 'B-ORG',
 'B-PER',
 'I-ANIM',
 'I-DIS',
 'I-LOC',
 'I-ORG',
 'I-PER',
 'O']

In [11]:
# Convert every sampled split with the same tag -> id mapping.
train_dataset, validation_dataset, test_dataset = [
    convert_to_dataset(split, label_map)
    for split in (train_data, validation_data, test_data)
]

In [12]:
# Bundle the three splits under the names used by the rest of the notebook.
en_datasets = DatasetDict(
    dict(
        train=train_dataset,
        validation=validation_dataset,
        test=test_dataset,
    )
)

In [12]:
# Display the assembled splits (column names and row counts).
en_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3282
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2468
    })
})

In [13]:
# Count how often each label occurs in every split, then print the share of
# each label's total occurrences that landed in train / validation / test.
Counting_Train = dict.fromkeys(label_list, 0)
Counting_Validations = dict.fromkeys(label_list, 0)
Counting_Test = dict.fromkeys(label_list, 0)
Counting_Sum = dict.fromkeys(label_list, 0)

for split_rows, split_counts in (
    (train_data, Counting_Train),
    (validation_data, Counting_Validations),
    (test_data, Counting_Test),
):
    for sentence in split_rows:
        for token_data in sentence:
            tag = token_data[2]
            split_counts[tag] += 1
            Counting_Sum[tag] += 1

print('Ratio of each Label in \n\t\t training\t\t:\t\tValidation\t\t:\t\tTesting')
for label in label_list:
    total = Counting_Sum[label]
    print(
        '{message: <10}'.format(message=label),
        ' \t',
        '{message: <16}'.format(message=Counting_Train[label]/total),
        '\t\t',
        '{message: <16}'.format(message=Counting_Validations[label]/total),
        '\t\t\t',
        '{message: <16}'.format(message=Counting_Test[label]/total),
    )

print(Counting_Train)
print(Counting_Validations)
print(Counting_Test)


Ratio of each Label in 
		 training		:		Validation		:		Testing
B-ANIM      	 0.4748677248677249 		 0.19973544973544974 			 0.3253968253968254
B-DIS       	 0.37748344370860926 		 0.4139072847682119 			 0.20860927152317882
B-LOC       	 0.32884834663626 		 0.2725199543899658 			 0.3986316989737742
B-ORG       	 0.4222078760490639 		 0.26081342801807617 			 0.31697869593285993
B-PER       	 0.4355951696377228 		 0.32259919493962047 			 0.2418056354226567
I-ANIM      	 0.4166666666666667 		 0.19827586206896552 			 0.3850574712643678
I-DIS       	 0.4119170984455959 		 0.38860103626943004 			 0.19948186528497408
I-LOC       	 0.3083290222452147 		 0.26332126228660113 			 0.4283497154681842
I-ORG       	 0.40414258188824664 		 0.27167630057803466 			 0.3241811175337187
I-PER       	 0.42265267688996505 		 0.327683615819209 			 0.24966370729082593
O           	 0.3943983995427265 		 0.3188364377399754 			 0.2867651627172981
{'B-ANIM': 359, 'B-DIS': 228, 'B-LOC': 1442, 'B-ORG': 654, 'B-PER': 

# Tokenizer and Model

In [14]:
#model_name = "bert-base-cased"
model_name = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Store the tag names in the model config so that a saved checkpoint reports
# real tag names instead of generic LABEL_<n> placeholders. The ids follow the
# order of label_list, which is also the order used to encode the datasets.
id2label = dict(enumerate(label_list))
label2id = {label: idx for idx, label in id2label.items()}
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id,
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define Metrics and Tokenization

In [15]:
def compute_metrics(eval_prediction):
    """Turn raw predictions into entity-level seqeval metrics.

    Args:
        eval_prediction: A pair ``(predictions, labels)``. ``predictions`` is
            reduced with ``argmax`` over axis 2 (presumably shaped
            (batch, sequence, num_labels) - TODO confirm); ``labels`` holds
            integer label ids, with -100 marking positions to skip.

    Returns:
        A dict with ``precision``, ``recall``, ``f1`` and the text
        ``classification_report``, all computed on the tag strings looked up
        in ``label_list``.
    """
    scores, gold_ids = eval_prediction
    predicted_ids = np.argmax(scores, axis=2)

    # Drop every position whose gold label is the ignored index (-100) and
    # translate the remaining ids to tag strings.
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    metrics = {}
    metrics["precision"] = precision_score(true_labels, true_predictions)
    metrics["recall"] = recall_score(true_labels, true_predictions)
    metrics["f1"] = f1_score(true_labels, true_predictions)
    # seqeval vs scikit-learn report:
    # https://stackoverflow.com/questions/77143185/classification-report-in-ner-models-seqeval-vs-sickit-learn
    metrics["classification_report"] = classification_report(true_labels, true_predictions)
    return metrics


def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to sub-tokens.

    Args:
        examples: A batch with a ``"tokens"`` column (lists of words) and a
            ``"ner_tags"`` column (lists of integer label ids, one per word).

    Returns:
        The tokenizer output with an added ``"labels"`` entry. For each
        sequence, the first sub-token of a word carries that word's label id;
        special tokens and the remaining sub-tokens of a word get -100, the
        marker that compute_metrics filters out.
    """
    # No padding here: this function is applied to whole map() batches, so
    # padding=True would stretch every sentence to the longest one in that
    # batch. Sequences stay at their own length and are padded per training
    # batch by the data collator instead.
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for i, label in enumerate(examples["ner_tags"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Position that maps to no input word (e.g. a special token).
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                # First sub-token of a new word: keep the word's label.
                label_ids.append(label[word_idx])
            else:
                # Continuation sub-token of the same word.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize Datasets and Set Training Arguments

In [16]:
# Tokenize every split in batches; tokenize_and_align_labels adds the "labels" column.
tokenized_datasets = en_datasets.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/3282 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Map:   0%|          | 0/2468 [00:00<?, ? examples/s]

In [17]:

# Training configuration. Evaluation and checkpointing share the same 200-step
# interval, and the checkpoint with the best "f1" (as returned by
# compute_metrics) is reloaded when training ends.
# NOTE(review): newer transformers releases rename `evaluation_strategy` to
# `eval_strategy` - confirm against the installed version.
training_args = TrainingArguments(
    output_dir="./results",  # where checkpoints are written
    evaluation_strategy="steps",  # evaluate on a step schedule
    eval_steps=200,
    save_steps=200,  # kept equal to eval_steps
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=100,  # training loss is logged every 100 steps
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",
)

# Define Data Collator and Initialize Trainer

In [18]:
def data_collator(data):
    """Pad a list of tokenized examples into one batch of tensors.

    Args:
        data: List of examples, each providing ``"input_ids"``,
            ``"attention_mask"`` and ``"labels"`` sequences.

    Returns:
        A dict with the same three keys. Every value is a batch-first tensor
        padded to the longest sequence in ``data``: ``input_ids`` with the
        tokenizer's pad token id, ``attention_mask`` with 0 and ``labels``
        with -100.
    """
    def _padded(field, fill_value):
        # One tensor per example, right-padded to a common length.
        rows = [torch.tensor(example[field]) for example in data]
        return torch.nn.utils.rnn.pad_sequence(
            rows, batch_first=True, padding_value=fill_value
        )

    batch = {}
    batch["input_ids"] = _padded("input_ids", tokenizer.pad_token_id)
    batch["attention_mask"] = _padded("attention_mask", 0)
    batch["labels"] = _padded("labels", -100)
    return batch

In [19]:
# Because of limited time and compute, training uses only a random subset of
# the training set. NOTE(review): the original note said "2/8", while the
# sampling code earlier in the notebook draws 2.5% - confirm the intended figure.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [20]:
# Fine-tune the model with the settings in training_args.
trainer.train()


  0%|          | 0/1030 [00:00<?, ?it/s]

{'loss': 0.2271, 'learning_rate': 4.514563106796117e-05, 'epoch': 0.97}
{'loss': 0.0345, 'learning_rate': 4.029126213592233e-05, 'epoch': 1.94}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 0.044075388461351395, 'eval_precision': 0.9079301075268817, 'eval_recall': 0.8654708520179372, 'eval_f1': 0.8861921941620203, 'eval_classification_report': '              precision    recall  f1-score   support\n\n        ANIM       0.50      0.17      0.25       151\n         DIS       0.48      0.34      0.40       250\n         LOC       0.94      0.96      0.95      1195\n         ORG       0.87      0.86      0.86       404\n         PER       0.98      0.98      0.98      1122\n\n   micro avg       0.91      0.87      0.89      3122\n   macro avg       0.75      0.66      0.69      3122\nweighted avg       0.88      0.87      0.87      3122\n', 'eval_runtime': 127.3002, 'eval_samples_per_second': 19.332, 'eval_steps_per_second': 0.605, 'epoch': 1.94}
{'loss': 0.0187, 'learning_rate': 3.54368932038835e-05, 'epoch': 2.91}
{'loss': 0.0089, 'learning_rate': 3.058252427184466e-05, 'epoch': 3.88}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 0.052464861422777176, 'eval_precision': 0.9190256747860435, 'eval_recall': 0.8942985265855221, 'eval_f1': 0.9064935064935065, 'eval_classification_report': '              precision    recall  f1-score   support\n\n        ANIM       0.53      0.38      0.44       151\n         DIS       0.56      0.43      0.49       250\n         LOC       0.96      0.98      0.97      1195\n         ORG       0.91      0.88      0.90       404\n         PER       0.98      0.98      0.98      1122\n\n   micro avg       0.92      0.89      0.91      3122\n   macro avg       0.79      0.73      0.75      3122\nweighted avg       0.91      0.89      0.90      3122\n', 'eval_runtime': 125.5202, 'eval_samples_per_second': 19.606, 'eval_steps_per_second': 0.613, 'epoch': 3.88}
{'loss': 0.0047, 'learning_rate': 2.5728155339805826e-05, 'epoch': 4.85}
{'loss': 0.0026, 'learning_rate': 2.0873786407766992e-05, 'epoch': 5.83}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 0.06062619015574455, 'eval_precision': 0.8928346951602766, 'eval_recall': 0.909993593850096, 'eval_f1': 0.9013324873096447, 'eval_classification_report': '              precision    recall  f1-score   support\n\n        ANIM       0.44      0.48      0.46       151\n         DIS       0.55      0.55      0.55       250\n         LOC       0.96      0.98      0.97      1195\n         ORG       0.88      0.89      0.88       404\n         PER       0.97      0.98      0.98      1122\n\n   micro avg       0.89      0.91      0.90      3122\n   macro avg       0.76      0.78      0.77      3122\nweighted avg       0.89      0.91      0.90      3122\n', 'eval_runtime': 133.8121, 'eval_samples_per_second': 18.391, 'eval_steps_per_second': 0.575, 'epoch': 5.83}
{'loss': 0.0017, 'learning_rate': 1.6019417475728158e-05, 'epoch': 6.8}
{'loss': 0.0011, 'learning_rate': 1.116504854368932e-05, 'epoch': 7.77}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 0.06459921598434448, 'eval_precision': 0.904, 'eval_recall': 0.9048686739269699, 'eval_f1': 0.9044341283816232, 'eval_classification_report': '              precision    recall  f1-score   support\n\n        ANIM       0.53      0.43      0.48       151\n         DIS       0.55      0.52      0.53       250\n         LOC       0.96      0.98      0.97      1195\n         ORG       0.86      0.89      0.88       404\n         PER       0.97      0.98      0.98      1122\n\n   micro avg       0.90      0.90      0.90      3122\n   macro avg       0.78      0.76      0.77      3122\nweighted avg       0.90      0.90      0.90      3122\n', 'eval_runtime': 126.9143, 'eval_samples_per_second': 19.391, 'eval_steps_per_second': 0.607, 'epoch': 7.77}
{'loss': 0.0007, 'learning_rate': 6.310679611650486e-06, 'epoch': 8.74}
{'loss': 0.0006, 'learning_rate': 1.4563106796116506e-06, 'epoch': 9.71}


  0%|          | 0/77 [00:00<?, ?it/s]

{'eval_loss': 0.06619025766849518, 'eval_precision': 0.9017175572519084, 'eval_recall': 0.9080717488789237, 'eval_f1': 0.9048834982444941, 'eval_classification_report': '              precision    recall  f1-score   support\n\n        ANIM       0.50      0.41      0.45       151\n         DIS       0.56      0.56      0.56       250\n         LOC       0.96      0.97      0.97      1195\n         ORG       0.86      0.91      0.88       404\n         PER       0.98      0.98      0.98      1122\n\n   micro avg       0.90      0.91      0.90      3122\n   macro avg       0.77      0.77      0.77      3122\nweighted avg       0.90      0.91      0.90      3122\n', 'eval_runtime': 128.4122, 'eval_samples_per_second': 19.165, 'eval_steps_per_second': 0.6, 'epoch': 9.71}
{'train_runtime': 5557.2894, 'train_samples_per_second': 5.906, 'train_steps_per_second': 0.185, 'train_loss': 0.029196617774014333, 'epoch': 10.0}


TrainOutput(global_step=1030, training_loss=0.029196617774014333, metrics={'train_runtime': 5557.2894, 'train_samples_per_second': 5.906, 'train_steps_per_second': 0.185, 'train_loss': 0.029196617774014333, 'epoch': 10.0})

In [21]:
# Metrics of the trained model on the validation split.
validation_results  = trainer.evaluate(eval_dataset=tokenized_datasets['validation'])

  0%|          | 0/77 [00:00<?, ?it/s]

In [22]:
# Metrics of the trained model on the held-out test split.
test_results = trainer.evaluate(eval_dataset=tokenized_datasets['test'])

  0%|          | 0/78 [00:00<?, ?it/s]

In [23]:
#print(test_results['eval_classification_report'])

In [24]:
# Metrics on the training split itself, for comparison with validation/test.
train_results = trainer.evaluate(eval_dataset=tokenized_datasets['train'])

  0%|          | 0/103 [00:00<?, ?it/s]

In [25]:
#!pip install torchinfo
# Print a per-layer parameter count for the model.
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForTokenClassification                        --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              22,268,928
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Dropout: 1-2                                          --
├─Linear: 1-3                                           8,459
Total params: 65,199,371
Trainable params: 65,199,371
Non-trainable params: 0

In [26]:
# Per-entity classification reports for train, validation and test, in that order.
for split_results in (train_results, validation_results, test_results):
    print(split_results['eval_classification_report'])

              precision    recall  f1-score   support

        ANIM       0.97      0.95      0.96       359
         DIS       0.92      0.88      0.90       228
         LOC       1.00      1.00      1.00      1442
         ORG       1.00      1.00      1.00       654
         PER       1.00      1.00      1.00      1515

   micro avg       0.99      0.99      0.99      4198
   macro avg       0.98      0.97      0.97      4198
weighted avg       0.99      0.99      0.99      4198

              precision    recall  f1-score   support

        ANIM       0.53      0.38      0.44       151
         DIS       0.56      0.43      0.49       250
         LOC       0.96      0.98      0.97      1195
         ORG       0.91      0.88      0.90       404
         PER       0.98      0.98      0.98      1122

   micro avg       0.92      0.89      0.91      3122
   macro avg       0.79      0.73      0.75      3122
weighted avg       0.91      0.89      0.90      3122

              precisio

In [27]:
# Use a forward slash as the separator: it is accepted on Windows and on POSIX
# systems, whereas a backslash would become part of one long directory name
# outside Windows.
trainer.save_model("Models/Best model distillbert after 10 epochs - SystemB")