In [1]:
#Uncomment this cell if you have not already installed these libraries.
#!pip install -q seqeval
#!pip install -q transformers
#!pip install -q datasets
#!pip install -U accelerate
#!pip install -U transformers
#!pip install torch torchvision torchaudio
#!pip install torchinfo
#!pip install transformers[torch]  # for running on GPU

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, Trainer, TrainingArguments
from datasets import load_dataset, load_metric, Dataset, DatasetDict
import numpy as np
from seqeval.metrics import f1_score, precision_score, recall_score, classification_report

In [3]:
def read_file(file_path):
    """Read a token-per-line file whose sentences are separated by blank lines.

    Each non-blank line is split on whitespace into a list of fields, and
    consecutive non-blank lines are grouped into one sentence.

    Args:
        file_path: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of sentences; each sentence is a list of token rows and each
        token row is the list of whitespace-separated fields of one line.
        An empty (or whitespace-only) file yields an empty list.
    """
    with open(file_path, "r", encoding="utf8") as f:
        content = f.read()

    data = []
    token_data = []
    for line in content.split("\n"):
        fields = line.split()
        if fields:
            token_data.append(fields)
        elif token_data:
            # A blank (or whitespace-only) line closes the current sentence.
            # Runs of blank lines are collapsed, so no empty sentence or empty
            # token row is ever emitted.
            data.append(token_data)
            token_data = []
    if token_data:
        # The last sentence may not be followed by a blank line.
        data.append(token_data)
    return data

In [4]:
# Load the three English splits; each one becomes the list of sentences returned by read_file.
train_data, validation_data, test_data = (
    read_file(split_path)
    for split_path in ("train_en.tsv", "dev_en.tsv", "test_en.tsv")
)
# Author's note: the test split has only 30 labels; it lacks 'I-BIO', which the training and validation splits contain.

In [5]:
# Number of sentences in the train, validation and test splits, one count per line.
print(*(len(split) for split in (train_data, validation_data, test_data)), sep="\n")

131280
16410
16454


In [6]:
# Subsample every split because of time and memory limits: 2.5% of the training
# sentences and 15% each of the test and validation sentences.
# NOTE(review): each statement below overwrites its own input, so re-running this cell
# without reloading the files samples again from the already-reduced lists.
import random
random.seed(1234) # seed picked by the author so that the training sample keeps all 31 labels of the original data
train_data = random.sample(train_data, int(0.025*len(train_data)))

random.seed(30) # seed picked by the author so that the test sample keeps the 30 labels of the original test data
test_data = random.sample(test_data, int(0.15*len(test_data)))
random.seed(2) # seed picked by the author so that the validation sample keeps all 31 labels of the original data
validation_data = random.sample(validation_data, int(0.15*len(validation_data)))

In [7]:
# Sentence counts after subsampling (train, validation, test), one count per line.
print(*(len(split) for split in (train_data, validation_data, test_data)), sep="\n")
# The resulting ratio is roughly 4:3:3. That is not a good split ratio, but given the
# machine and time limits it is good enough for learning and practice.

3282
2461
2468


In [8]:
def convert_to_dataset(data, label_map):
    """Turn parsed sentences into a `Dataset` with id, tokens and ner_tags columns.

    Args:
        data: Iterable of sentences; every sentence is a sequence of token rows
            where index 1 holds the token text and index 2 holds the label string.
        label_map: Mapping from label string to integer id.

    Returns:
        A `Dataset` built from a dict with three parallel lists: "id" (the
        sentence's position in `data`), "tokens" and "ner_tags" (integer ids).
    """
    sentence_ids, token_rows, tag_rows = [], [], []
    for sentence_id, sentence in enumerate(data):
        sentence_ids.append(sentence_id)
        token_rows.append([fields[1] for fields in sentence])
        tag_rows.append([label_map[fields[2]] for fields in sentence])
    return Dataset.from_dict(
        {"id": sentence_ids, "tokens": token_rows, "ner_tags": tag_rows}
    )

In [9]:
# Label vocabulary: the distinct label strings (third field of each token row) seen in
# the training sample, sorted alphabetically; label_map assigns each its index in that order.
label_list = sorted({fields[2] for sentence in train_data for fields in sentence})
label_map = dict(zip(label_list, range(len(label_list))))
label_list

['B-ANIM',
 'B-BIO',
 'B-CEL',
 'B-DIS',
 'B-EVE',
 'B-FOOD',
 'B-INST',
 'B-LOC',
 'B-MEDIA',
 'B-MYTH',
 'B-ORG',
 'B-PER',
 'B-PLANT',
 'B-TIME',
 'B-VEHI',
 'I-ANIM',
 'I-BIO',
 'I-CEL',
 'I-DIS',
 'I-EVE',
 'I-FOOD',
 'I-INST',
 'I-LOC',
 'I-MEDIA',
 'I-MYTH',
 'I-ORG',
 'I-PER',
 'I-PLANT',
 'I-TIME',
 'I-VEHI',
 'O']

In [10]:
# Convert each sampled split into a Dataset, sharing the single label_map.
train_dataset, validation_dataset, test_dataset = (
    convert_to_dataset(split, label_map)
    for split in (train_data, validation_data, test_data)
)

In [11]:
# Count how often every label occurs in each split, then report, per label,
# the share of its total occurrences that falls in each split.
Counting_Train = dict.fromkeys(label_list, 0)
Counting_Validations = dict.fromkeys(label_list, 0)
Counting_Test = dict.fromkeys(label_list, 0)
Counting_Sum = dict.fromkeys(label_list, 0)

for split_sentences, split_counts in (
    (train_data, Counting_Train),
    (validation_data, Counting_Validations),
    (test_data, Counting_Test),
):
    for sentence in split_sentences:
        for token_data in sentence:
            tag = token_data[2]
            split_counts[tag] += 1
            Counting_Sum[tag] += 1

print('Ratio of each Label in \n\t\t training\t\t:\t\tValidation\t\t:\t\tTesting')
for i in label_list:
    total = Counting_Sum[i]
    print(
        '{message: <10}'.format(message=i), ' \t',
        '{message: <16}'.format(message=Counting_Train[i]/total), '\t\t',
        '{message: <16}'.format(message=Counting_Validations[i]/total),
        '\t\t\t', '{message: <16}'.format(message=Counting_Test[i]/total),
    )

# Raw per-split counts, in label_list order.
print(Counting_Train)
print(Counting_Validations)
print(Counting_Test)

Ratio of each Label in 
		 training		:		Validation		:		Testing
B-ANIM      	 0.4748677248677249 		 0.19973544973544974 			 0.3253968253968254
B-BIO       	 0.5              		 0.4              			 0.1             
B-CEL       	 0.6716417910447762 		 0.2835820895522388 			 0.04477611940298507
B-DIS       	 0.37748344370860926 		 0.4139072847682119 			 0.20860927152317882
B-EVE       	 0.38125          		 0.30625          			 0.3125          
B-FOOD      	 0.3438985736925515 		 0.5213946117274167 			 0.1347068145800317
B-INST      	 0.5789473684210527 		 0.2631578947368421 			 0.15789473684210525
B-LOC       	 0.32884834663626 		 0.2725199543899658 			 0.3986316989737742
B-MEDIA     	 0.42178770949720673 		 0.39664804469273746 			 0.18156424581005587
B-MYTH      	 0.41935483870967744 		 0.2903225806451613 			 0.2903225806451613
B-ORG       	 0.4222078760490639 		 0.26081342801807617 			 0.31697869593285993
B-PER       	 0.4355951696377228 		 0.32259919493962047 			 0.2418056354226567
B-P

In [12]:
# Bundle the three splits under the keys "train", "validation" and "test"
# so that later steps can process them with a single call.
en_datasets = DatasetDict({
    "train": train_dataset,
    "validation": validation_dataset,
    "test": test_dataset,
})

In [13]:
# Display the splits with their column names and row counts.
en_datasets

DatasetDict({
    train: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 3282
    })
    validation: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2461
    })
    test: Dataset({
        features: ['id', 'tokens', 'ner_tags'],
        num_rows: 2468
    })
})

# Tokenizer and Model

In [14]:
# Checkpoint to fine-tune; "bert-base-cased" is the alternative the author left commented out.
#model_name = "bert-base-cased"
model_name = "distilbert-base-cased"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Record the label names in the model config as well as the label count, so the
# saved model maps class indices back to the tags in label_list instead of
# generic "LABEL_n" names. The id order is the order of label_list.
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: index for index, label in enumerate(label_list)},
)

Some weights of DistilBertForTokenClassification were not initialized from the model checkpoint at distilbert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define Metrics and Tokenization

In [15]:
def compute_metrics(eval_prediction):
    """Compute seqeval precision, recall, F1 and a text report for one evaluation.

    Args:
        eval_prediction: A pair `(predictions, labels)`. `predictions` is
            arg-maxed over axis 2 to obtain one class index per position;
            `labels` holds the gold class indices, with -100 marking positions
            that must be ignored.

    Returns:
        A dict with the keys "precision", "recall", "f1" and
        "classification_report" (the report is a formatted string).
    """
    logits, gold_ids = eval_prediction
    predicted_ids = np.argmax(logits, axis=2)

    # Translate indices to label strings, dropping every position whose gold
    # label is -100 (special tokens and other ignored positions).
    true_predictions = []
    true_labels = []
    for predicted_row, gold_row in zip(predicted_ids, gold_ids):
        kept_predictions = []
        kept_labels = []
        for predicted_id, gold_id in zip(predicted_row, gold_row):
            if gold_id == -100:
                continue
            kept_predictions.append(label_list[predicted_id])
            kept_labels.append(label_list[gold_id])
        true_predictions.append(kept_predictions)
        true_labels.append(kept_labels)

    return {
        "precision": precision_score(true_labels, true_predictions),
        "recall": recall_score(true_labels, true_predictions),
        "f1": f1_score(true_labels, true_predictions),
        # On seqeval vs scikit-learn reports, see
        # https://stackoverflow.com/questions/77143185/classification-report-in-ner-models-seqeval-vs-sickit-learn
        "classification_report": classification_report(true_labels, true_predictions),
    }


def tokenize_and_align_labels(examples):
    """Tokenize pre-split sentences and align word-level tags to the sub-tokens.

    Only the first sub-token of every word receives the word's tag. Special
    tokens (word id `None`) and the remaining sub-tokens of a word receive
    -100 so that they are ignored downstream.

    No padding is applied here. Padding inside a batched `map` would stretch
    every example to the longest sequence of its map batch; this notebook's
    `data_collator` already pads input ids, attention mask and labels per
    training batch.

    Args:
        examples: A batch with a "tokens" column (lists of words) and a
            "ner_tags" column (lists of integer tags, one per word).

    Returns:
        The tokenizer output for the batch with an added "labels" entry that
        holds one list of label ids per example, aligned with its sub-tokens.
    """
    tokenized_inputs = tokenizer(
        examples["tokens"], truncation=True, is_split_into_words=True
    )
    labels = []
    for batch_index, word_labels in enumerate(examples["ner_tags"]):
        previous_word_idx = None
        label_ids = []
        for word_idx in tokenized_inputs.word_ids(batch_index=batch_index):
            if word_idx is None or word_idx == previous_word_idx:
                # Special token, or a continuation sub-token of the same word.
                label_ids.append(-100)
            else:
                label_ids.append(word_labels[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Tokenize Datasets and Set Training Arguments

In [16]:
# Run tokenize_and_align_labels over every split in batches; the result carries
# the tokenizer outputs and the aligned "labels" column used for training.
tokenized_datasets = en_datasets.map(tokenize_and_align_labels, batched=True)


Map:   0%|          | 0/3282 [00:00<?, ? examples/s]

Map:   0%|          | 0/2461 [00:00<?, ? examples/s]

Map:   0%|          | 0/2468 [00:00<?, ? examples/s]

In [17]:

# Training configuration. Evaluation and checkpointing use the same 200-step
# interval, and the "f1" value returned by compute_metrics selects the best model.
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="steps",
    eval_steps=200,
    save_steps=200,  # same interval as eval_steps
    num_train_epochs=10,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    logging_steps=100,
    learning_rate=5e-5,
    load_best_model_at_end=True,
    metric_for_best_model="f1",  # key produced by compute_metrics
)

# Define Data Collator and Initialize Trainer

In [18]:
def data_collator(data):
    """Collate tokenized examples into right-padded batch tensors.

    Args:
        data: Sequence of examples, each providing "input_ids",
            "attention_mask" and "labels" as sequences of integers.

    Returns:
        A dict with the keys "input_ids", "attention_mask" and "labels". Each
        value is a batch-first tensor padded to the longest example in `data`,
        using the tokenizer's pad token id, 0 and -100 respectively.
    """
    def _pad(field, fill_value):
        # Tensorise one field of every example, then pad to a common length.
        sequences = [torch.tensor(example[field]) for example in data]
        return torch.nn.utils.rnn.pad_sequence(
            sequences, batch_first=True, padding_value=fill_value
        )

    return {
        "input_ids": _pad("input_ids", tokenizer.pad_token_id),
        "attention_mask": _pad("attention_mask", 0),
        "labels": _pad("labels", -100),
    }

In [19]:
# Build the Trainer: it trains on the tokenized "train" split and evaluates on "validation".
# NOTE(review): the previous comment here spoke of randomly selecting 2/8 of the training set.
# Nothing in this cell does that; the only subsampling in the notebook is the earlier random.sample step.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["validation"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [20]:
# Run fine-tuning; the returned TrainOutput (shown below) reports the final step count and training loss.
trainer.train()


  0%|          | 0/1030 [00:00<?, ?it/s]

{'loss': 0.3692, 'learning_rate': 4.514563106796117e-05, 'epoch': 0.97}
{'loss': 0.0742, 'learning_rate': 4.029126213592233e-05, 'epoch': 1.94}


  0%|          | 0/77 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.08763223886489868, 'eval_precision': 0.8865039694207586, 'eval_recall': 0.771889400921659, 'eval_f1': 0.8252360749965787, 'eval_classification_report': '              precision    recall  f1-score   support\n\n        ANIM       0.40      0.32      0.36       151\n         BIO       0.00      0.00      0.00         4\n         CEL       0.86      0.63      0.73        19\n         DIS       0.61      0.28      0.38       250\n         EVE       0.71      0.94      0.81        49\n        FOOD       0.61      0.17      0.27       329\n        INST       0.00      0.00      0.00         5\n         LOC       0.96      0.97      0.97      1195\n       MEDIA       0.85      0.93      0.89       142\n        MYTH       0.00      0.00      0.00         9\n         ORG       0.84      0.91      0.88       404\n         PER       0.98      0.97      0.98      1122\n       PLANT       0.38      0.12      0.18       169\n        TIME       0.58      0.17      0.26        42\n    

  0%|          | 0/77 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.09912322461605072, 'eval_precision': 0.8597014925373134, 'eval_recall': 0.8110599078341014, 'eval_f1': 0.8346726386510341, 'eval_classification_report': '              precision    recall  f1-score   support\n\n        ANIM       0.51      0.40      0.45       151\n         BIO       0.00      0.00      0.00         4\n         CEL       1.00      0.68      0.81        19\n         DIS       0.55      0.49      0.52       250\n         EVE       0.73      0.92      0.81        49\n        FOOD       0.40      0.29      0.34       329\n        INST       0.50      0.20      0.29         5\n         LOC       0.96      0.98      0.97      1195\n       MEDIA       0.91      0.91      0.91       142\n        MYTH       1.00      0.11      0.20         9\n         ORG       0.89      0.91      0.90       404\n         PER       0.98      0.98      0.98      1122\n       PLANT       0.48      0.25      0.33       169\n        TIME       0.57      0.31      0.40        42\n   

  0%|          | 0/77 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.1102716401219368, 'eval_precision': 0.8519404572036151, 'eval_recall': 0.8205325140809012, 'eval_f1': 0.8359415753781951, 'eval_classification_report': '              precision    recall  f1-score   support\n\n        ANIM       0.50      0.38      0.43       151\n         BIO       0.00      0.00      0.00         4\n         CEL       0.75      0.63      0.69        19\n         DIS       0.61      0.42      0.50       250\n         EVE       0.74      0.92      0.82        49\n        FOOD       0.44      0.38      0.41       329\n        INST       0.20      0.40      0.27         5\n         LOC       0.97      0.98      0.97      1195\n       MEDIA       0.90      0.92      0.91       142\n        MYTH       1.00      0.33      0.50         9\n         ORG       0.89      0.90      0.89       404\n         PER       0.98      0.98      0.98      1122\n       PLANT       0.41      0.40      0.41       169\n        TIME       0.56      0.33      0.42        42\n    

  0%|          | 0/77 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.11784430593252182, 'eval_precision': 0.8343949044585988, 'eval_recall': 0.8384536610343062, 'eval_f1': 0.8364193589579875, 'eval_classification_report': '              precision    recall  f1-score   support\n\n        ANIM       0.49      0.46      0.47       151\n         BIO       0.00      0.00      0.00         4\n         CEL       0.61      0.58      0.59        19\n         DIS       0.57      0.60      0.58       250\n         EVE       0.83      0.92      0.87        49\n        FOOD       0.43      0.39      0.41       329\n        INST       0.33      0.40      0.36         5\n         LOC       0.97      0.97      0.97      1195\n       MEDIA       0.88      0.92      0.90       142\n        MYTH       1.00      0.33      0.50         9\n         ORG       0.89      0.91      0.90       404\n         PER       0.97      0.98      0.98      1122\n       PLANT       0.39      0.44      0.41       169\n        TIME       0.50      0.33      0.40        42\n   

  0%|          | 0/77 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.1271553784608841, 'eval_precision': 0.8390834191555098, 'eval_recall': 0.8343573988735279, 'eval_f1': 0.8367137355584081, 'eval_classification_report': '              precision    recall  f1-score   support\n\n        ANIM       0.48      0.46      0.47       151\n         BIO       0.00      0.00      0.00         4\n         CEL       0.65      0.58      0.61        19\n         DIS       0.56      0.58      0.57       250\n         EVE       0.84      0.94      0.88        49\n        FOOD       0.42      0.38      0.40       329\n        INST       0.40      0.40      0.40         5\n         LOC       0.97      0.97      0.97      1195\n       MEDIA       0.90      0.92      0.91       142\n        MYTH       1.00      0.33      0.50         9\n         ORG       0.89      0.92      0.90       404\n         PER       0.97      0.98      0.98      1122\n       PLANT       0.40      0.39      0.39       169\n        TIME       0.56      0.33      0.42        42\n    

TrainOutput(global_step=1030, training_loss=0.05362762269898526, metrics={'train_runtime': 5619.6813, 'train_samples_per_second': 5.84, 'train_steps_per_second': 0.183, 'train_loss': 0.05362762269898526, 'epoch': 10.0})

In [21]:
# Evaluate on the validation split; the metrics dict includes 'eval_classification_report'.
validation_results  = trainer.evaluate(eval_dataset=tokenized_datasets['validation'])

  0%|          | 0/77 [00:00<?, ?it/s]

  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
# Evaluate on the held-out test split.
test_results = trainer.evaluate(eval_dataset=tokenized_datasets['test'])

  0%|          | 0/78 [00:00<?, ?it/s]

In [23]:
#print(test_results['eval_classification_report'])

In [24]:
# Evaluate on the training split as well, so it can be compared with the validation and test scores.
train_results = trainer.evaluate(eval_dataset=tokenized_datasets['train'])

  0%|          | 0/103 [00:00<?, ?it/s]

In [25]:
# Print the layer-by-layer parameter counts of the model (requires the torchinfo package).
#!pip install torchinfo
from torchinfo import summary
summary(model)

Layer (type:depth-idx)                                  Param #
DistilBertForTokenClassification                        --
├─DistilBertModel: 1-1                                  --
│    └─Embeddings: 2-1                                  --
│    │    └─Embedding: 3-1                              22,268,928
│    │    └─Embedding: 3-2                              393,216
│    │    └─LayerNorm: 3-3                              1,536
│    │    └─Dropout: 3-4                                --
│    └─Transformer: 2-2                                 --
│    │    └─ModuleList: 3-5                             42,527,232
├─Dropout: 1-2                                          --
├─Linear: 1-3                                           23,839
Total params: 65,214,751
Trainable params: 65,214,751
Non-trainable params: 0

In [26]:
# Show the classification reports for the train, validation and test splits, in that order.
print(
    *(
        results['eval_classification_report']
        for results in (train_results, validation_results, test_results)
    ),
    sep="\n",
)

              precision    recall  f1-score   support

        ANIM       0.99      0.99      0.99       359
         BIO       0.67      0.40      0.50         5
         CEL       1.00      1.00      1.00        45
         DIS       1.00      1.00      1.00       228
         EVE       1.00      1.00      1.00        61
        FOOD       1.00      1.00      1.00       217
        INST       1.00      0.91      0.95        11
         LOC       1.00      1.00      1.00      1442
       MEDIA       0.99      1.00      1.00       151
        MYTH       1.00      1.00      1.00        13
         ORG       1.00      1.00      1.00       654
         PER       1.00      1.00      1.00      1515
       PLANT       0.99      1.00      0.99       217
        TIME       1.00      0.99      0.99        70
        VEHI       1.00      1.00      1.00        14

   micro avg       1.00      1.00      1.00      5002
   macro avg       0.98      0.95      0.96      5002
weighted avg       1.00   

In [27]:
# Save the model currently held by the trainer. A forward slash is used as the path
# separator because it works on Windows as well as on POSIX systems, whereas the
# backslash form would produce one oddly named directory on Linux/macOS.
trainer.save_model("Models/Best model distillbert after 10 epochs - SystemA")