# Import Library

In [1]:
import os
import json
import torch
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from datasets import load_metric, load_dataset, Dataset
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DRIVER_LETTER = os.path.splitdrive(os.getcwd())[0]

## Configurations

In [3]:
FURTHER_PRETRAINED_FOLDER = "bert-base-cased"
logging_dir = "./logs_final"
output_dir = "./pretrained_bert_final"

In [4]:
# Define hyperparameter grid
with open("best-parameter.json","r") as f:
    hyperparameters = json.load(f)
print(hyperparameters)

{'learning_rate': 5e-05, 'num_train_epochs': 20, 'per_device_train_batch_size': 4, 'weight_decay': 0.0001, 'assignments': {}, 'metric': 'eval/loss'}


In [6]:
tokenizer = BertTokenizerFast.from_pretrained("bert-base-cased")

In [7]:
torch.cuda.empty_cache()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.get_device_name(0)
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

NVIDIA GeForce RTX 2080
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [8]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


# Import Dataset

In [10]:
train = pd.read_json("./bio-tag-dataset/train_bio.json")
train = train[["text","entities"]]
train["token"] = train["text"].apply(lambda x: word_tokenize(x))

test = pd.read_json("./bio-tag-dataset/test_bio.json")
test = test[["text","entities"]]
test["token"] = test["text"].apply(lambda x: word_tokenize(x))

valid = pd.read_json("./bio-tag-dataset/validation_bio.json")
valid = valid[["text","entities"]]
valid["token"] = valid["text"].apply(lambda x: word_tokenize(x))

print(train.shape)
print(test.shape)
print(valid.shape)

(150, 3)
(20, 3)
(30, 3)


# Format BIO Dataset - Processing

In [11]:
# Split labels based on whitespace and turn them into a list
labels = [i for i in train['entities'].values.tolist()]

# Check how many labels are there in the dataset
unique_labels = set()

for lb in labels:
    [unique_labels.add(i) for i in lb if i not in unique_labels]

unique_labels = list(unique_labels)
print(unique_labels)

# Map each label into its id representation and vice versa
label2id = {k: v for v, k in enumerate(sorted(unique_labels))}
id2label  = {v: k for v, k in enumerate(sorted(unique_labels))}
print(label2id)

['I-PERSON', 'B-WORK_OF_ART', 'O', 'B-PERSON', 'B-ORGANIZATION', 'B-ROLE', 'I-NORP', 'B-EVENT', 'I-LAW', 'B-FACILITY', 'B-TITLE', 'I-EVENT', 'I-TITLE', 'B-LOCATION', 'B-PRODUCT', 'B-LAW', 'B-NORP', 'I-PRODUCT', 'I-FACILITY', 'I-ORGANIZATION', 'I-WORK_OF_ART', 'I-ROLE', 'I-LOCATION']
{'B-EVENT': 0, 'B-FACILITY': 1, 'B-LAW': 2, 'B-LOCATION': 3, 'B-NORP': 4, 'B-ORGANIZATION': 5, 'B-PERSON': 6, 'B-PRODUCT': 7, 'B-ROLE': 8, 'B-TITLE': 9, 'B-WORK_OF_ART': 10, 'I-EVENT': 11, 'I-FACILITY': 12, 'I-LAW': 13, 'I-LOCATION': 14, 'I-NORP': 15, 'I-ORGANIZATION': 16, 'I-PERSON': 17, 'I-PRODUCT': 18, 'I-ROLE': 19, 'I-TITLE': 20, 'I-WORK_OF_ART': 21, 'O': 22}


In [12]:
train['entities'] = train['entities'].apply(lambda entities: [label2id[ent] for ent in entities])
test['entities'] = test['entities'].apply(lambda entities: [label2id[ent] for ent in entities])
valid['entities'] = valid['entities'].apply(lambda entities: [label2id[ent] for ent in entities])

In [13]:
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)
valid_dataset = Dataset.from_pandas(valid)

# Tokenize Dataset

In [14]:
label_all_tokens = True

In [15]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["token"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["entities"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [16]:
training_set = train_dataset.map(tokenize_and_align_labels, batched=True)
testing_set = test_dataset.map(tokenize_and_align_labels, batched=True)
valid_set = valid_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 150/150 [00:00<00:00, 748.15 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 20/20 [00:00<00:00, 802.11 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 30/30 [00:00<00:00, 833.23 examples/s]


# Set an Initialize BERT Model

In [17]:
# model = BertForTokenClassification.from_pretrained('bert-base-cased', 
#                                                    num_labels=len(id2label),
#                                                    id2label=id2label,
#                                                    label2id=label2id)
def model_init():
    return(BertForTokenClassification.from_pretrained("bert-base-cased", num_labels=len(id2label), id2label=id2label, label2id=label2id))

In [18]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=hyperparameters["num_train_epochs"],
    per_device_train_batch_size=hyperparameters["per_device_train_batch_size"],
    per_device_eval_batch_size=hyperparameters["per_device_train_batch_size"],
    save_total_limit=2,
    logging_dir=logging_dir,
    learning_rate=hyperparameters["learning_rate"],
    weight_decay=hyperparameters['weight_decay'],
    logging_first_step=True,
    evaluation_strategy = "epoch",
    logging_strategy="epoch",
    save_strategy="epoch", 
    resume_from_checkpoint=True,
    fp16=True,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="adafactor",
    metric_for_best_model="eval_f1",
    report_to="wandb",
    #optim="adamw_bnb_8bit",
    run_name=FURTHER_PRETRAINED_FOLDER,
    load_best_model_at_end=True,
    disable_tqdm=False  # Disable tqdm progress bar if desired
)

In [19]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [20]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [21]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    label2id_str = list(label2id.keys())
    
    # Remove ignored index (special tokens)
    true_predictions = [
        [label2id_str[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label2id_str[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [22]:
trainer = Trainer(
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=training_set,
    eval_dataset=valid_set,
    model_init=model_init,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [23]:
%%time

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-cased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmohanrj-nlp[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
0,1.5339,0.703811,0.335907,0.287603,0.309884,0.853895
2,0.4657,0.291635,0.511867,0.534711,0.52304,0.91787
4,0.2927,0.204336,0.62473,0.718182,0.668205,0.941904
6,0.2046,0.195381,0.68425,0.761157,0.720657,0.946053
8,0.1447,0.175231,0.707951,0.765289,0.735504,0.952019
10,0.1119,0.18303,0.710835,0.802479,0.753882,0.951846
12,0.0901,0.189923,0.727829,0.786777,0.756156,0.951846
14,0.0756,0.186401,0.718126,0.772727,0.744427,0.952192
16,0.0682,0.188509,0.738426,0.790909,0.763767,0.953921
18,0.0588,0.193275,0.734411,0.78843,0.760462,0.953575


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


CPU times: total: 2min 14s
Wall time: 2min 17s


TrainOutput(global_step=180, training_loss=0.25365321073267194, metrics={'train_runtime': 135.0179, 'train_samples_per_second': 22.219, 'train_steps_per_second': 1.333, 'total_flos': 731942355536436.0, 'train_loss': 0.25365321073267194, 'epoch': 18.95})

# Evaluation

In [27]:
trainer.evaluate()

{'eval_loss': 0.18850915133953094,
 'eval_precision': 0.7384259259259259,
 'eval_recall': 0.7909090909090909,
 'eval_f1': 0.7637669592976856,
 'eval_accuracy': 0.9539206362929022,
 'eval_runtime': 0.7347,
 'eval_samples_per_second': 40.832,
 'eval_steps_per_second': 10.888,
 'epoch': 18.95}

In [28]:
predictions, labels, _ = trainer.predict(valid_set)
predictions = np.argmax(predictions, axis=2)

label2id_str = list(label2id.keys())
# Remove ignored index (special tokens)
true_predictions = [
    [label2id_str[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label2id_str[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'EVENT': {'precision': 0.609375,
  'recall': 0.75,
  'f1': 0.6724137931034483,
  'number': 52},
 'FACILITY': {'precision': 0.22857142857142856,
  'recall': 0.25,
  'f1': 0.23880597014925375,
  'number': 32},
 'LOCATION': {'precision': 0.8771929824561403,
  'recall': 0.847457627118644,
  'f1': 0.8620689655172413,
  'number': 295},
 'NORP': {'precision': 1.0,
  'recall': 0.8,
  'f1': 0.888888888888889,
  'number': 5},
 'ORGANIZATION': {'precision': 0.8048780487804879,
  'recall': 0.8208955223880597,
  'f1': 0.8128078817733989,
  'number': 603},
 'PERSON': {'precision': 0.6907216494845361,
  'recall': 0.8072289156626506,
  'f1': 0.7444444444444445,
  'number': 166},
 'PRODUCT': {'precision': 0.13725490196078433,
  'recall': 0.875,
  'f1': 0.23728813559322037,
  'number': 8},
 'ROLE': {'precision': 0.4146341463414634,
  'recall': 0.37777777777777777,
  'f1': 0.39534883720930236,
  'number': 45},
 'TITLE': {'precision': 0.42857142857142855,
  'recall': 0.75,
  'f1': 0.5454545454545454,
  '

In [29]:
os.makedirs(f"./final_model", exist_ok=True)
final_model = f"./final_model"
os.makedirs(final_model, exist_ok=True)

trainer.save_model(final_model)

os.makedirs(f"./final_tokenizer", exist_ok=True)
final_tokenizer = f"./final_tokenizer"
os.makedirs(final_tokenizer, exist_ok=True)
tokenizer.save_pretrained(final_tokenizer)

('./final_tokenizer\\tokenizer_config.json',
 './final_tokenizer\\special_tokens_map.json',
 './final_tokenizer\\vocab.txt',
 './final_tokenizer\\added_tokens.json',
 './final_tokenizer\\tokenizer.json')