# Import Library

In [1]:
import os
import json
import torch
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from datasets import load_metric, load_dataset, Dataset
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DRIVER_LETTER = os.path.splitdrive(os.getcwd())[0]

## Configurations

In [3]:
FURTHER_PRETRAINED_FOLDER = "mbert_base_with_mbert_base_tokenizer"
logging_dir = "./logs_final"
output_dir = "./pretrained_bert_final"

In [4]:
# Define hyperparameter grid
with open("best-parameter.json","r") as f:
    hyperparameters = json.load(f)
print(hyperparameters)

{'learning_rate': 5e-05, 'num_train_epochs': 30, 'per_device_train_batch_size': 4, 'weight_decay': 0.01, 'assignments': {}, 'metric': 'eval/loss'}


In [5]:
FP_DIR = "../../pretrained-lm/MENmBERT-FP"

In [7]:
FP_DIR

'C:/Users/mcha0133/Desktop/Everything About BERT Training/Further Pretraining BERT/mbert_base_with_mbert_base_tokenizer/epoch-optimized'

In [8]:
tokenizer = BertTokenizerFast.from_pretrained(f"{FP_DIR}/final_vocab")

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [9]:
torch.cuda.empty_cache()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.get_device_name(0)
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

NVIDIA GeForce RTX 2080
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [10]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


# Import Dataset

In [12]:
train = pd.read_json("./bio-tag-dataset/train_bio.json")
train = train[["text","entities"]]
train["token"] = train["text"].apply(lambda x: word_tokenize(x))

test = pd.read_json("./bio-tag-dataset/test_bio.json")
test = test[["text","entities"]]
test["token"] = test["text"].apply(lambda x: word_tokenize(x))

valid = pd.read_json("./bio-tag-dataset/validation_bio.json")
valid = valid[["text","entities"]]
valid["token"] = valid["text"].apply(lambda x: word_tokenize(x))

print(train.shape)
print(test.shape)
print(valid.shape)

(150, 3)
(20, 3)
(30, 3)


# Format BIO Dataset - Processing

In [13]:
# Split labels based on whitespace and turn them into a list
labels = [i for i in train['entities'].values.tolist()]

# Check how many labels are there in the dataset
unique_labels = set()

for lb in labels:
    [unique_labels.add(i) for i in lb if i not in unique_labels]

unique_labels = list(unique_labels)
print(unique_labels)

# Map each label into its id representation and vice versa
label2id = {k: v for v, k in enumerate(sorted(unique_labels))}
id2label  = {v: k for v, k in enumerate(sorted(unique_labels))}
print(label2id)

['B-WORK_OF_ART', 'I-PRODUCT', 'I-FACILITY', 'B-PRODUCT', 'B-ORGANIZATION', 'O', 'B-EVENT', 'I-TITLE', 'B-TITLE', 'B-LAW', 'I-EVENT', 'B-ROLE', 'I-PERSON', 'I-NORP', 'I-ROLE', 'B-FACILITY', 'I-LOCATION', 'I-LAW', 'I-ORGANIZATION', 'B-PERSON', 'I-WORK_OF_ART', 'B-LOCATION', 'B-NORP']
{'B-EVENT': 0, 'B-FACILITY': 1, 'B-LAW': 2, 'B-LOCATION': 3, 'B-NORP': 4, 'B-ORGANIZATION': 5, 'B-PERSON': 6, 'B-PRODUCT': 7, 'B-ROLE': 8, 'B-TITLE': 9, 'B-WORK_OF_ART': 10, 'I-EVENT': 11, 'I-FACILITY': 12, 'I-LAW': 13, 'I-LOCATION': 14, 'I-NORP': 15, 'I-ORGANIZATION': 16, 'I-PERSON': 17, 'I-PRODUCT': 18, 'I-ROLE': 19, 'I-TITLE': 20, 'I-WORK_OF_ART': 21, 'O': 22}


In [14]:
train['entities'] = train['entities'].apply(lambda entities: [label2id[ent] for ent in entities])
test['entities'] = test['entities'].apply(lambda entities: [label2id[ent] for ent in entities])
valid['entities'] = valid['entities'].apply(lambda entities: [label2id[ent] for ent in entities])

In [15]:
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)
valid_dataset = Dataset.from_pandas(valid)

# Tokenize Dataset

In [16]:
label_all_tokens = True

In [17]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["token"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["entities"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [18]:
training_set = train_dataset.map(tokenize_and_align_labels, batched=True)
testing_set = test_dataset.map(tokenize_and_align_labels, batched=True)
valid_set = valid_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 150/150 [00:00<00:00, 760.47 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 20/20 [00:00<00:00, 1137.38 examples/s]
Map: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 30/30 [00:00<00:00, 803.00 examples/s]


# Set an Initialize BERT Model

In [19]:
# model = BertForTokenClassification.from_pretrained('bert-base-cased', 
#                                                    num_labels=len(id2label),
#                                                    id2label=id2label,
#                                                    label2id=label2id)
def model_init():
    return(BertForTokenClassification.from_pretrained(f"{FP_DIR}/final_bert", num_labels=len(id2label), id2label=id2label, label2id=label2id))

In [20]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=hyperparameters["num_train_epochs"],
    per_device_train_batch_size=hyperparameters["per_device_train_batch_size"],
    per_device_eval_batch_size=hyperparameters["per_device_train_batch_size"],
    save_total_limit=2,
    logging_dir=logging_dir,
    learning_rate=hyperparameters["learning_rate"],
    weight_decay=hyperparameters['weight_decay'],
    logging_first_step=True,
    evaluation_strategy = "epoch",
    logging_strategy="epoch",
    save_strategy="epoch", 
    resume_from_checkpoint=True,
    fp16=True,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="adafactor",
    metric_for_best_model="eval_f1",
    report_to="wandb",
    #optim="adamw_bnb_8bit",
    run_name=FURTHER_PRETRAINED_FOLDER,
    load_best_model_at_end=True,
    disable_tqdm=False  # Disable tqdm progress bar if desired
)

In [21]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [22]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [23]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    label2id_str = list(label2id.keys())
    
    # Remove ignored index (special tokens)
    true_predictions = [
        [label2id_str[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label2id_str[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [24]:
trainer = Trainer(
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=training_set,
    eval_dataset=valid_set,
    model_init=model_init,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at C:/Users/mcha0133/Desktop/Everything About BERT Training/Further Pretraining BERT/mbert_base_with_mbert_base_tokenizer/epoch-optimized/final_bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [25]:
%%time

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at C:/Users/mcha0133/Desktop/Everything About BERT Training/Further Pretraining BERT/mbert_base_with_mbert_base_tokenizer/epoch-optimized/final_bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmohanrj-nlp[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
0,1.0487,0.386221,0.420221,0.495345,0.454701,0.913658
2,0.3009,0.179233,0.740038,0.726257,0.733083,0.95774
4,0.1845,0.134892,0.798007,0.820298,0.808999,0.967199
6,0.1121,0.122492,0.837009,0.812849,0.824752,0.971104
8,0.0779,0.122717,0.798034,0.831471,0.814409,0.967025
10,0.0566,0.131641,0.791917,0.857542,0.823424,0.966158
12,0.0428,0.145943,0.776839,0.85568,0.814355,0.964075
14,0.0324,0.156886,0.798796,0.864991,0.830577,0.964249
16,0.0263,0.1569,0.789474,0.865922,0.825933,0.964422
18,0.0198,0.162455,0.790558,0.857542,0.822689,0.964856


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


CPU times: total: 3min 34s
Wall time: 3min 41s


TrainOutput(global_step=270, training_loss=0.1121553600386337, metrics={'train_runtime': 218.9165, 'train_samples_per_second': 20.556, 'train_steps_per_second': 1.233, 'total_flos': 1095667077525732.0, 'train_loss': 0.1121553600386337, 'epoch': 28.42})

# Evaluation

In [26]:
trainer.evaluate()

{'eval_loss': 0.1568860113620758,
 'eval_precision': 0.7987962166809974,
 'eval_recall': 0.8649906890130353,
 'eval_f1': 0.8305766651765759,
 'eval_accuracy': 0.9642485248177716,
 'eval_runtime': 0.436,
 'eval_samples_per_second': 68.807,
 'eval_steps_per_second': 18.349,
 'epoch': 28.42}

In [27]:
predictions, labels, _ = trainer.predict(valid_set)
predictions = np.argmax(predictions, axis=2)

label2id_str = list(label2id.keys())
# Remove ignored index (special tokens)
true_predictions = [
    [label2id_str[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label2id_str[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'EVENT': {'precision': 0.7213114754098361,
  'recall': 0.8148148148148148,
  'f1': 0.7652173913043478,
  'number': 54},
 'FACILITY': {'precision': 0.5555555555555556,
  'recall': 0.16129032258064516,
  'f1': 0.25,
  'number': 31},
 'LOCATION': {'precision': 0.9067796610169492,
  'recall': 0.8991596638655462,
  'f1': 0.9029535864978903,
  'number': 238},
 'NORP': {'precision': 0.4444444444444444,
  'recall': 0.8,
  'f1': 0.5714285714285714,
  'number': 5},
 'ORGANIZATION': {'precision': 0.8785714285714286,
  'recall': 0.9196261682242991,
  'f1': 0.8986301369863015,
  'number': 535},
 'PERSON': {'precision': 0.8292682926829268,
  'recall': 0.8888888888888888,
  'f1': 0.8580441640378549,
  'number': 153},
 'PRODUCT': {'precision': 0.04285714285714286,
  'recall': 0.375,
  'f1': 0.07692307692307691,
  'number': 8},
 'ROLE': {'precision': 0.6041666666666666,
  'recall': 0.6041666666666666,
  'f1': 0.6041666666666666,
  'number': 48},
 'TITLE': {'precision': 0.3333333333333333,
  'recall': 

In [28]:
os.makedirs(f"./final_model", exist_ok=True)
final_model = f"./final_model"
os.makedirs(final_model, exist_ok=True)

trainer.save_model(final_model)

os.makedirs(f"./final_tokenizer", exist_ok=True)
final_tokenizer = f"./final_tokenizer"
os.makedirs(final_tokenizer, exist_ok=True)
tokenizer.save_pretrained(final_tokenizer)

('./final_tokenizer\\tokenizer_config.json',
 './final_tokenizer\\special_tokens_map.json',
 './final_tokenizer\\vocab.txt',
 './final_tokenizer\\added_tokens.json',
 './final_tokenizer\\tokenizer.json')