# Import Library

In [1]:
import os
import json
import torch
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from datasets import load_metric, load_dataset, Dataset
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
DRIVER_LETTER = os.path.splitdrive(os.getcwd())[0]

## Configurations

In [3]:
FURTHER_PRETRAINED_FOLDER = "bert_base_scratch_with_men_tokenizer"
logging_dir = "./logs_final"
output_dir = "./pretrained_bert_final"

In [4]:
# Define hyperparameter grid
with open("best-parameter.json","r") as f:
    hyperparameters = json.load(f)
print(hyperparameters)

{'learning_rate': 5e-05, 'num_train_epochs': 30, 'per_device_train_batch_size': 4, 'weight_decay': 0.01, 'assignments': {}, 'metric': 'eval/loss'}


In [5]:
FP_DIR = "../../pretrained-lm/MENBERT-SC"

In [6]:
FP_DIR

'C:/Users/mcha0133/Desktop/Everything About BERT Training/Further Pretraining BERT/bert_base_scratch_with_men_tokenizer/epoch-optimized'

In [7]:
tokenizer = BertTokenizerFast.from_pretrained(f"{FP_DIR}/final_vocab", model_max_length=512)

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [8]:
torch.cuda.empty_cache()

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
torch.cuda.get_device_name(0)
print('Using device:', device)
print()

#Additional Info when using cuda
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

NVIDIA GeForce RTX 2080
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [9]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(device)

cuda


# Import Dataset

In [11]:
train = pd.read_json("./bio-tag-dataset/train_bio.json")
train = train[["text","entities"]]
train["token"] = train["text"].apply(lambda x: word_tokenize(x))

test = pd.read_json("./bio-tag-dataset/test_bio.json")
test = test[["text","entities"]]
test["token"] = test["text"].apply(lambda x: word_tokenize(x))

valid = pd.read_json("./bio-tag-dataset/validation_bio.json")
valid = valid[["text","entities"]]
valid["token"] = valid["text"].apply(lambda x: word_tokenize(x))

print(train.shape)
print(test.shape)
print(valid.shape)

(150, 3)
(20, 3)
(30, 3)


# Format BIO Dataset - Processing

In [12]:
# Split labels based on whitespace and turn them into a list
labels = [i for i in train['entities'].values.tolist()]

# Check how many labels are there in the dataset
unique_labels = set()

for lb in labels:
    [unique_labels.add(i) for i in lb if i not in unique_labels]

unique_labels = list(unique_labels)
print(unique_labels)

# Map each label into its id representation and vice versa
label2id = {k: v for v, k in enumerate(sorted(unique_labels))}
id2label  = {v: k for v, k in enumerate(sorted(unique_labels))}
print(label2id)

['O', 'B-PRODUCT', 'B-LAW', 'I-TITLE', 'I-LAW', 'I-PRODUCT', 'B-ORGANIZATION', 'B-LOCATION', 'I-FACILITY', 'I-LOCATION', 'B-NORP', 'I-ROLE', 'I-EVENT', 'B-PERSON', 'B-FACILITY', 'B-WORK_OF_ART', 'B-TITLE', 'I-WORK_OF_ART', 'I-NORP', 'B-EVENT', 'B-ROLE', 'I-PERSON', 'I-ORGANIZATION']
{'B-EVENT': 0, 'B-FACILITY': 1, 'B-LAW': 2, 'B-LOCATION': 3, 'B-NORP': 4, 'B-ORGANIZATION': 5, 'B-PERSON': 6, 'B-PRODUCT': 7, 'B-ROLE': 8, 'B-TITLE': 9, 'B-WORK_OF_ART': 10, 'I-EVENT': 11, 'I-FACILITY': 12, 'I-LAW': 13, 'I-LOCATION': 14, 'I-NORP': 15, 'I-ORGANIZATION': 16, 'I-PERSON': 17, 'I-PRODUCT': 18, 'I-ROLE': 19, 'I-TITLE': 20, 'I-WORK_OF_ART': 21, 'O': 22}


In [13]:
train['entities'] = train['entities'].apply(lambda entities: [label2id[ent] for ent in entities])
test['entities'] = test['entities'].apply(lambda entities: [label2id[ent] for ent in entities])
valid['entities'] = valid['entities'].apply(lambda entities: [label2id[ent] for ent in entities])

In [14]:
train_dataset = Dataset.from_pandas(train)
test_dataset = Dataset.from_pandas(test)
valid_dataset = Dataset.from_pandas(valid)

# Tokenize Dataset

In [15]:
label_all_tokens = True

In [16]:
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(examples["token"], truncation=True, is_split_into_words=True)

    labels = []
    for i, label in enumerate(examples["entities"]):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            # Special tokens have a word id that is None. We set the label to -100 so they are automatically
            # ignored in the loss function.
            if word_idx is None:
                label_ids.append(-100)
            # We set the label for the first token of each word.
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            # For the other tokens in a word, we set the label to either the current label or -100, depending on
            # the label_all_tokens flag.
            else:
                label_ids.append(label[word_idx] if label_all_tokens else -100)
            previous_word_idx = word_idx

        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [17]:
training_set = train_dataset.map(tokenize_and_align_labels, batched=True)
testing_set = test_dataset.map(tokenize_and_align_labels, batched=True)
valid_set = valid_dataset.map(tokenize_and_align_labels, batched=True)

Map: 100%|███████████████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 680.28 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 691.06 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 683.34 examples/s]


# Set an Initialize BERT Model

In [18]:
# model = BertForTokenClassification.from_pretrained('bert-base-cased', 
#                                                    num_labels=len(id2label),
#                                                    id2label=id2label,
#                                                    label2id=label2id)
def model_init():
    return(BertForTokenClassification.from_pretrained(f"{FP_DIR}/final_bert", num_labels=len(id2label), id2label=id2label, label2id=label2id))

In [19]:
# Define training arguments
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=hyperparameters["num_train_epochs"],
    per_device_train_batch_size=hyperparameters["per_device_train_batch_size"],
    per_device_eval_batch_size=hyperparameters["per_device_train_batch_size"],
    save_total_limit=2,
    logging_dir=logging_dir,
    learning_rate=hyperparameters["learning_rate"],
    weight_decay=hyperparameters['weight_decay'],
    logging_first_step=True,
    evaluation_strategy = "epoch",
    logging_strategy="epoch",
    save_strategy="epoch", 
    resume_from_checkpoint=True,
    fp16=True,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="adafactor",
    metric_for_best_model="eval_f1",
    report_to="wandb",
    #optim="adamw_bnb_8bit",
    run_name=FURTHER_PRETRAINED_FOLDER,
    load_best_model_at_end=True,
    disable_tqdm=False  # Disable tqdm progress bar if desired
)

In [20]:
data_collator = DataCollatorForTokenClassification(tokenizer)

In [21]:
metric = load_metric("seqeval")

  metric = load_metric("seqeval")


In [22]:
def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    label2id_str = list(label2id.keys())
    
    # Remove ignored index (special tokens)
    true_predictions = [
        [label2id_str[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]
    true_labels = [
        [label2id_str[l] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [23]:
trainer = Trainer(
    args=training_args,
    tokenizer=tokenizer,
    train_dataset=training_set,
    eval_dataset=valid_set,
    model_init=model_init,
    data_collator=data_collator,
    compute_metrics=compute_metrics,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at C:/Users/mcha0133/Desktop/Everything About BERT Training/Further Pretraining BERT/bert_base_scratch_with_men_tokenizer/epoch-optimized/final_bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
%%time

trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at C:/Users/mcha0133/Desktop/Everything About BERT Training/Further Pretraining BERT/bert_base_scratch_with_men_tokenizer/epoch-optimized/final_bert and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmohanrj-nlp[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
0,1.3267,0.933364,0.128723,0.097738,0.111111,0.816529
2,0.875,0.656498,0.230366,0.177706,0.200638,0.835851
4,0.7472,0.59053,0.322259,0.235057,0.271836,0.843931
6,0.6646,0.548193,0.28441,0.327141,0.304282,0.852187
8,0.6009,0.526188,0.257899,0.283522,0.270104,0.851045
10,0.5638,0.526912,0.290429,0.28433,0.287347,0.854734
12,0.5166,0.557888,0.313901,0.282714,0.297493,0.854646
14,0.4827,0.544166,0.266086,0.320679,0.290842,0.852538
16,0.4522,0.538743,0.274204,0.340872,0.303925,0.853065
18,0.4254,0.535896,0.293832,0.342488,0.3163,0.857105


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


CPU times: total: 1min 55s
Wall time: 1min 59s


TrainOutput(global_step=270, training_loss=0.5371108854258502, metrics={'train_runtime': 116.9107, 'train_samples_per_second': 38.491, 'train_steps_per_second': 2.309, 'total_flos': 548561825650020.0, 'train_loss': 0.5371108854258502, 'epoch': 28.42})

# Evaluation

In [28]:
trainer.evaluate()

{'eval_loss': 0.5358961224555969,
 'eval_precision': 0.2938322938322938,
 'eval_recall': 0.34248788368336025,
 'eval_f1': 0.3162998881014547,
 'eval_accuracy': 0.8571052169330757,
 'eval_runtime': 0.5691,
 'eval_samples_per_second': 52.713,
 'eval_steps_per_second': 14.057,
 'epoch': 28.42}

In [29]:
predictions, labels, _ = trainer.predict(valid_set)
predictions = np.argmax(predictions, axis=2)

label2id_str = list(label2id.keys())
# Remove ignored index (special tokens)
true_predictions = [
    [label2id_str[p] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]
true_labels = [
    [label2id_str[l] for (p, l) in zip(prediction, label) if l != -100]
    for prediction, label in zip(predictions, labels)
]

results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'EVENT': {'precision': 0.1346153846153846,
  'recall': 0.125,
  'f1': 0.12962962962962965,
  'number': 56},
 'FACILITY': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 36},
 'LOCATION': {'precision': 0.38823529411764707,
  'recall': 0.6387096774193548,
  'f1': 0.4829268292682927,
  'number': 310},
 'NORP': {'precision': 0.21428571428571427,
  'recall': 0.25,
  'f1': 0.23076923076923075,
  'number': 12},
 'ORGANIZATION': {'precision': 0.3225806451612903,
  'recall': 0.2763385146804836,
  'f1': 0.29767441860465116,
  'number': 579},
 'PERSON': {'precision': 0.13291139240506328,
  'recall': 0.22702702702702704,
  'f1': 0.16766467065868262,
  'number': 185},
 'PRODUCT': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 10},
 'ROLE': {'precision': 0.4482758620689655,
  'recall': 0.2826086956521739,
  'f1': 0.3466666666666666,
  'number': 46},
 'TITLE': {'precision': 0.14285714285714285,
  'recall': 0.25,
  'f1': 0.18181818181818182,
  'number': 4},
 'overall_precision': 0.2938

In [30]:
os.makedirs(f"./final_model", exist_ok=True)
final_model = f"./final_model"
os.makedirs(final_model, exist_ok=True)

trainer.save_model(final_model)

os.makedirs(f"./final_tokenizer", exist_ok=True)
final_tokenizer = f"./final_tokenizer"
os.makedirs(final_tokenizer, exist_ok=True)
tokenizer.save_pretrained(final_tokenizer)

('./final_tokenizer\\tokenizer_config.json',
 './final_tokenizer\\special_tokens_map.json',
 './final_tokenizer\\vocab.txt',
 './final_tokenizer\\added_tokens.json',
 './final_tokenizer\\tokenizer.json')