# Import Libraries

In [1]:
import os
import json
import torch
import numpy as np
import pandas as pd
from nltk.tokenize import word_tokenize
from datasets import load_metric, load_dataset, Dataset
from transformers import BertTokenizerFast, BertForTokenClassification, TrainingArguments, Trainer, DataCollatorForTokenClassification

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Drive component of the current working directory, as split off by os.path.splitdrive.
DRIVER_LETTER, _cwd_tail = os.path.splitdrive(os.getcwd())

## Configurations

In [3]:
# Name of the pretrained checkpoint this notebook fine-tunes.
FURTHER_PRETRAINED = "bert-base-multilingual-cased"

# Directories handed to TrainingArguments: checkpoints go to output_dir, logs to logging_dir.
output_dir = "./pretrained_bert_final"
logging_dir = "./logs_final"

In [4]:
# Load the saved hyperparameter set (a single configuration, not a grid) from disk.
with open("best-parameter.json", "r") as param_file:
    hyperparameters = json.loads(param_file.read())

print(hyperparameters)

{'learning_rate': 5e-05, 'num_train_epochs': 30, 'per_device_train_batch_size': 4, 'weight_decay': 0.01, 'assignments': {}, 'metric': 'eval/loss'}


In [7]:
# Load the fast tokenizer for the checkpoint named in the configuration cell,
# so the checkpoint is chosen in exactly one place.
tokenizer = BertTokenizerFast.from_pretrained(FURTHER_PRETRAINED)

In [8]:
# Release cached GPU memory that earlier runs in this kernel may still hold.
torch.cuda.empty_cache()

# Prefer the GPU when one is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print('Using device:', device)
print()

# Additional info when using cuda. All CUDA queries stay inside this guard so
# the cell also runs on a CPU-only machine.
if device.type == 'cuda':
    print(torch.cuda.get_device_name(0))
    print('Memory Usage:')
    print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
    print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

Using device: cuda

NVIDIA GeForce RTX 2080
Memory Usage:
Allocated: 0.0 GB
Cached:    0.0 GB


In [9]:
# Keep the selected device as a plain string ('cuda' or 'cpu') and show it.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cuda


# Import Dataset

In [11]:
def load_split(split_name, data_dir="./bio-tag-dataset"):
    """Load one BIO-tagged split and add a word-level ``token`` column.

    Args:
        split_name: File prefix of the split, e.g. ``"train"`` for ``train_bio.json``.
        data_dir: Directory that holds the ``<split_name>_bio.json`` files.

    Returns:
        A DataFrame with the columns ``text``, ``entities`` and ``token``,
        where ``token`` is ``word_tokenize`` applied to ``text``.
    """
    frame = pd.read_json(f"{data_dir}/{split_name}_bio.json")
    # .assign returns a new frame, so nothing is written into a column subset
    # of the frame that was read from disk.
    return frame[["text", "entities"]].assign(
        token=lambda df: df["text"].apply(word_tokenize)
    )


train = load_split("train")
test = load_split("test")
valid = load_split("validation")

print(train.shape)
print(test.shape)
print(valid.shape)

(150, 3)
(20, 3)
(30, 3)


# Format BIO Dataset - Processing

In [12]:
# One list of BIO tags per training sentence
labels = train['entities'].tolist()

# Every distinct tag that occurs in the training split
unique_labels = list(set().union(*labels))
print(unique_labels)

# Map each label to its id and vice versa. Ids follow the alphabetical order
# of the tags, so the mapping does not depend on set iteration order.
id2label = dict(enumerate(sorted(unique_labels)))
label2id = {label: idx for idx, label in id2label.items()}
print(label2id)

['I-PRODUCT', 'I-EVENT', 'O', 'I-PERSON', 'B-TITLE', 'B-EVENT', 'B-PRODUCT', 'B-ORGANIZATION', 'B-FACILITY', 'I-ROLE', 'B-LAW', 'B-LOCATION', 'I-ORGANIZATION', 'B-ROLE', 'I-FACILITY', 'I-WORK_OF_ART', 'I-LOCATION', 'B-NORP', 'I-TITLE', 'I-NORP', 'I-LAW', 'B-WORK_OF_ART', 'B-PERSON']
{'B-EVENT': 0, 'B-FACILITY': 1, 'B-LAW': 2, 'B-LOCATION': 3, 'B-NORP': 4, 'B-ORGANIZATION': 5, 'B-PERSON': 6, 'B-PRODUCT': 7, 'B-ROLE': 8, 'B-TITLE': 9, 'B-WORK_OF_ART': 10, 'I-EVENT': 11, 'I-FACILITY': 12, 'I-LAW': 13, 'I-LOCATION': 14, 'I-NORP': 15, 'I-ORGANIZATION': 16, 'I-PERSON': 17, 'I-PRODUCT': 18, 'I-ROLE': 19, 'I-TITLE': 20, 'I-WORK_OF_ART': 21, 'O': 22}


In [13]:
def encode_tags(tags):
    """Convert a list of BIO tag strings to their integer ids.

    Values that are already ints are returned unchanged, so re-running this
    cell on already-encoded frames does not raise ``KeyError``. A tag string
    missing from ``label2id`` still raises ``KeyError``.
    """
    return [tag if isinstance(tag, int) else label2id[tag] for tag in tags]


train['entities'] = train['entities'].apply(encode_tags)
test['entities'] = test['entities'].apply(encode_tags)
valid['entities'] = valid['entities'].apply(encode_tags)

In [14]:
# Wrap each pandas split in a `datasets.Dataset` (order: train, test, validation).
train_dataset, test_dataset, valid_dataset = (
    Dataset.from_pandas(frame) for frame in (train, test, valid)
)

# Tokenize Dataset

In [15]:
# When True, every sub-token of a word receives that word's label;
# when False, only the first sub-token is labelled and the rest get -100.
label_all_tokens = True

In [16]:
def tokenize_and_align_labels(examples):
    """Tokenize pre-split words and align the word-level labels to sub-tokens.

    Args:
        examples: Batch with a ``"token"`` entry (one list of words per example)
            and an ``"entities"`` entry (one list of label ids per example).

    Returns:
        The tokenizer output with an added ``"labels"`` entry holding one list
        of label ids per example, aligned with the produced tokens.
    """
    encoded = tokenizer(examples["token"], truncation=True, is_split_into_words=True)

    aligned_labels = []
    for row, word_tags in enumerate(examples["entities"]):
        row_labels = []
        last_word = None
        for word in encoded.word_ids(batch_index=row):
            if word is None:
                # Special tokens have no word id; -100 makes the loss ignore them.
                tag = -100
            elif word != last_word or label_all_tokens:
                # First sub-token of a word, or a later one when all sub-tokens are labelled.
                tag = word_tags[word]
            else:
                # Later sub-token of the same word while label_all_tokens is off.
                tag = -100
            row_labels.append(tag)
            last_word = word
        aligned_labels.append(row_labels)

    encoded["labels"] = aligned_labels
    return encoded

In [17]:
# Tokenize and label-align every split in batches (order: train, test, validation).
training_set, testing_set, valid_set = (
    split.map(tokenize_and_align_labels, batched=True)
    for split in (train_dataset, test_dataset, valid_dataset)
)

Map: 100%|███████████████████████████████████████████████████████████████████| 150/150 [00:00<00:00, 692.96 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████| 20/20 [00:00<00:00, 715.47 examples/s]
Map: 100%|█████████████████████████████████████████████████████████████████████| 30/30 [00:00<00:00, 751.43 examples/s]


# Set Up and Initialize BERT Model

In [18]:
def model_init():
    """Build a fresh token-classification model from the configured checkpoint.

    Passed to ``Trainer`` as ``model_init`` so the trainer constructs the model
    itself. The classification head is sized to the label set and carries the
    ``id2label`` / ``label2id`` mappings.

    Returns:
        A ``BertForTokenClassification`` instance loaded from ``FURTHER_PRETRAINED``.
    """
    return BertForTokenClassification.from_pretrained(
        FURTHER_PRETRAINED,
        num_labels=len(id2label),
        id2label=id2label,
        label2id=label2id,
    )

In [19]:
# Define training arguments. Tuned values come from `hyperparameters`;
# everything else is fixed here.
training_args = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=hyperparameters["num_train_epochs"],
    per_device_train_batch_size=hyperparameters["per_device_train_batch_size"],
    # Evaluation reuses the training batch size.
    per_device_eval_batch_size=hyperparameters["per_device_train_batch_size"],
    save_total_limit=2,
    logging_dir=logging_dir,
    learning_rate=hyperparameters["learning_rate"],
    weight_decay=hyperparameters['weight_decay'],
    logging_first_step=True,
    # Evaluate, log and checkpoint once per epoch.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    save_strategy="epoch",
    # NOTE(review): trainer.train() is called without a checkpoint argument, so this
    # flag presumably does not trigger a resume on its own - confirm against the docs.
    resume_from_checkpoint=True,
    fp16=True,
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="adafactor",
    # Reload the checkpoint with the best eval_f1 when training finishes.
    metric_for_best_model="eval_f1",
    load_best_model_at_end=True,
    report_to="wandb",
    # Name the W&B run after the checkpoint. The previously referenced
    # FURTHER_PRETRAINED_FOLDER is not defined anywhere in this notebook.
    run_name=FURTHER_PRETRAINED,
    disable_tqdm=False,  # keep the progress bar; set to True to hide it
)

In [20]:
# Collator that batches the token-classification examples using the tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer=tokenizer)

In [21]:
# seqeval scorer used by compute_metrics and the evaluation section.
# NOTE(review): the recorded output shows a warning for this call; load_metric is
# presumably deprecated in favour of the `evaluate` package - confirm before upgrading.
metric = load_metric(path="seqeval")

  metric = load_metric("seqeval")


In [22]:
def compute_metrics(p):
    """Score predictions with seqeval, skipping positions labelled -100.

    Args:
        p: Pair ``(predictions, labels)``; ``predictions`` is reduced with
            ``argmax`` over axis 2 and ``labels`` holds label ids, with -100
            marking positions to ignore.

    Returns:
        Dict with the overall ``precision``, ``recall``, ``f1`` and ``accuracy``.
    """
    scores, gold_ids = p
    pred_ids = np.argmax(scores, axis=2)

    # Index i of this list is the label name at position i of label2id.
    id_to_name = list(label2id)

    true_predictions = []
    true_labels = []
    for pred_row, gold_row in zip(pred_ids, gold_ids):
        # Keep only positions whose reference label is not the ignored index.
        kept = [(pred, gold) for pred, gold in zip(pred_row, gold_row) if gold != -100]
        true_predictions.append([id_to_name[pred] for pred, _gold in kept])
        true_labels.append([id_to_name[gold] for _pred, gold in kept])

    results = metric.compute(predictions=true_predictions, references=true_labels)

    summary = {}
    summary["precision"] = results["overall_precision"]
    summary["recall"] = results["overall_recall"]
    summary["f1"] = results["overall_f1"]
    summary["accuracy"] = results["overall_accuracy"]
    return summary

In [23]:
# Wire the model factory, data splits, collator and metric function into one Trainer.
trainer = Trainer(
    model_init=model_init,
    args=training_args,
    data_collator=data_collator,
    train_dataset=training_set,
    eval_dataset=valid_set,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [24]:
%%time

# Run fine-tuning with the settings in training_args; the returned TrainOutput
# is displayed as the cell result.
trainer.train()

Some weights of BertForTokenClassification were not initialized from the model checkpoint at bert-base-multilingual-cased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Failed to detect the name of this notebook, you can set it manually with the WANDB_NOTEBOOK_NAME environment variable to enable code saving.
[34m[1mwandb[0m: Currently logged in as: [33mmohanrj-nlp[0m. Use [1m`wandb login --relogin`[0m to force relogin


You're using a BertTokenizerFast tokenizer. Please note that with a fast tokenizer, using the `__call__` method is faster than using a method to encode the text followed by a call to the `pad` method to get a padded encoding.


Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
0,1.5358,0.898907,0.116279,0.097765,0.106222,0.817772
2,0.3663,0.227793,0.495063,0.653631,0.563403,0.937261
4,0.2156,0.149353,0.716965,0.783054,0.748554,0.9626
6,0.1292,0.124871,0.831303,0.825885,0.828585,0.970236
8,0.0896,0.127527,0.819266,0.831471,0.825323,0.970496
10,0.0678,0.14116,0.753413,0.82216,0.786287,0.961211
12,0.0481,0.137836,0.782272,0.846369,0.813059,0.965724
14,0.0399,0.151513,0.79965,0.851024,0.824538,0.96555
16,0.0312,0.154306,0.802295,0.846369,0.823743,0.967633
18,0.0247,0.158052,0.7979,0.849162,0.822733,0.966505


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_pr

CPU times: total: 3min 35s
Wall time: 3min 43s


TrainOutput(global_step=270, training_loss=0.1416078432014695, metrics={'train_runtime': 219.9801, 'train_samples_per_second': 20.456, 'train_steps_per_second': 1.227, 'total_flos': 1095667077525732.0, 'train_loss': 0.1416078432014695, 'epoch': 28.42})

# Evaluation

In [25]:
# Evaluate the trained model. No dataset is passed here, so this presumably scores
# the eval_dataset given to the Trainer (valid_set) - confirm.
trainer.evaluate()

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'eval_loss': 0.12487058341503143,
 'eval_precision': 0.831302717900656,
 'eval_recall': 0.8258845437616388,
 'eval_f1': 0.8285847734703409,
 'eval_accuracy': 0.9702360291565428,
 'eval_runtime': 0.4493,
 'eval_samples_per_second': 66.774,
 'eval_steps_per_second': 17.806,
 'epoch': 28.42}

In [28]:
# Predict on the validation split and keep the highest-scoring label id per token.
predictions, labels, _ = trainer.predict(valid_set)
predictions = np.argmax(predictions, axis=2)

# Index i of this list is the label name at position i of label2id.
label2id_str = list(label2id)

# Drop positions whose reference label is -100 (ignored index), then map ids to names.
true_predictions = [
    [label2id_str[pred_id] for pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]
true_labels = [
    [label2id_str[gold_id] for _pred_id, gold_id in zip(pred_row, gold_row) if gold_id != -100]
    for pred_row, gold_row in zip(predictions, labels)
]

# Per-entity-type and overall seqeval scores; shown as the cell result.
results = metric.compute(predictions=true_predictions, references=true_labels)
results

{'EVENT': {'precision': 0.6595744680851063,
  'recall': 0.5740740740740741,
  'f1': 0.613861386138614,
  'number': 54},
 'FACILITY': {'precision': 0.2857142857142857,
  'recall': 0.06451612903225806,
  'f1': 0.10526315789473685,
  'number': 31},
 'LAW': {'precision': 0.0, 'recall': 0.0, 'f1': 0.0, 'number': 0},
 'LOCATION': {'precision': 0.8653061224489796,
  'recall': 0.8907563025210085,
  'f1': 0.8778467908902693,
  'number': 238},
 'NORP': {'precision': 0.6, 'recall': 0.6, 'f1': 0.6, 'number': 5},
 'ORGANIZATION': {'precision': 0.8834244080145719,
  'recall': 0.9065420560747663,
  'f1': 0.8948339483394833,
  'number': 535},
 'PERSON': {'precision': 0.8209876543209876,
  'recall': 0.869281045751634,
  'f1': 0.8444444444444443,
  'number': 153},
 'PRODUCT': {'precision': 0.3,
  'recall': 0.375,
  'f1': 0.33333333333333326,
  'number': 8},
 'ROLE': {'precision': 0.43902439024390244,
  'recall': 0.375,
  'f1': 0.40449438202247195,
  'number': 48},
 'TITLE': {'precision': 0.0, 'recall': 

In [29]:
# Persist the fine-tuned model and its tokenizer to separate directories.
final_model = "./final_model"
final_tokenizer = "./final_tokenizer"

os.makedirs(final_model, exist_ok=True)
trainer.save_model(final_model)

os.makedirs(final_tokenizer, exist_ok=True)
# Last expression: the paths of the written tokenizer files are shown as the cell result.
tokenizer.save_pretrained(final_tokenizer)

('./final_tokenizer\\tokenizer_config.json',
 './final_tokenizer\\special_tokens_map.json',
 './final_tokenizer\\vocab.txt',
 './final_tokenizer\\added_tokens.json',
 './final_tokenizer\\tokenizer.json')