<a href="https://colab.research.google.com/github/mdhvishv/NER-BERT/blob/main/BioMedBERclean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install transformers datasets tokenizers seqeval -q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m527.3/527.3 kB[0m [31m14.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m39.9/39.9 MB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m16.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for seqeval (setup.py) ... [?25l[?25hdone
[31mERROR: pip's dependency resolver

In [None]:
import datasets
import numpy as np
from transformers import BertTokenizerFast
from transformers import DataCollatorForTokenClassification
from transformers import AutoModelForTokenClassification



In [None]:
import pandas as pd

def preprocess_file(filepath):
    """Parse a CoNLL-style token/tag file into a flat, token-level DataFrame.

    The file is read line by line:
      * a blank line ends the current sentence (runs of blank lines count once),
      * lines starting with ``-DOCSTART-`` are skipped,
      * every other line is split on whitespace; the first field is taken as the
        word and the last field as its NER tag.

    Parameters:
    filepath (str): Path of the text file to read (decoded as UTF-8).

    Returns:
    pandas.DataFrame: One row per token with the columns ``sentence_idx``
                      (0-based running sentence number), ``word`` and ``ner_tag``.
                      The three columns are present even when the file holds no tokens.
    """
    columns = ['sentence_idx', 'word', 'ner_tag']
    rows = []
    sentence_idx = 0             # index of the sentence currently being read
    sentence_has_tokens = False  # True once the current sentence has at least one token

    # Explicit encoding so the result does not depend on the platform default.
    with open(filepath, 'r', encoding='utf-8') as file:
        for line in file:
            line = line.strip()

            # A blank line closes the sentence, but only if it actually had tokens;
            # this keeps consecutive blank lines from skipping sentence numbers.
            if line == "":
                if sentence_has_tokens:
                    sentence_idx += 1
                    sentence_has_tokens = False
                continue

            # Skip document-boundary markers.
            if line.startswith("-DOCSTART-"):
                continue

            # First field is the word, last field is the tag; middle fields are ignored.
            parts = line.split()
            rows.append({
                'sentence_idx': sentence_idx,
                'word': parts[0],
                'ner_tag': parts[-1]
            })
            sentence_has_tokens = True

    # Passing `columns` keeps the schema stable for an empty file, so code that
    # groups by 'sentence_idx' does not fail on a missing column.
    return pd.DataFrame(rows, columns=columns)

# Parse each split file (train / dev / test) into a token-level DataFrame.
train_data, dev_data, test_data = [
    preprocess_file(split_path)
    for split_path in ('train.txt', 'dev.txt', 'test.txt')
]


In [None]:
def group_by_sentence(dataframe):
    """Collapse a token-level frame into one record per sentence.

    Parameters:
    dataframe (pandas.DataFrame): Frame with the columns ``sentence_idx``, ``word``
                                  and ``ner_tag`` (one row per token).

    Returns:
    list[dict]: One dict per distinct ``sentence_idx``, in the order produced by the
                group-by, with the keys ``sentence_idx``, ``words`` (list of that
                sentence's words) and ``ner_tags`` (list of the matching tags).
    """
    # Gather every non-key column into a list per sentence.
    per_sentence = dataframe.groupby('sentence_idx').agg(list).reset_index()
    return [
        {
            'sentence_idx': row['sentence_idx'],
            'words': row['word'],
            'ner_tags': row['ner_tag'],
        }
        for _, row in per_sentence.iterrows()
    ]

In [None]:
from datasets import Dataset, DatasetDict
# Regroup the token-level frames into one record per sentence.
train_sentences = group_by_sentence(train_data)
dev_sentences = group_by_sentence(dev_data)
test_sentences = group_by_sentence(test_data)


def _sentences_to_dataset(sentences):
    """Pivot a list of sentence records into a column-oriented `Dataset`."""
    return Dataset.from_dict({
        column: [record[column] for record in sentences]
        for column in ('sentence_idx', 'words', 'ner_tags')
    })


# One Dataset per split.
train_dataset = _sentences_to_dataset(train_sentences)
dev_dataset = _sentences_to_dataset(dev_sentences)
test_dataset = _sentences_to_dataset(test_sentences)

# Bundle the three splits under the keys 'train', 'dev' and 'test'.
conll20031 = DatasetDict({
    'train': train_dataset,
    'dev': dev_dataset,
    'test': test_dataset,
})

In [None]:
# Show the first training sentence (words and string tags) as a sanity check.
conll20031['train'][0]

{'sentence_idx': 0,
 'words': ['[',
  'Triple',
  'therapy',
  'regimens',
  'involving',
  'H2',
  'blockaders',
  'for',
  'therapy',
  'of',
  'Helicobacter',
  'pylori',
  'infections',
  ']',
  '.'],
 'ner_tags': ['O',
  'O',
  'O',
  'O',
  'O',
  'I-INT',
  'I-INT',
  'O',
  'O',
  'O',
  'I-OUT',
  'I-OUT',
  'I-OUT',
  'I-OUT',
  'O']}

In [None]:
!pip install nltk spacy -q
import nltk
import spacy
# Fetch the NLTK resources; the 'stopwords' corpus is read a few lines below.
nltk.download('punkt')
nltk.download('stopwords')
# Load spaCy's small English pipeline.
nlp = spacy.load('en_core_web_sm')
from nltk.corpus import stopwords
from string import punctuation

# English stop words as a set, for fast membership tests.
stop_words = set(stopwords.words('english'))

def filter_data(example):
    """Keep only the entity-tagged, non-stop-word, non-punctuation tokens of a sentence.

    Parameters:
    example (dict): One sentence with
                    - "words": list of word strings,
                    - "ner_tags": list of string tags, one per word.

    Returns:
    dict: ``{'words': [...], 'ner_tags': [...]}`` holding the surviving words and
          their tags, in the original order.

    NOTE(review): every token tagged 'O' is dropped here, so later stages only ever
    see entity labels. Confirm that this is intended.
    """
    filtered_words = []
    filtered_ner_tags = []
    # Pair each original word with its own tag. Re-tokenizing the joined sentence
    # with spaCy can split one word into several tokens, and zipping those tokens
    # against the per-word tags would shift every later tag onto the wrong token.
    for word, ner_tag in zip(example['words'], example['ner_tags']):
        if ner_tag == 'O':
            continue
        # `punctuation` is a string, so this is a substring test on it.
        if word.lower() in stop_words or word in punctuation:
            continue
        filtered_words.append(word)
        filtered_ner_tags.append(ner_tag)
    return {'words': filtered_words, 'ner_tags': filtered_ner_tags}
# Run filter_data over the examples of the dataset dict built earlier.
filtered_dataset = conll20031.map(filter_data)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


Map:   0%|          | 0/27879 [00:00<?, ? examples/s]

Map:   0%|          | 0/7049 [00:00<?, ? examples/s]

Map:   0%|          | 0/2064 [00:00<?, ? examples/s]

In [None]:
# Define a function to convert ner_tags to integers using the label_to_id mapping
label_list = ['O', 'I-PAR', 'I-INT', 'I-OUT']  # Add all unique labels
label_to_id = {label: i for i, label in enumerate(label_list)}
def convert_labels_to_ids(example):
    example['ner_tags'] = [label_to_id[label] for label in example['ner_tags']]
    return example

# Apply the conversion to the filtered dataset, so that ner_tags holds
# integer ids instead of tag strings.
conll2003 = filtered_dataset.map(convert_labels_to_ids)


Map:   0%|          | 0/27879 [00:00<?, ? examples/s]

Map:   0%|          | 0/7049 [00:00<?, ? examples/s]

Map:   0%|          | 0/2064 [00:00<?, ? examples/s]

In [None]:
# Show the first training example after filtering and id conversion.
conll2003['train'][0]

{'sentence_idx': 0,
 'words': ['H2', 'blockaders', 'Helicobacter', 'pylori', 'infections'],
 'ner_tags': [2, 2, 3, 3, 3]}

In [None]:
from transformers import AutoTokenizer, AutoModel
# Load the tokenizer that belongs to the BiomedBERT checkpoint named below.
tokenizer = AutoTokenizer.from_pretrained("microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/385 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/226k [00:00<?, ?B/s]

# Problem of consecutive subwords.

### Note that transformers are often pretrained with subword tokenizers, meaning that even if your inputs have been split into words already, each of those words could be split again by the tokenizer.

### This means that we need to do some processing on our labels, because the input ids returned by the tokenizer are longer than the lists of labels our dataset contains.

This happens first because some special tokens may be added (such as a [CLS] at the start and a [SEP] at the end), and second because a single word may be split into multiple tokens:

## Strategy to handle above - Here we set the labels of all special tokens to -100 (the index that is ignored by PyTorch) and the labels of all other tokens to the label of the word they come from. Another strategy is to set the label only on the first token obtained from a given word, and give a label of -100 to the other subtokens from the same word. We propose the two strategies here, just change the value of the following flag:

-----------------------------------

### Setting –100 as the label for these special tokens and the subwords we wish to mask during training:

Why did we choose –100 as the ID to mask subword representations? The reason is
that in PyTorch the cross-entropy loss class torch.nn.CrossEntropyLoss has an
attribute called ignore_index whose default value is –100. Targets with this index
are ignored during training.

Also we can use it to ignore the tokens associated with consecutive subwords.

-----------------------------------

## The cells below are just for checking the output of some variables before applying `tokenize_and_align_labels()`

In [None]:
# Take the first training example as a probe.
example_text = conll2003['train'][0]

# The words are already split, so tell the tokenizer not to split on whitespace itself.
tokenized_input = tokenizer(example_text["words"], is_split_into_words=True)

# Map the ids back to token strings for inspection.
tokens = tokenizer.convert_ids_to_tokens(tokenized_input["input_ids"])

# For each token, the index of the word it came from (None for special tokens).
word_ids = tokenized_input.word_ids()

print(word_ids)

''' As we can see, it returns a list with the same number of elements as our processed input ids, mapping special tokens to None and all other tokens to their respective word. This way, we can align the labels with the processed input ids. '''

# Display the raw tokenizer output as the cell result.
tokenized_input

[None, 0, 1, 1, 2, 3, 4, None]


{'input_ids': [2, 4881, 9847, 5675, 22293, 10665, 5024, 3], 'token_type_ids': [0, 0, 0, 0, 0, 0, 0, 0], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1]}

## Problem of Sub-Tokens - The input ids returned by the tokenizer are longer than the lists of labels our dataset contains.

In [None]:
# Compare the number of word-level labels with the number of input ids.
# The two differ: special tokens and sub-word pieces add input ids that have no label.
len(example_text['ner_tags']), len(tokenized_input["input_ids"])

(5, 8)

## The below function `tokenize_and_align_labels` does 2 jobs

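1. set –100 as the label for the special tokens, so that they are ignored during training
2. when `label_all_tokens=False`, also mask (label –100) every subword after the first subword of a word; with the default `label_all_tokens=True`, those subwords inherit the label of their word instead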


### Then we align the labels with the token ids using the strategy we picked:

In [None]:
def tokenize_and_align_labels(examples, label_all_tokens=True, max_length=512):
    """
    Tokenize a batch of pre-split sentences and align the word-level labels with the
    resulting (sub-word) tokens.

    Parameters:
    examples (dict): A batch of sentences (dict of lists), with
                     - "words": list of sentences, each a list of words,
                     - "ner_tags": list of label-id lists, one label per word.

    label_all_tokens (bool): If True (default), every sub-word token of a word gets the
                             word's label. If False, only the first token of a word is
                             labelled and the remaining sub-word tokens get -100.

    max_length (int): Maximum sequence length passed to the tokenizer. Without it,
                      ``truncation=True`` has no effect for this tokenizer (it warns
                      and falls back to no truncation). 512 is assumed to match the
                      model's maximum input length - TODO confirm.

    Returns:
    tokenized_inputs: The tokenizer output with an extra "labels" entry holding one
                      aligned label list per sentence.
    """
    tokenized_inputs = tokenizer(
        examples["words"],
        truncation=True,
        max_length=max_length,
        is_split_into_words=True,
    )
    labels = []
    for i, word_labels in enumerate(examples["ner_tags"]):
        # word_ids() maps every token of sentence i to the index of the word it came
        # from; special tokens (e.g. [CLS] / [SEP]) map to None.
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        previous_word_idx = None
        label_ids = []
        for word_idx in word_ids:
            if word_idx is None:
                # Special token: -100 so that the loss function ignores it.
                label_ids.append(-100)
            elif word_idx != previous_word_idx or label_all_tokens:
                # First token of a word, or a later sub-word token while all tokens
                # are being labelled: use the word's own label.
                label_ids.append(word_labels[word_idx])
            else:
                # Later sub-word token of the same word, masked out.
                label_ids.append(-100)
            previous_word_idx = word_idx
        labels.append(label_ids)
    tokenized_inputs["labels"] = labels
    return tokenized_inputs

In [None]:
# Try the function on a single example. Slicing with [4:5] (rather than indexing
# with [4]) keeps the batched dict-of-lists layout that the function iterates over.
q = tokenize_and_align_labels(conll2003['train'][4:5])
print(q)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


{'input_ids': [[2, 2845, 2078, 27099, 6364, 1031, 4934, 4638, 1011, 3]], 'token_type_ids': [[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], 'labels': [[-100, 2, 2, 2, 2, 2, 2, 2, 2, -100]]}


In [None]:
# Print each token of that example next to its aligned label;
# the token is left-aligned and padded with '_' to 40 characters.
for token, label in zip(tokenizer.convert_ids_to_tokens(q["input_ids"][0]),q["labels"][0]):
    print(f"{token:_<40} {label}")

[CLS]___________________________________ -100
ran_____________________________________ 2
##iti___________________________________ 2
##dine__________________________________ 2
rn______________________________________ 2
##t_____________________________________ 2
cam_____________________________________ 2
mt______________________________________ 2
##z_____________________________________ 2
[SEP]___________________________________ -100


In [None]:
# Tokenize and align labels for the whole dataset; batched=True hands the function
# a batch of examples per call instead of a single example.
tokenized_datasets = conll2003.map(tokenize_and_align_labels, batched=True)

Map:   0%|          | 0/27879 [00:00<?, ? examples/s]

Map:   0%|          | 0/7049 [00:00<?, ? examples/s]

Map:   0%|          | 0/2064 [00:00<?, ? examples/s]

In [None]:
import torch.nn as nn
# Load the pretrained encoder with a fresh token-classification head.
# The head size is derived from label_list instead of a hard-coded 4, and the
# id <-> tag maps are stored in the model config so predictions carry tag names.
model = AutoModelForTokenClassification.from_pretrained(
    "microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext",
    num_labels=len(label_list),
    id2label=dict(enumerate(label_list)),
    label2id={label: i for i, label in enumerate(label_list)},
)



pytorch_model.bin:   0%|          | 0.00/440M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at microsoft/BiomedNLP-BiomedBERT-base-uncased-abstract-fulltext and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
from transformers import TrainingArguments, Trainer
# Training configuration: checkpoints go to "test-ner", evaluation runs once per epoch.
args = TrainingArguments(
    output_dir="test-ner",
    evaluation_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
)



In [None]:
# Collator that assembles token-classification batches using this tokenizer.
data_collator = DataCollatorForTokenClassification(tokenizer)

In [None]:
# Load the seqeval metric. trust_remote_code=True pre-approves the metric's loading
# script, so the cell does not stop at an interactive [y/N] prompt on Run All.
metric = datasets.load_metric("seqeval", trust_remote_code=True)

  metric = datasets.load_metric("seqeval")


Downloading builder script:   0%|          | 0.00/2.47k [00:00<?, ?B/s]

The repository for seqeval contains custom code which must be executed to correctly load the dataset. You can inspect the repository content at https://hf.co/datasets/seqeval.
You can avoid this prompt in future by passing the argument `trust_remote_code=True`.

Do you wish to run the custom code? [y/N] y


In [None]:
# First training example, used below to try out the metric.
example = conll2003['train'][0]

In [None]:
# Display the tag vocabulary; a tag's position in the list is its integer id.
label_list

['O', 'I-PAR', 'I-INT', 'I-OUT']

In [None]:


# Turn the example's integer ids back into tag strings.
labels = [label_list[i] for i in example["ner_tags"]]

# Sanity check: the same sequence is passed as both prediction and reference.
metric.compute(predictions=[labels], references=[labels])

{'INT': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'OUT': {'precision': 1.0, 'recall': 1.0, 'f1': 1.0, 'number': 1},
 'overall_precision': 1.0,
 'overall_recall': 1.0,
 'overall_f1': 1.0,
 'overall_accuracy': 1.0}

## Compute Metrics

This compute_metrics() function first takes the argmax of the logits to convert them to predictions (as usual, the logits and the probabilities are in the same order, so we don’t need to apply the softmax). Then we have to convert both labels and predictions from integers to strings. We remove all the values where the label is -100, then pass the results to the metric.compute() method:

In [None]:
def compute_metrics(eval_preds):
    """
    Compute the evaluation metrics for Named Entity Recognition (NER).

    Parameters:
    eval_preds (tuple): ``(logits, labels)``. The logits are reduced with an argmax
                        over axis 2 (the class axis); ``labels`` holds the matching
                        label ids, with -100 marking positions to ignore.

    Returns:
    dict: The overall precision, recall, F1 score and accuracy reported by the metric,
          under the keys "precision", "recall", "f1" and "accuracy".
    """
    pred_logits, labels = eval_preds

    # Logits and probabilities are in the same order, so argmax needs no softmax.
    pred_ids = np.argmax(pred_logits, axis=2)

    predictions = []
    true_labels = []
    for pred_row, label_row in zip(pred_ids, labels):
        # Drop every position whose gold label is -100, then turn the remaining
        # ids into tag strings for predictions and references alike.
        kept = [(p, l) for p, l in zip(pred_row, label_row) if l != -100]
        predictions.append([label_list[p] for p, _ in kept])
        true_labels.append([label_list[l] for _, l in kept])

    results = metric.compute(predictions=predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

In [None]:
# Wire the model, training arguments, data splits, collator and metric function together.
# Training uses the "train" split; per-epoch evaluation uses the "dev" split.
trainer = Trainer(
    model=model,
    args=args,
    train_dataset=tokenized_datasets["train"],
    eval_dataset=tokenized_datasets["dev"],
    data_collator=data_collator,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [None]:
# Run the fine-tuning loop configured above.
trainer.train()

Epoch,Training Loss,Validation Loss,Precision,Recall,F1,Accuracy
1,0.4376,0.429822,0.539153,0.503629,0.520786,0.842177
2,0.3605,0.429611,0.553582,0.513496,0.532786,0.845013
3,0.2972,0.440917,0.555778,0.52165,0.538174,0.846165


TrainOutput(global_step=5229, training_loss=0.37681338262275293, metrics={'train_runtime': 995.8671, 'train_samples_per_second': 83.984, 'train_steps_per_second': 5.251, 'total_flos': 1826925032523408.0, 'train_loss': 0.37681338262275293, 'epoch': 3.0})

In [None]:
# Save the fine-tuned model into the local "ner_model" directory.
model.save_pretrained("ner_model")

In [None]:
!pip install torch



In [None]:
# Evaluate on the test split, then build a token-level confusion matrix.
import torch
import torch.nn.functional as F
from sklearn.metrics import confusion_matrix

test_results = trainer.predict(tokenized_datasets["test"])
print(test_results.metrics)

# Positions labelled -100 carry no gold label and are left out. A boolean mask
# selects the remaining positions in the same row-major order as a nested loop
# over sentences and tokens would.
labelled_positions = test_results.label_ids != -100
true_labels = test_results.label_ids[labelled_positions]     # gold label ids, 1-D
token_logits = test_results.predictions[labelled_positions]  # raw logits, one row per token

# The model outputs logits; softmax turns each row into probabilities that sum to 1.
# Building the tensor from one ndarray avoids the slow list-of-arrays conversion.
predicted_probs = F.softmax(torch.tensor(token_logits), dim=1).numpy()

# Compute confusion matrix (rows: gold labels, columns: predicted labels).
cm = confusion_matrix(true_labels, np.argmax(predicted_probs, axis=1))
print("Confusion Matrix:")
print(cm)

{'test_loss': 0.2511827349662781, 'test_precision': 0.7767571884984026, 'test_recall': 0.7684709600948242, 'test_f1': 0.7725918570009931, 'test_accuracy': 0.9220386974988202, 'test_runtime': 3.2423, 'test_samples_per_second': 636.591, 'test_steps_per_second': 39.787}
Confusion Matrix:
[[2824  151   70]
 [  84 3127  215]
 [  83  223 3818]]


  predicted_probs = F.softmax(torch.tensor(predicted_probs), dim=1).numpy()


In [None]:
# Save the tokenizer files into the local "tokenizer" directory.
tokenizer.save_pretrained("tokenizer")

('tokenizer/tokenizer_config.json',
 'tokenizer/special_tokens_map.json',
 'tokenizer/vocab.txt',
 'tokenizer/added_tokens.json',
 'tokenizer/tokenizer.json')

In [None]:
# Label maps to be written into the saved model's config.json.
# JSON object keys are always strings, hence str(i) on the id2label side.
id2label = {
    str(i): label for i, label in enumerate(label_list)
}
# label2id maps a tag to its integer class index, so the values stay ints.
label2id = {
    label: i for i, label in enumerate(label_list)
}

In [None]:
import json

In [None]:
# Read the saved model configuration; the with-block closes the file afterwards.
with open("ner_model/config.json") as config_file:
    config = json.load(config_file)

In [None]:
# Attach the label maps to the loaded configuration dict.
config["id2label"] = id2label
config["label2id"] = label2id

In [None]:
# Write the updated configuration back; the with-block guarantees the file is
# flushed and closed before the model is reloaded from this directory.
with open("ner_model/config.json", "w") as config_file:
    json.dump(config, config_file)

In [None]:
# Reload the fine-tuned model from the "ner_model" directory.
model_fine_tuned = AutoModelForTokenClassification.from_pretrained("ner_model")

In [None]:
from transformers import pipeline

In [None]:
# Build an NER inference pipeline from the reloaded model and the tokenizer.
# NOTE(review): this reuses the name `nlp`, which an earlier cell bound to the
# spaCy pipeline; from here on `nlp` refers to this object.
nlp = pipeline("ner", model=model_fine_tuned, tokenizer=tokenizer)


# Sample sentence to run through the pipeline.
example = "Bill Gates is the Founder of Microsoft"

ner_results = nlp(example)

print(ner_results)

Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


[{'entity': 'I-PAR', 'score': 0.58707446, 'index': 1, 'word': 'bill', 'start': 0, 'end': 4}, {'entity': 'I-PAR', 'score': 0.67343783, 'index': 2, 'word': 'gates', 'start': 5, 'end': 10}, {'entity': 'I-PAR', 'score': 0.82685256, 'index': 3, 'word': 'is', 'start': 11, 'end': 13}, {'entity': 'I-PAR', 'score': 0.84685427, 'index': 4, 'word': 'the', 'start': 14, 'end': 17}, {'entity': 'I-PAR', 'score': 0.8137533, 'index': 5, 'word': 'founder', 'start': 18, 'end': 25}, {'entity': 'I-PAR', 'score': 0.6825755, 'index': 6, 'word': 'of', 'start': 26, 'end': 28}, {'entity': 'I-INT', 'score': 0.47125396, 'index': 7, 'word': 'microsoft', 'start': 29, 'end': 38}]
