# **Enabling BioBERT to Question Answering using Stanford Question Answering Dataset (version01)**

In [2]:
!pip install tensorboard
!pip install -U scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl.metadata (15 kB)
Downloading scikit_learn-1.6.1-cp312-cp312-win_amd64.whl (11.1 MB)
   ---------------------------------------- 0.0/11.1 MB ? eta -:--:--
    --------------------------------------- 0.3/11.1 MB ? eta -:--:--
   ----- ---------------------------------- 1.6/11.1 MB 6.5 MB/s eta 0:00:02
   ---------------- ----------------------- 4.5/11.1 MB 9.3 MB/s eta 0:00:01
   ------------------------ --------------- 6.8/11.1 MB 10.0 MB/s eta 0:00:01
   -------------------------------- ------- 8.9/11.1 MB 10.1 MB/s eta 0:00:01
   ---------------------------------------  11.0/11.1 MB 10.1 MB/s eta 0:00:01
   ---------------------------------------- 11.1/11.1 MB 9.8 MB/s eta 0:00:00
Installing collected packages: scikit-learn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 1.5.1
    Uninstalling scikit-learn-1.5.1:
      Successfully uninstalled scikit-learn-1.5.1
Succes

  You can safely remove it manually.


In [3]:
!pip install -U transformers
!pip install transformers datasets
!pip install transformers datasets evaluate
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu121
!pip install transformers[torch]

Collecting transformers
  Downloading transformers-4.51.3-py3-none-any.whl.metadata (38 kB)
Collecting huggingface-hub<1.0,>=0.30.0 (from transformers)
  Downloading huggingface_hub-0.30.2-py3-none-any.whl.metadata (13 kB)
Downloading transformers-4.51.3-py3-none-any.whl (10.4 MB)
   ---------------------------------------- 0.0/10.4 MB ? eta -:--:--
   -- ------------------------------------- 0.5/10.4 MB 3.4 MB/s eta 0:00:03
   -------- ------------------------------- 2.1/10.4 MB 6.2 MB/s eta 0:00:02
   ------------------- -------------------- 5.0/10.4 MB 9.4 MB/s eta 0:00:01
   ---------------------------- ----------- 7.3/10.4 MB 10.1 MB/s eta 0:00:01
   -------------------------------------- - 10.0/10.4 MB 10.5 MB/s eta 0:00:01
   ---------------------------------------- 10.4/10.4 MB 10.3 MB/s eta 0:00:00
Downloading huggingface_hub-0.30.2-py3-none-any.whl (481 kB)
Installing collected packages: huggingface-hub, transformers
  Attempting uninstall: huggingface-hub
    Found existing 

In [1]:
from datasets import load_dataset, concatenate_datasets
from datasets import DatasetDict
from datasets import Dataset
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
from transformers import AutoTokenizer, pipeline
from transformers import default_data_collator
from transformers import TrainingArguments
from transformers import AutoModelForQuestionAnswering
from transformers import Trainer, default_data_collator
from transformers.trainer_utils import EvalPrediction
from transformers import EarlyStoppingCallback
import evaluate, torch, json
from tqdm import tqdm
import numpy as np
import warnings, os, re, string
warnings.filterwarnings("ignore")
from collections import OrderedDict
from evaluate import load
torch.cuda.is_available()

ModuleNotFoundError: No module named 'evaluate'

## BioBERT and SQuaDv1 background check

In [7]:
model_pretrained = AutoModelForQuestionAnswering.from_pretrained("dmis-lab/biobert-base-cased-v1.1")
tokenizer_raw = AutoTokenizer.from_pretrained("dmis-lab/biobert-base-cased-v1.1")

qa_raw = pipeline("question-answering", model=model_pretrained, tokenizer=tokenizer_raw)

sample = {
    "context": "Messenger RNA (mRNA) carries the genetic information from DNA to the ribosome, where proteins are synthesized.",
    "question": "What does mRNA do?"
}

result = qa_raw(sample)
print("Answer:", result["answer"])
print("Score:", round(result["score"], 4))

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Device set to use cuda:0


Answer: to the ribosome, where proteins are
Score: 0.0023


In [5]:
checkpoint = "dmis-lab/biobert-base-cased-v1.1"
model = AutoModelForQuestionAnswering.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)

Some weights of BertForQuestionAnswering were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['qa_outputs.bias', 'qa_outputs.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
datasets = load_dataset("squad")

full_dataset = concatenate_datasets([datasets["train"], datasets["validation"]])
train_valid_split = full_dataset.train_test_split(test_size=0.2, seed=42)
valid_test_split = train_valid_split["test"].train_test_split(test_size=0.5, seed=42)

train_dataset = train_valid_split["train"]
validation_dataset = valid_test_split["train"]
test_dataset = valid_test_split["test"]

print(f"Train size: {len(train_dataset)}")
print(f"Validation size: {len(validation_dataset)}")
print(f"Test size: {len(test_dataset)}")

Train size: 78535
Validation size: 9817
Test size: 9817


## Data preprocessing

In [7]:
def preprocess_function(examples):
    questions = [q.strip() for q in examples["question"]]
    
    inputs = tokenizer(
        questions,
        examples["context"],
        max_length=384,
        truncation="only_second",   # Truncate context if too long
        stride=128,                 # Overlap if splitting
        return_overflowing_tokens=True,
        return_offsets_mapping=True,
        padding="max_length",
    )

    sample_mapping = inputs.pop("overflow_to_sample_mapping")
    offset_mapping = inputs.pop("offset_mapping")

    start_positions = []
    end_positions = []

    for i, offsets in enumerate(offset_mapping):
        input_ids = inputs["input_ids"][i]
        cls_index = input_ids.index(tokenizer.cls_token_id)  # [CLS] token

        sequence_ids = inputs.sequence_ids(i)
        sample_index = sample_mapping[i]
        answers = examples["answers"][sample_index]
        
        if len(answers["answer_start"]) == 0:
            start_positions.append(cls_index)
            end_positions.append(cls_index)
        else:
            start_char = answers["answer_start"][0]
            end_char = start_char + len(answers["text"][0])

            token_start_index = 0
            while sequence_ids[token_start_index] != 1:
                token_start_index += 1

            token_end_index = len(input_ids) - 1
            while sequence_ids[token_end_index] != 1:
                token_end_index -= 1

            if not (offsets[token_start_index][0] <= start_char and offsets[token_end_index][1] >= end_char):
                start_positions.append(cls_index)
                end_positions.append(cls_index)
            else:
                while token_start_index < len(offsets) and offsets[token_start_index][0] <= start_char:
                    token_start_index += 1
                start_positions.append(token_start_index - 1)

                while offsets[token_end_index][1] >= end_char:
                    token_end_index -= 1
                end_positions.append(token_end_index + 1)

    inputs["start_positions"] = start_positions
    inputs["end_positions"] = end_positions

    return inputs

In [8]:
tokenized_train = train_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=train_dataset.column_names,
)

tokenized_validation = validation_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=validation_dataset.column_names,
)

tokenized_test = test_dataset.map(
    preprocess_function,
    batched=True,
    remove_columns=test_dataset.column_names,
)

print(f"Train size: {len(tokenized_train)}")
print(f"Validation size: {len(tokenized_validation)}")
print(f"Test size: {len(tokenized_test)}")

Map:   0%|          | 0/78535 [00:00<?, ? examples/s]

Map:   0%|          | 0/9817 [00:00<?, ? examples/s]

Map:   0%|          | 0/9817 [00:00<?, ? examples/s]

Train size: 80219
Validation size: 10034
Test size: 9999


In [4]:
def normalize_answer(s):
    """Lower text and remove punctuation, articles and extra whitespace."""

    def remove_articles(text):
        return re.sub(r'\b(a|an|the)\b', ' ', text)

    def white_space_fix(text):
        return ' '.join(text.split())

    def remove_punc(text):
        exclude = set(string.punctuation)
        return ''.join(ch for ch in text if ch not in exclude)

    def lower(text):
        return text.lower()

    return white_space_fix(remove_articles(remove_punc(lower(s))))

In [5]:
def evaluate_qa(y_true, y_pred):
    assert len(y_true) == len(y_pred) # y_true and y_pred must be the same length

    exact_matches = []
    f1s = []

    for true_ans, pred_ans in zip(y_true, y_pred):
        true_ans = normalize_answer(true_ans)
        pred_ans = normalize_answer(pred_ans)

        # Exact match:
        exact = int(true_ans == pred_ans)
        exact_matches.append(exact)

        # F1 score
        true_tokens = true_ans.split()
        pred_tokens = pred_ans.split()

        common = set(true_tokens) & set(pred_tokens)
        if len(common) == 0:
            f1 = 0
        else:
            precision = len(common) / len(pred_tokens)
            recall = len(common) / len(true_tokens)
            f1 = (2 * precision * recall) / (precision + recall)
        f1s.append(f1)

    # Overall metrics
    exact_match = np.mean(exact_matches)
    f1 = np.mean(f1s)

    print(f"Exact Match (EM): {exact_match: .4f}")
    print(f"F1 Score: {f1: .4f}")

    return exact_match, f1

In [8]:
y_true = []
y_pred = []

for example in tqdm(test_dataset):
    context = example["context"]
    question = example["question"]
    true_answer = example["answers"]["text"][0]

    prediction = qa_raw(question=question, context=context)

    y_true.append(true_answer)
    y_pred.append(prediction["answer"])

  0%|                                                                                 | 9/9817 [00:00<01:55, 84.74it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|█████████████████████████████████████████████████████████████████████████████| 9817/9817 [01:11<00:00, 137.56it/s]


In [9]:
evaluate_qa(y_true, y_pred)

Exact Match (EM):  0.0028
F1 Score:  0.0509


(np.float64(0.002750331058368137), np.float64(0.05093859894513045))

## Fine-tuning

In [11]:
os.makedirs("./tensor_logs", exist_ok = True)

In [12]:
output_dir="biobert-enhanced-cp"

training_args = TrainingArguments(
    output_dir=output_dir,
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=3e-5,
    per_device_train_batch_size=24,
    per_device_eval_batch_size=24,
    gradient_accumulation_steps=2,
    gradient_checkpointing=True,
    num_train_epochs=4,
    weight_decay=0.01,
    save_total_limit=2,
    logging_dir="./logs",
    logging_steps=50,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    fp16=True,
    report_to="tensorboard",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False
)

In [13]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_validation,
    tokenizer=tokenizer,
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

In [14]:
trainer.train()

Epoch,Training Loss,Validation Loss
1,1.1251,1.027434
2,0.8315,0.956104
3,0.5656,1.056846


TrainOutput(global_step=6684, training_loss=0.9148379887455884, metrics={'train_runtime': 3827.9763, 'train_samples_per_second': 83.824, 'train_steps_per_second': 1.746, 'total_flos': 6.285251783784499e+16, 'train_loss': 0.9148379887455884, 'epoch': 3.9979060723900686})

In [15]:
save_path = "./biobert-enhanced-final"
trainer.save_model(save_path)
tokenizer.save_pretrained(save_path)

('./biobert-enhanced-final\\tokenizer_config.json',
 './biobert-enhanced-final\\special_tokens_map.json',
 './biobert-enhanced-final\\vocab.txt',
 './biobert-enhanced-final\\added_tokens.json',
 './biobert-enhanced-final\\tokenizer.json')

In [4]:
%load_ext tensorboard
%tensorboard --logdir ./logs

  You can safely remove it manually.


Looking in indexes: https://download.pytorch.org/whl/cu121
Collecting torchvision
  Downloading https://download.pytorch.org/whl/cu121/torchvision-0.20.1%2Bcu121-cp312-cp312-win_amd64.whl (6.1 MB)
     ---------------------------------------- 0.0/6.1 MB ? eta -:--:--
     --------------- ------------------------ 2.4/6.1 MB 11.2 MB/s eta 0:00:01
     -------------------------------- ------- 5.0/6.1 MB 11.6 MB/s eta 0:00:01
     ---------------------------------------- 6.1/6.1 MB 11.2 MB/s eta 0:00:00
Collecting torchaudio
  Downloading https://download.pytorch.org/whl/cu121/torchaudio-2.5.1%2Bcu121-cp312-cp312-win_amd64.whl (4.1 MB)
     ---------------------------------------- 0.0/4.1 MB ? eta -:--:--
     ---------------------- ----------------- 2.4/4.1 MB 12.2 MB/s eta 0:00:01
     ---------------------------------------- 4.1/4.1 MB 10.8 MB/s eta 0:00:00
Collecting torch
  Downloading https://download.pytorch.org/whl/cu121/torch-2.5.1%2Bcu121-cp312-cp312-win_amd64.whl (2449.3 MB)
   

In [17]:
model = AutoModelForQuestionAnswering.from_pretrained(save_path)
tokenizer = AutoTokenizer.from_pretrained(save_path)
qa_pipeline = pipeline("question-answering", model = model, tokenizer = tokenizer)

Device set to use cuda:0


In [22]:
y_true = []
y_pred = []

for example in tqdm(test_dataset):
    context = example["context"]
    question = example["question"]
    true_answer = example["answers"]["text"][0]  # gold answer

    prediction = qa_pipeline(question=question, context=context)

    y_true.append(true_answer)
    y_pred.append(prediction["answer"])

  0%|                                                                                         | 0/9817 [00:00<?, ?it/s]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|█████████████████████████████████████████████████████████████████████████████| 9817/9817 [01:12<00:00, 136.13it/s]


In [23]:
evaluate_qa(y_true, y_pred)

Exact Match (EM):  0.6609
F1 Score:  0.8022


(np.float64(0.660894366914536), np.float64(0.8022170270336673))