In [7]:
!pip install transformers datasets evaluate seqeval --quiet

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.6/43.6 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.4/491.4 kB[0m [31m8.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.0/84.0 kB[0m [31m5.9 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m7.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m12.4 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m7.1 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.8/194.8 kB[0m [31m7.6 MB/s[0m eta [36m

Prepare DataSet

In [8]:
# Import libraries
import re
from collections import defaultdict

# Function to parse PubTator file
def parse_pubtator_file(filepath):
    data = []
    annotations = defaultdict(list)
    with open(filepath, 'r') as f:
        lines = f.readlines()

    current_text = ""
    current_pm_id = ""

    for line in lines:
        line = line.strip()
        if not line:
            continue

        if "|t|" in line or "|a|" in line:
            parts = line.split("|")
            pmid = parts[0]
            text = parts[2]
            if "|t|" in line:
                current_text = text
                current_pm_id = pmid
            else:
                current_text += " " + text
        else:
            parts = line.split("\t")
            if len(parts) >= 6:
                start, end, mention, entity_type, entity_id = int(parts[1]), int(parts[2]), parts[3], parts[4], parts[5]
                entity_type = entity_type.upper()
                annotations[current_pm_id].append((start, end, mention, entity_type))

        if line == lines[-1] or "|a|" in line:
            if current_pm_id and current_text:
                data.append({
                    "pmid": current_pm_id,
                    "text": current_text,
                    "annotations": annotations[current_pm_id]
                })
    return data

In [9]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Load DataSet

In [10]:
# Load your dataset
data_path = '/content/drive/MyDrive/DL/CDR_Data/CDR.Corpus.v010516/CDR_TrainingSet.PubTator.txt'  # Update to your correct path
dataset = parse_pubtator_file(data_path)

Build tokens and BIO labels

In [11]:
# Build tokens and BIO labels
label_list = ["O", "B-CHEMICAL", "I-CHEMICAL", "B-DISEASE", "I-DISEASE"]
label2id = {label: i for i, label in enumerate(label_list)}
id2label = {i: label for label, i in label2id.items()}

def prepare_ner_dataset(parsed_data):
    tokenized_texts = []
    labels = []

    for item in parsed_data:
        text = item['text']
        entities = item['annotations']
        tokens = text.split()
        label_seq = ['O'] * len(tokens)

        for start, end, mention, entity_type in entities:
            entity_type = entity_type.upper()
            for idx, token in enumerate(tokens):
                if mention in token:
                    label_seq[idx] = "B-" + entity_type if label_seq[idx] == 'O' else "I-" + entity_type

        tokenized_texts.append(tokens)
        labels.append([label2id[label] for label in label_seq])

    return tokenized_texts, labels

tokens, ner_labels = prepare_ner_dataset(dataset)

print("Example tokens:", tokens[0])
print("Example labels:", ner_labels[0])

Example tokens: ['Naloxone', 'reverses', 'the', 'antihypertensive', 'effect', 'of', 'clonidine.', 'In', 'unanesthetized,', 'spontaneously', 'hypertensive', 'rats', 'the', 'decrease', 'in', 'blood', 'pressure', 'and', 'heart', 'rate', 'produced', 'by', 'intravenous', 'clonidine,', '5', 'to', '20', 'micrograms/kg,', 'was', 'inhibited', 'or', 'reversed', 'by', 'nalozone,', '0.2', 'to', '2', 'mg/kg.', 'The', 'hypotensive', 'effect', 'of', '100', 'mg/kg', 'alpha-methyldopa', 'was', 'also', 'partially', 'reversed', 'by', 'naloxone.', 'Naloxone', 'alone', 'did', 'not', 'affect', 'either', 'blood', 'pressure', 'or', 'heart', 'rate.', 'In', 'brain', 'membranes', 'from', 'spontaneously', 'hypertensive', 'rats', 'clonidine,', '10(-8)', 'to', '10(-5)', 'M,', 'did', 'not', 'influence', 'stereoselective', 'binding', 'of', '[3H]-naloxone', '(8', 'nM),', 'and', 'naloxone,', '10(-8)', 'to', '10(-4)', 'M,', 'did', 'not', 'influence', 'clonidine-suppressible', 'binding', 'of', '[3H]-dihydroergocryptine',

Eval(Evaluation) dataset

In [None]:
from sklearn.model_selection import train_test_split
from datasets import Dataset

# 1. Split into training and evaluation
train_tokens, eval_tokens, train_labels, eval_labels = train_test_split(
    tokens, ner_labels, test_size=0.2, random_state=42
)

# 2. Create Hugging Face Dataset objects
train_dataset = Dataset.from_dict({
    'tokens': train_tokens,
    'ner_tags': train_labels
})

eval_dataset = Dataset.from_dict({
    'tokens': eval_tokens,
    'ner_tags': eval_labels
})

Load Dataset and Tokenizer

In [12]:
from datasets import Dataset
from transformers import AutoTokenizer

# Build Hugging Face dataset
train_dataset = Dataset.from_dict({
    'tokens': tokens,
    'ner_tags': ner_labels
})

# Load BioBERT tokenizer
model_name = "dmis-lab/biobert-base-cased-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Tokenize and align labels
def tokenize_and_align_labels(examples):
    tokenized_inputs = tokenizer(
        examples['tokens'],
        padding="max_length",    # 🔥 pad exactly to 512
        truncation=True,         # 🔥 truncate anything above 512
        max_length=512,          # 🔥 force BERT max size
        is_split_into_words=True,
        return_tensors=None
    )
    labels = []

    for i, label in enumerate(examples['ner_tags']):
        word_ids = tokenized_inputs.word_ids(batch_index=i)
        label_ids = []
        previous_word_idx = None
        for word_idx in word_ids:
            if word_idx is None:
                label_ids.append(-100)
            elif word_idx != previous_word_idx:
                label_ids.append(label[word_idx])
            else:
                label_ids.append(label[word_idx])
            previous_word_idx = word_idx
        labels.append(label_ids)

    tokenized_inputs["labels"] = labels
    return tokenized_inputs

# Apply tokenizer
train_dataset = train_dataset.map(tokenize_and_align_labels, batched=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/313 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Load Model and Set Trainer

In [13]:
#from transformers import AutoModelForTokenClassification, TrainingArguments, Trainer
#import evaluate
#import numpy as np

from transformers import AutoTokenizer, AutoModelForTokenClassification, TrainingArguments, Trainer
from datasets import Dataset
import evaluate
import os

# Load BioBERT model
model = AutoModelForTokenClassification.from_pretrained(
    model_name,
    num_labels=len(label_list),
    id2label=id2label,
    label2id=label2id
)

# Define evaluation metric
metric = evaluate.load("seqeval")

def compute_metrics(p):
    predictions, labels = p
    predictions = np.argmax(predictions, axis=2)

    true_labels = [[label_list[l] for l in label if l != -100] for label in labels]
    true_predictions = [
        [label_list[p] for (p, l) in zip(prediction, label) if l != -100]
        for prediction, label in zip(predictions, labels)
    ]

    results = metric.compute(predictions=true_predictions, references=true_labels)
    return {
        "precision": results["overall_precision"],
        "recall": results["overall_recall"],
        "f1": results["overall_f1"],
        "accuracy": results["overall_accuracy"],
    }

# Define training arguments
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./results",
    learning_rate=2e-5,
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=1,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    report_to="none"   # 🔥 NEW WAY TO DISABLE wandb
)

pytorch_model.bin:   0%|          | 0.00/436M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/436M [00:00<?, ?B/s]

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-base-cased-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Downloading builder script:   0%|          | 0.00/6.34k [00:00<?, ?B/s]

In [15]:
!pip install wandb --quiet
import os

#os.environ["WANDB_DISABLED"] = "true"

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    #eval_dataset=eval_dataset,  # if available
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)




  trainer = Trainer(


In [16]:
# Start training
trainer.train()

Step,Training Loss
10,1.1001
20,0.6604
30,0.4846
40,0.4137
50,0.358
60,0.3594


TrainOutput(global_step=63, training_loss=0.5519545778395638, metrics={'train_runtime': 2910.8217, 'train_samples_per_second': 0.172, 'train_steps_per_second': 0.022, 'total_flos': 130651921920000.0, 'train_loss': 0.5519545778395638, 'epoch': 1.0})

In [17]:

model.save_pretrained("./saved_biobert_ner")
tokenizer.save_pretrained("./saved_biobert_ner")
print("✅ Model and tokenizer saved successfully!")

✅ Model and tokenizer saved successfully!


In [18]:
from transformers import AutoModelForTokenClassification, AutoTokenizer

model = AutoModelForTokenClassification.from_pretrained("./saved_biobert_ner")
tokenizer = AutoTokenizer.from_pretrained("./saved_biobert_ner")

In [19]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [20]:
!cp -r /content/saved_biobert_ner /content/drive/MyDrive/

In [21]:
!ls /content/drive/MyDrive/saved_biobert_ner


config.json	   special_tokens_map.json  tokenizer.json
model.safetensors  tokenizer_config.json    vocab.txt


Prediction

In [22]:
import torch

id2label = {
    0: "O",
    1: "B-CHEMICAL",
    2: "I-CHEMICAL",
    3: "B-DISEASE",
    4: "I-DISEASE"
}

def predict_entities(sentence):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, max_length=512)
    with torch.no_grad():
        outputs = model(**inputs)
    predictions = torch.argmax(outputs.logits, dim=2)
    tokens = tokenizer.convert_ids_to_tokens(inputs["input_ids"][0])
    labels = [id2label[p.item()] for p in predictions[0]]

    for token, label in zip(tokens, labels):
        if label != "O":
            print(f"{token}: {label}")

# Example
predict_entities("Paracetamol is used to treat fever and headache in patients.")

para: I-CHEMICAL
##ce: B-CHEMICAL
##tam: I-CHEMICAL
##ol: I-CHEMICAL


Streamlit App

In [23]:
!zip -r saved_biobert_ner.zip saved_biobert_ner

  adding: saved_biobert_ner/ (stored 0%)
  adding: saved_biobert_ner/tokenizer.json (deflated 70%)
  adding: saved_biobert_ner/model.safetensors (deflated 7%)
  adding: saved_biobert_ner/vocab.txt (deflated 49%)
  adding: saved_biobert_ner/tokenizer_config.json (deflated 74%)
  adding: saved_biobert_ner/config.json (deflated 52%)
  adding: saved_biobert_ner/special_tokens_map.json (deflated 42%)


In [24]:
from google.colab import files
files.download('saved_biobert_ner.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>