### Pipeline & helper

In [3]:
import torch
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [4]:
# MODEL_PATH = "../models/bert_ner_baseline_v1"
MODEL_PATH = "../outputs/models/biobert_ner_baseline_v1"


In [5]:
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)

In [6]:
   ner_pipeline = pipeline("token-classification", 
                            model=model,
                            tokenizer=tokenizer, 
                            device= 0 if torch.cuda.is_available() else -1, 
                            aggregation_strategy="simple")

Device set to use cpu


In [8]:


text = '''
Case studies highlight BioBERT’s transformative role in biomedical research. In oncology,
BioBERT-based models have been applied to classify clinical trial reports, extracting treatment
outcomes and adverse effects for drugs like Pembrolizumab and Nivolumab in melanoma patients.
Similarly, cardiology research has leveraged BioBERT to identify biomarkers such as Troponin levels
associated with myocardial infarction, improving early diagnostic pipelines. In infectious disease
research, BioBERT assists in rapidly mining literature during outbreaks. During the COVID-19
pandemic, BioBERT-enabled pipelines processed thousands of PubMed abstracts daily, surfacing
relevant insights about cytokine storms, SARS-CoV-2 spike proteins, and potential therapeutic targets.
This accelerated both vaccine development and clinical response. Another area of growth is
pharmacovigilance, where BioBERT extracts signals of adverse drug reactions from clinical notes and
patient forums. Detecting associations like “Ibuprofen use” linked with “gastric ulcer” enables earlier
safety interventions. Integrating these insights with regulatory databases strengthens drug safety
monitoring. By continuously adapting to emerging biomedical knowledge, BioBERT acts as an evolving
bridge between natural language understanding and real-world medical applications, ensuring
researchers and clinicians can stay ahead in precision healthcare.
'''
ner_results = ner_pipeline(text)  # spans with entity_group, score, start, end

In [9]:
ner_results

[{'entity_group': 'CHEMICAL',
  'score': 0.9990281,
  'word': 'P',
  'start': 231,
  'end': 232},
 {'entity_group': 'CHEMICAL',
  'score': 0.9787481,
  'word': '##em',
  'start': 232,
  'end': 234},
 {'entity_group': 'CHEMICAL',
  'score': 0.9873406,
  'word': '##bro',
  'start': 234,
  'end': 237},
 {'entity_group': 'CHEMICAL',
  'score': 0.8762808,
  'word': '##li',
  'start': 237,
  'end': 239},
 {'entity_group': 'CHEMICAL',
  'score': 0.7162142,
  'word': '##zu',
  'start': 239,
  'end': 241},
 {'entity_group': 'CHEMICAL',
  'score': 0.65014315,
  'word': '##ma',
  'start': 241,
  'end': 243},
 {'entity_group': 'CHEMICAL',
  'score': 0.86017895,
  'word': '##b',
  'start': 243,
  'end': 244},
 {'entity_group': 'CHEMICAL',
  'score': 0.9992848,
  'word': 'Ni',
  'start': 249,
  'end': 251},
 {'entity_group': 'CHEMICAL',
  'score': 0.98208815,
  'word': '##vo',
  'start': 251,
  'end': 253},
 {'entity_group': 'CHEMICAL',
  'score': 0.76447153,
  'word': '##lum',
  'start': 253,
  'en

#### Since some entity still starts with ##.I am fixing such tokens with using fix_subword_tokens functions

In [10]:
def fix_subword_tokens(text,ner_results):
    aggregated = []
    current_entity = None
    
    for token in ner_results:
        if token['word'].startswith('##'):
            if current_entity and current_entity['entity_group'] == token['entity_group']:
                current_entity['word'] += token['word'][2:]  # Remove ##
                current_entity['end'] = token['end']
                current_entity['score'] = max(current_entity['score'], token['score'])
        else:
            if current_entity:
                aggregated.append(current_entity)
            current_entity = token.copy()
    
    
    if current_entity:
        aggregated.append(current_entity)
    
    return aggregated


In [11]:
def ner_predict(text):
    raw = ner_pipeline(text)    # token-level results

    cleaned_ner = fix_subword_tokens(text, raw) # fixing  subword tokens
    return cleaned_ner

In [12]:
ner_predict(text)


[{'entity_group': 'CHEMICAL',
  'score': 0.9990281,
  'word': 'Pembrolizumab',
  'start': 231,
  'end': 244},
 {'entity_group': 'CHEMICAL',
  'score': 0.9992848,
  'word': 'Nivolumab',
  'start': 249,
  'end': 258},
 {'entity_group': 'DISEASE',
  'score': 0.9980812,
  'word': 'melanoma',
  'start': 262,
  'end': 270},
 {'entity_group': 'CHEMICAL',
  'score': 0.9096547,
  'word': 'Tonin',
  'start': 365,
  'end': 373},
 {'entity_group': 'DISEASE',
  'score': 0.9994229,
  'word': 'myocardial',
  'start': 397,
  'end': 407},
 {'entity_group': 'DISEASE',
  'score': 0.75484145,
  'word': 'infarction',
  'start': 408,
  'end': 418},
 {'entity_group': 'DISEASE',
  'score': 0.7537574,
  'word': 'infectious disease',
  'start': 461,
  'end': 479},
 {'entity_group': 'DISEASE',
  'score': 0.83384234,
  'word': 'adverse drug reactions',
  'start': 913,
  'end': 935},
 {'entity_group': 'CHEMICAL',
  'score': 0.9840911,
  'word': 'Ibuprofen',
  'start': 1005,
  'end': 1014},
 {'entity_group': 'DISEA

#### Real time interaction with gradio

In [15]:
import gradio as gr

with gr.Blocks() as demo:
    gr.Markdown("Biomedical NER Demo")
    inp = gr.Textbox(lines=3, placeholder="Type biomedical text here...")
    btn = gr.Button("Predict")

    out_json = gr.JSON(label="Entities")
    btn.click(ner_predict, inputs=inp, outputs=out_json)

demo.launch(share=False)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


