### Pipeline & helper

In [2]:
from dataclasses import dataclass
from typing import List, Dict, Any
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

In [3]:
# Checkpoint location handed to from_pretrained() for both the tokenizer and the model.
# It is a relative path, so it is resolved against the kernel's working directory.
MODEL_PATH = "../models/bert_ner_baseline_v1"

In [4]:
# Load the tokenizer and the token-classification model from the same checkpoint
# directory. use_fast=True explicitly asks for the fast tokenizer implementation.
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH, use_fast=True)
model = AutoModelForTokenClassification.from_pretrained(MODEL_PATH)

In [5]:
# `tokenizer` and `model` were already loaded from MODEL_PATH in the previous cell;
# reuse them here instead of reading the checkpoint from disk a second time.
ner_pipeline = pipeline(
    "ner",
    model=model,
    tokenizer=tokenizer,
    # "simple" asks the pipeline to group tokens into entity spans; the captured
    # output below shows that word pieces can still come back as separate entries.
    aggregation_strategy="simple",
)

# Sample sentence used by the following cells.
text = "Aspirin can cause gastric bleeding in some patients with T2D."

ner_results = ner_pipeline(text)  # spans with entity_group, score, start, end

Device set to use cpu


In [12]:
# Show the raw pipeline output for the sample sentence.
ner_results

[{'entity_group': 'CHEMICAL',
  'score': 0.9903351,
  'word': 'As',
  'start': 0,
  'end': 2},
 {'entity_group': 'CHEMICAL',
  'score': 0.8459496,
  'word': '##pi',
  'start': 2,
  'end': 4},
 {'entity_group': 'CHEMICAL',
  'score': 0.84560657,
  'word': '##rin',
  'start': 4,
  'end': 7},
 {'entity_group': 'DISEASE',
  'score': 0.83817285,
  'word': 'gastric bleeding',
  'start': 18,
  'end': 34},
 {'entity_group': 'DISEASE',
  'score': 0.74497277,
  'word': 'T',
  'start': 57,
  'end': 58}]

#### Some entities are still split into word pieces that start with `##`, so I merge those pieces back together with the `fix_subword_tokens` function

In [7]:
def fix_subword_tokens(text, ner_results):
    """Merge WordPiece continuation pieces (``##...``) back into whole entities.

    The NER pipeline can return one entry per word piece (for example ``As``,
    ``##pi``, ``##rin``). This walks the entries in order and folds every
    ``##`` piece into the entry before it when both carry the same
    ``entity_group`` and the piece starts exactly where the previous entry ends.

    Args:
        text: The sentence the entries were predicted on. Currently unused; kept
            so existing callers keep working.
        ner_results: Iterable of dicts with at least ``word``, ``entity_group``
            and ``score``; ``start`` / ``end`` character offsets are used when
            present. The input dicts are never modified.

    Returns:
        A new list of entity dicts. A merged entity has the concatenated word
        (``##`` markers removed), the ``start`` of its first piece, the ``end``
        of its last piece, and the highest ``score`` among its pieces.
        A ``##`` piece that cannot be attached to a preceding entry (nothing
        before it, a different label, or a gap between the spans) is kept as
        its own entity with the ``##`` marker stripped instead of being dropped.
    """
    aggregated = []
    current_entity = None

    for token in ner_results:
        is_continuation = token['word'].startswith('##')

        can_merge = False
        if is_continuation and current_entity is not None:
            prev_end = current_entity.get('end')
            piece_start = token.get('start')
            # Without offsets we cannot verify adjacency, so fall back to
            # trusting the list order.
            adjacent = prev_end is None or piece_start is None or prev_end == piece_start
            can_merge = adjacent and current_entity['entity_group'] == token['entity_group']

        if can_merge:
            current_entity['word'] += token['word'][2:]  # drop the "##" marker
            current_entity['end'] = token.get('end')
            current_entity['score'] = max(current_entity['score'], token['score'])
            continue

        # Anything that cannot be merged closes the running entity and opens a new one.
        if current_entity is not None:
            aggregated.append(current_entity)
        current_entity = token.copy()  # copy so the caller's dicts stay untouched
        if is_continuation:
            # Orphan fragment: keep the prediction, but never expose the raw "##".
            current_entity['word'] = token['word'][2:]

    if current_entity is not None:
        aggregated.append(current_entity)

    return aggregated


In [8]:
# Merge the "##" continuation pieces from the raw pipeline output into whole entities.
cleaned_ner_results = fix_subword_tokens(text,ner_results)


In [9]:
# Show the entities after the word pieces have been merged.
cleaned_ner_results

[{'entity_group': 'CHEMICAL',
  'score': 0.9903351,
  'word': 'Aspirin',
  'start': 0,
  'end': 7},
 {'entity_group': 'DISEASE',
  'score': 0.83817285,
  'word': 'gastric bleeding',
  'start': 18,
  'end': 34},
 {'entity_group': 'DISEASE',
  'score': 0.74497277,
  'word': 'T',
  'start': 57,
  'end': 58}]

In [10]:
def ner_predict(text):
    """Run NER on ``text`` and return the entities with word pieces merged.

    The text is passed through the module-level ``ner_pipeline`` and the raw
    result is post-processed by ``fix_subword_tokens``.
    """
    return fix_subword_tokens(text, ner_pipeline(text))

#### Real-time interaction with Gradio

In [11]:
import gradio as gr

# Minimal UI: a title, a text box for the input, a button, and a JSON panel for the result.
with gr.Blocks() as demo:
    gr.Markdown("Biomedical NER Demo")
    inp = gr.Textbox(lines=3, placeholder="Type biomedical text here...")
    btn = gr.Button("Predict")

    out_json = gr.JSON(label="Entities")
    # On click, send the text box contents to ner_predict and show what it returns.
    btn.click(ner_predict, inputs=inp, outputs=out_json)

# share=False keeps the app on the local URL only (no public link is created).
demo.launch(share=False)

* Running on local URL:  http://127.0.0.1:7860
* To create a public link, set `share=True` in `launch()`.


