In [2]:
from transformers import pipeline, AutoTokenizer, AutoModelForTokenClassification
import requests
import re

# 1. Load BioBERT model and tokenizer (choose your variant)
model_name = "dmis-lab/biobert-v1.1"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForTokenClassification.from_pretrained(model_name)

Some weights of BertForTokenClassification were not initialized from the model checkpoint at dmis-lab/biobert-v1.1 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [3]:
ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer)

Device set to use mps:0


In [4]:
text = "The patient was treated with Aspirin for Hypertension.  The EGFR gene was also analyzed."

In [11]:
ner_pipeline(text)

[{'entity': 'LABEL_0',
  'score': np.float32(0.50278854),
  'index': 1,
  'word': 'The',
  'start': 0,
  'end': 3},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5383096),
  'index': 2,
  'word': 'patient',
  'start': 4,
  'end': 11},
 {'entity': 'LABEL_1',
  'score': np.float32(0.54741913),
  'index': 3,
  'word': 'was',
  'start': 12,
  'end': 15},
 {'entity': 'LABEL_1',
  'score': np.float32(0.5702383),
  'index': 4,
  'word': 'treated',
  'start': 16,
  'end': 23},
 {'entity': 'LABEL_1',
  'score': np.float32(0.59693295),
  'index': 5,
  'word': 'with',
  'start': 24,
  'end': 28},
 {'entity': 'LABEL_1',
  'score': np.float32(0.5077557),
  'index': 6,
  'word': 'As',
  'start': 29,
  'end': 31},
 {'entity': 'LABEL_1',
  'score': np.float32(0.55134845),
  'index': 7,
  'word': '##pi',
  'start': 31,
  'end': 33},
 {'entity': 'LABEL_0',
  'score': np.float32(0.5070949),
  'index': 8,
  'word': '##rin',
  'start': 33,
  'end': 36},
 {'entity': 'LABEL_1',
  'score': np.float32(0.50496

In [5]:
ner_results = ner_pipeline(text)


In [9]:
for nr in ner_results:
    print(nr)

{'entity': 'LABEL_0', 'score': np.float32(0.50278854), 'index': 1, 'word': 'The', 'start': 0, 'end': 3}
{'entity': 'LABEL_0', 'score': np.float32(0.5383096), 'index': 2, 'word': 'patient', 'start': 4, 'end': 11}
{'entity': 'LABEL_1', 'score': np.float32(0.54741913), 'index': 3, 'word': 'was', 'start': 12, 'end': 15}
{'entity': 'LABEL_1', 'score': np.float32(0.5702383), 'index': 4, 'word': 'treated', 'start': 16, 'end': 23}
{'entity': 'LABEL_1', 'score': np.float32(0.59693295), 'index': 5, 'word': 'with', 'start': 24, 'end': 28}
{'entity': 'LABEL_1', 'score': np.float32(0.5077557), 'index': 6, 'word': 'As', 'start': 29, 'end': 31}
{'entity': 'LABEL_1', 'score': np.float32(0.55134845), 'index': 7, 'word': '##pi', 'start': 31, 'end': 33}
{'entity': 'LABEL_0', 'score': np.float32(0.5070949), 'index': 8, 'word': '##rin', 'start': 33, 'end': 36}
{'entity': 'LABEL_1', 'score': np.float32(0.50496906), 'index': 9, 'word': 'for', 'start': 37, 'end': 40}
{'entity': 'LABEL_1', 'score': np.float32(

In [None]:


# 2. Text to annotate
text = "The patient was treated with Aspirin for Hypertension.  The EGFR gene was also analyzed."

# 3. NER with BioBERT
ner_results = ner_pipeline(text)

# 4. Entity Linking with BioPortal (improved)
def map_to_ontology(entity_name, ontology_id="MeSH"):
    url = f"https://bioportal.bioontology.org/ontologies/{ontology_id}/search"
    params = {"q": entity_name, "apikey": "YOUR_BIOPORTAL_API_KEY"}  # **REPLACE with your API key**
    try:
        response = requests.get(url, params=params)
        response.raise_for_status()  # Raise an exception for bad status codes (4xx or 5xx)
        results = response.json()
        if results and results["collection"]:
            best_match = results["collection"][0]
            # Improved matching (consider synonyms, definitions, etc.)
            # For now, just return the ID:
            return best_match["@id"]
        else:
            return None  # No match found
    except requests.exceptions.RequestException as e:
        print(f"Error querying BioPortal: {e}")
        return None

# 5. Process and link the entities
entities_to_link = []

for entity in ner_results:
    entity_type = entity["entity_group"]
    entity_text = entity["word"]

    # Clean up entity text (remove extra spaces, etc.)
    entity_text = re.sub(r"\s+", " ", entity_text).strip()  # Normalize whitespace

    if entity_type in ("GENE", "DISEASE", "DRUG"):  # Filter for relevant types
        entities_to_link.append({"text": entity_text, "type": entity_type})

for entity_info in entities_to_link:
    concept_id = map_to_ontology(entity_info["text"], "MeSH")  # You can change ontology here
    print(f"Entity: {entity_info['text']}, Type: {entity_info['type']}, Concept: {concept_id}")

# Example of further processing or output:
annotations = []
for entity in ner_results:
    if entity["entity_group"] in ("GENE", "DISEASE", "DRUG"):
        concept_uri = map_to_ontology(entity["word"], "MeSH")
        annotations.append({
            "start": entity["start"],
            "end": entity["end"],
            "text": entity["word"],
            "type": entity["entity_group"],
            "concept_uri": concept_uri
        })

print("\nAnnotations (with character offsets and concept URIs):")
print(annotations)