In [None]:

import json
from sentence_transformers import SentenceTransformer, CrossEncoder, util
import torch
import spacy
import numpy as np


In [2]:
import json

def load_hpo_data(file_path="hp.json"):
    """
    Load an HPO ontology JSON export and flatten it into a list of term records.

    Only nodes whose 'id' contains 'HP_' are kept. The last path segment of the
    id is used as the term id, with '_' replaced by ':'.

    Args:
        file_path: Path to the JSON file. The nodes are read from
            data["graphs"][0]["nodes"].

    Returns:
        A list of dicts with the keys 'id', 'label', 'definition', 'synonyms'
        (list of str) and 'combined_text' (label, definition and synonyms
        joined into one string for embedding/search). Returns an empty list
        when the file has no graphs or no nodes.
    """
    # Explicit UTF-8 so parsing does not depend on the platform default encoding.
    with open(file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    # `or [{}]` also covers a present-but-empty "graphs" list, which would
    # otherwise raise IndexError on the [0] lookup.
    graphs = data.get("graphs") or [{}]
    nodes = graphs[0].get("nodes") or []

    hpo_terms = []
    for node in nodes:
        node_id = node.get('id') or ''
        # We only care about nodes that are HPO terms (have 'HP_' in their ID)
        if 'HP_' not in node_id:
            continue

        term_id = node_id.split('/')[-1].replace('_', ':')
        label = node.get('lbl', '')

        # Definition and synonyms live under 'meta'; every level is optional,
        # so missing pieces fall back to empty values instead of raising.
        meta = node.get('meta') or {}
        definition = (meta.get('definition') or {}).get('val', '')
        synonyms = [synonym.get('val', '') for synonym in (meta.get('synonyms') or [])]

        # Combined text document for searching. The exact format is kept
        # unchanged so previously saved embeddings still match this text.
        synonym_text = '. '.join(synonyms)
        combined_text = f"{label}. {definition} {synonym_text}"

        hpo_terms.append({
            "id": term_id,
            "label": label,
            "definition": definition,
            "synonyms": synonyms,
            "combined_text": combined_text
        })

    return hpo_terms

def find_hpo_terms_bykeyworks(keywords, hpo_embeddings, model, top_k=4, entries=None):
    """
    Find the best matching HPO terms for each keyword using cosine similarity.

    Args:
        keywords: Sequence of keyword strings; each one becomes a key of the
            returned dict.
        hpo_embeddings: Tensor of term embeddings. Row i must correspond to
            entries[i].
        model: Embedding model exposing encode(texts, convert_to_tensor=True).
        top_k: Maximum number of matches per keyword. Capped at the number of
            embedded terms, because torch.topk raises when k exceeds it.
        entries: Term records (dicts with 'id' and 'label') aligned with
            hpo_embeddings. When omitted, the module-level `hpo_data` is used,
            which is what this function read before the parameter existed.

    Returns:
        Dict mapping each keyword to a list of
        {"id": ..., "label": ..., "score": ...} dicts, as ordered by torch.topk
        (highest score first). An empty `keywords` yields an empty dict.
    """
    if entries is None:
        entries = hpo_data

    # Nothing to encode: skip the model call entirely.
    if not keywords:
        return {}

    # 1. Create embeddings for the input keywords
    keyword_embeddings = model.encode(keywords, convert_to_tensor=True)

    # 2. Compute cosine similarity between keywords and all HPO terms
    cosine_scores = util.cos_sim(keyword_embeddings, hpo_embeddings)

    # 3. Take the best matches per keyword (one row of values/indices each)
    k = min(top_k, cosine_scores.shape[-1])
    top_k_results = torch.topk(cosine_scores, k=k, dim=-1)

    results = {}
    for i, keyword in enumerate(keywords):
        matched_terms = []
        for score, idx in zip(top_k_results.values[i], top_k_results.indices[i]):
            # idx is a 0-d tensor; convert it to a plain int before indexing.
            matched_term = entries[int(idx)]
            matched_terms.append({
                "id": matched_term['id'],
                "label": matched_term['label'],
                "score": score.item()
            })
        results[keyword] = matched_terms

    return results

def find_hpo_terms_byparagraph(paragraph, entries, embeddings, model, top_k=10):
    """
    Rank HPO terms against a free-text clinical paragraph.

    The paragraph is encoded with `model`, searched against `embeddings` via
    util.semantic_search, and each hit's corpus_id is used to look up the
    matching record in `entries`.

    Returns:
        A list of (term id, term label, score) tuples in the order returned by
        the search, at most `top_k` long.
    """
    paragraph_vector = model.encode(paragraph, convert_to_tensor=True)
    # semantic_search returns one hit list per query; there is a single query here.
    ranked = util.semantic_search(paragraph_vector, embeddings, top_k=top_k)[0]

    return [
        (
            entries[match["corpus_id"]]["id"],
            entries[match["corpus_id"]]["label"],
            float(match["score"]),
        )
        for match in ranked
    ]


# Load and embed the HPO terms

In [3]:
# Load the data
# hpo_data is the list of term records that later cells embed and search.
hpo_data = load_hpo_data("hp.json")
print(f"Loaded {len(hpo_data)} HPO terms.")

# Example of a loaded term
print("\n--- Example Term ---")
# Print the first record with this id, then stop scanning.
for term in hpo_data:
    if term['id'] == 'HP:0003194': # Short nasal bridge
        print(json.dumps(term, indent=2))
        break

Loaded 19726 HPO terms.

--- Example Term ---
{
  "id": "HP:0003194",
  "label": "Short nasal bridge",
  "definition": "Decreased superior-inferior length of the nasal bridge, which is the saddle-shaped area that includes the nasal root and the lateral aspects of the nose.",
  "synonyms": [
    "Decreased length of bridge of nose",
    "Decreased length of nasal bridge",
    "Short bridge of nose",
    "Short nasal bridge"
  ],
  "combined_text": "Short nasal bridge. Decreased superior-inferior length of the nasal bridge, which is the saddle-shaped area that includes the nasal root and the lateral aspects of the nose. Decreased length of bridge of nose. Decreased length of nasal bridge. Short bridge of nose. Short nasal bridge"
}


In [4]:
# Sentence-embedding model used below to encode both the HPO term texts and the queries.
model = SentenceTransformer("sentence-transformers/embeddinggemma-300m-medical")


You are trying to use a model that was created with Sentence Transformers version 5.2.0.dev0, but you're currently using version 5.1.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.


In [None]:
# Embed each term's combined_text (label + definition + synonyms).
# NOTE: slow — the recorded run of this cell took about 38 minutes.
print("Creating embeddings for HPO terms...")
hpo_texts = [term['combined_text'] for term in hpo_data]
hpo_combinedtext_embeddings = model.encode(hpo_texts, convert_to_tensor=True, show_progress_bar=True)

You are trying to use a model that was created with Sentence Transformers version 5.2.0.dev0, but you're currently using version 5.1.1. This might cause unexpected behavior or errors. In that case, try to update to the latest version.


Creating embeddings for HPO terms...


Batches: 100%|██████████| 617/617 [38:14<00:00,  3.72s/it] 


In [5]:
# Load previously saved combined-text embeddings from disk.
# NOTE(review): the commented-out save below writes "hpo_combinedtext_embeddings.pt",
# but the load reads a different filename — confirm which file holds the
# embeddings of the current combined_text before relying on either.
#torch.save(hpo_combinedtext_embeddings, "hpo_combinedtext_embeddings.pt")
hpo_combinedtext_embeddings = torch.load("hpotermdescription_embeddings_gemma300mmedical.pt")


In [23]:
# Embed only the term labels (no definition or synonyms), one vector per entry of hpo_data.
# The progress message is the same as in the combined-text embedding cell.
print("Creating embeddings for HPO terms...")
hpo_labels = [term['label'] for term in hpo_data]
hpo_label_embeddings = model.encode(hpo_labels, convert_to_tensor=True, show_progress_bar=True)

Creating embeddings for HPO terms...


Batches: 100%|██████████| 617/617 [13:57<00:00,  1.36s/it]


In [24]:
# Save the label-only embeddings to disk.
torch.save(hpo_label_embeddings, "hpoterm_embeddings_gemma300mmedical.pt")


# Extract keywords and perform semantic search

In [9]:
# Load the scispaCy en_core_sci_md model from a local directory.
# This model is trained on biomedical text and is good for entity detection.
nlp = spacy.load("./en_core_sci_md-0.5.4/en_core_sci_md/en_core_sci_md-0.5.4")

paragraph = "The patient experienced progressive muscle cramps, paresis, and wasting beginning in one leg and progressing to the other"

# Process the paragraph with the scispaCy model
doc = nlp(paragraph)

# Extract entities (medical terms)
# In the recorded run the entities also include generic words such as
# 'patient' and 'progressive', not only clinical findings.
extracted_terms = [ent.text for ent in doc.ents]

print(f"Original Paragraph: '{paragraph}'")
print(f"Extracted Medical Terms: {extracted_terms}")

Original Paragraph: 'The patient experienced progressive muscle cramps, paresis, and wasting beginning in one leg and progressing to the other'
Extracted Medical Terms: ['patient', 'progressive', 'muscle cramps', 'paresis', 'wasting', 'leg', 'progressing']


In [31]:
# --- Run the semantic search ---
# Match every extracted entity against the label-only embeddings
# (top_k is left at the function default).

print("\n--- Semantic Search Results ---")
semantic_results = find_hpo_terms_bykeyworks(extracted_terms, hpo_label_embeddings, model)
print(json.dumps(semantic_results, indent=2))


--- Semantic Search Results ---
{
  "patient": [
    {
      "id": "HP:0011510",
      "label": "Drusen",
      "score": 0.9044458866119385
    },
    {
      "id": "HP:0012834",
      "label": "Right",
      "score": 0.9007444977760315
    },
    {
      "id": "HP:0012835",
      "label": "Left",
      "score": 0.8988599181175232
    },
    {
      "id": "HP:0032322",
      "label": "Healthy",
      "score": 0.8930355906486511
    }
  ],
  "progressive": [
    {
      "id": "HP:0003680",
      "label": "Nonprogressive",
      "score": 0.9526521563529968
    },
    {
      "id": "HP:0012835",
      "label": "Left",
      "score": 0.9334932565689087
    },
    {
      "id": "HP:0003676",
      "label": "Progressive",
      "score": 0.9278656244277954
    },
    {
      "id": "HP:0012834",
      "label": "Right",
      "score": 0.9247534275054932
    }
  ],
  "muscle cramps": [
    {
      "id": "HP:0003449",
      "label": "Cold-induced muscle cramps",
      "score": 0.9104836583137512

In [None]:
# --- Run the semantic search ---
# Same keyword search as the label-based cell, but against the combined-text
# embeddings (label + definition + synonyms). Overwrites semantic_results.

print("\n--- Semantic Search Results ---")
semantic_results = find_hpo_terms_bykeyworks(extracted_terms, hpo_combinedtext_embeddings, model)
print(json.dumps(semantic_results, indent=2))


--- Semantic Search Results ---
{
  "patient": [
    {
      "id": "HP:0000001",
      "label": "All",
      "score": 0.8601305484771729
    },
    {
      "id": "HP:0001050",
      "label": "Plethora",
      "score": 0.8398342132568359
    },
    {
      "id": "HP:0003989",
      "label": "Notched ulna",
      "score": 0.8355245590209961
    },
    {
      "id": "HP:0003990",
      "label": "Pointed ulna",
      "score": 0.8233039379119873
    }
  ],
  "progressive": [
    {
      "id": "HP:0000001",
      "label": "All",
      "score": 0.8942227363586426
    },
    {
      "id": "HP:0001050",
      "label": "Plethora",
      "score": 0.8676424622535706
    },
    {
      "id": "HP:0003990",
      "label": "Pointed ulna",
      "score": 0.8571158647537231
    },
    {
      "id": "HP:0003989",
      "label": "Notched ulna",
      "score": 0.851737916469574
    }
  ],
  "muscle cramps": [
    {
      "id": "HP:0003435",
      "label": "Cold-induced hand cramps",
      "score": 0.82973

# Direct semantic search

In [27]:
# Search with the whole sentence as a single query against the label-only embeddings.
find_hpo_terms_byparagraph("The patient experienced progressive muscle cramps, paresis, and wasting beginning in one leg and progressing to the other", hpo_data, hpo_label_embeddings, model, top_k=15)


[('HP:0011964', 'Intermittent painful muscle spasms', 0.7256890535354614),
 ('HP:0003449', 'Cold-induced muscle cramps', 0.7083907723426819),
 ('HP:0003710', 'Exercise-induced muscle cramps', 0.6930332183837891),
 ('HP:0008991', 'Exercise-induced leg cramps', 0.6804571151733398),
 ('HP:0009028', 'Generalized weakness of limb muscles', 0.6717520356178284),
 ('HP:0003394', 'Muscle spasm', 0.65623939037323),
 ('HP:0003435', 'Cold-induced hand cramps', 0.6464894413948059),
 ('HP:0030200',
  'Fatiguable weakness of proximal limb muscles',
  0.6364095211029053),
 ('HP:0031988', 'obsolete Muscle spasm', 0.6358648538589478),
 ('HP:0008994', 'Proximal muscle weakness in lower limbs', 0.6330800652503967),
 ('HP:0007098', 'Paroxysmal choreoathetosis', 0.6318628787994385),
 ('HP:0007166', 'Paroxysmal dyskinesia', 0.6299142241477966),
 ('HP:0030198',
  'Fatigable weakness of distal limb muscles',
  0.6261293888092041),
 ('HP:0003324', 'Generalized muscle weakness', 0.6261209845542908),
 ('HP:003175

In [18]:
# Same single-sentence query, this time against the combined-text embeddings.
find_hpo_terms_byparagraph("The patient experienced progressive muscle cramps, paresis, and wasting beginning in one leg and progressing to the other", hpo_data, hpo_combinedtext_embeddings, model, top_k=15)


[('HP:0008991', 'Exercise-induced leg cramps', 0.8124040961265564),
 ('HP:0003394', 'Muscle spasm', 0.784591019153595),
 ('HP:0011964', 'Intermittent painful muscle spasms', 0.7735278606414795),
 ('HP:0003710', 'Exercise-induced muscle cramps', 0.7723671197891235),
 ('HP:0003752', 'Episodic flaccid weakness', 0.7366074919700623),
 ('HP:0032155', 'Abdominal cramps', 0.7193395495414734),
 ('HP:0003435', 'Cold-induced hand cramps', 0.6904074549674988),
 ('HP:0031989', 'Perioral spasm', 0.6882120370864868),
 ('HP:0001264', 'Spastic diplegia', 0.6840314269065857),
 ('HP:0031988', 'obsolete Muscle spasm', 0.6814048290252686),
 ('HP:0031959', 'Leg dystonia', 0.679900586605072),
 ('HP:0009063', 'Progressive distal muscle weakness', 0.6756155490875244),
 ('HP:0003552', 'Muscle stiffness', 0.6689462661743164),
 ('HP:0003323', 'Progressive muscle weakness', 0.6657924056053162),
 ('HP:0009028', 'Generalized weakness of limb muscles', 0.6656557321548462)]

In [6]:
# Full case description used as one long query.
whole_paragragh="at the age of 22, 5 years this German woman experienced muscle cramps, progressive paresis and muscle wasting beginning in one leg and spreading to the other leg without sensory, autonomic or cognitive symptoms. A myopathy was initially suspected but EMG revealed acute and chronic signs of denervation mostly in the legs but also slightly in the upper limbs. Transcortical magnetic evoked potential (MEP) examination was pathological with delayed central conductance time to both lower limbs but not to the upper limbs. Peripheral nerve conduction studies were normal. The medical history was unremarkable except for a voluntarily weight loss of ≈10 kg in the years before appearance of muscle symptoms. Both the parents have diabetes mellitus type 2, but there is no family history of a neuromuscular disease (NMD) or FTD-like condition. The patient eventually received a diagnosis of sALS. The disease has developed slowly, symptoms and signs of upper motor neuron (UMN) and lower motor neuron (LMN) damage only appearing in the upper limbs 3 years after onset in the lower limbs. Nine years after onset of paresis, the patient is alive and still show no bulbar symptoms."

# Search the combined-text embeddings with the whole description.
find_hpo_terms_byparagraph(whole_paragragh, hpo_data, hpo_combinedtext_embeddings, model, top_k=15)


[('HP:0003752', 'Episodic flaccid weakness', 0.44257399439811707),
 ('HP:0003324', 'Generalized muscle weakness', 0.44253167510032654),
 ('HP:0003700', 'Generalized amyotrophy', 0.43646830320358276),
 ('HP:0003458', 'EMG: myopathic abnormalities', 0.4346015751361847),
 ('HP:0009055', 'Generalized limb muscle atrophy', 0.43245625495910645),
 ('HP:0009786',
  'Aplasia/Hypoplasia of the musculature of the thigh',
  0.4285629391670227),
 ('HP:0009128',
  'Aplasia/Hypoplasia involving the musculature of the extremities',
  0.42566972970962524),
 ('HP:0010550', 'Paraplegia', 0.4250236749649048),
 ('HP:0011099', 'Spastic hemiparesis', 0.42474237084388733),
 ('HP:0012898',
  'Abnormal lower-limb motor evoked potentials',
  0.4233229160308838),
 ('HP:0003011', 'Abnormality of the musculature', 0.42185574769973755),
 ('HP:0003693', 'Distal amyotrophy', 0.418703556060791),
 ('HP:0008994',
  'Proximal muscle weakness in lower limbs',
  0.41792264580726624),
 ('HP:0031959', 'Leg dystonia', 0.417363

In [None]:
# Save the initial hits as a json file to be re-ranked using gemini
# Retrieve the top 100 candidates for the long case description.
retrieval_hits=find_hpo_terms_byparagraph(whole_paragragh, hpo_data, hpo_combinedtext_embeddings, model, top_k=100)
# Keep only id and label from each (id, label, score) tuple; the score is dropped.
hpo_hits_json=[{'id':x[0], 'label':x[1]} for x in retrieval_hits]

In [11]:
# Write the candidate list (id + label per hit) to hpo_hits.json.
with open('hpo_hits.json', 'w') as f:
    json.dump(hpo_hits_json, f, indent=4)

# Re-rank with cross-encoder

In [4]:
# Cross-encoder that rerank_crossencoder uses to score (query, term text) pairs.
cross_encoder = CrossEncoder('cross-encoder/ms-marco-MiniLM-L6-v2')

In [50]:
def rerank_crossencoder(query, retrieval_hits, hpo_data=None):
    """
    Re-score retrieved HPO candidates with the cross-encoder and print the ranking.

    Args:
        query: Query text that is paired with every candidate's combined_text.
        retrieval_hits: Iterable of hits whose first element is the HPO term id
            (e.g. the (id, label, score) tuples from find_hpo_terms_byparagraph).
        hpo_data: Term records with 'id', 'label' and 'combined_text'. When
            omitted, the module-level `hpo_data` is looked up at call time, so a
            reloaded corpus is picked up without redefining this function.

    Returns:
        None. The ranking (highest cross-encoder score first) is printed.
        Candidates are taken in `hpo_data` order and the sort is stable, so
        equal scores keep that order.
    """
    if hpo_data is None:
        # The parameter shadows the global of the same name, hence globals().
        hpo_data = globals()["hpo_data"]

    # Build the id lookup once rather than once per corpus entry.
    hit_ids = {hit[0] for hit in retrieval_hits}
    hpo_initialhits = [term for term in hpo_data if term['id'] in hit_ids]
    sentence_pairs = [[query, term['combined_text']] for term in hpo_initialhits]

    # --- 4. Predict Scores ---
    print("\nCalculating scores...")
    # Skip the model call when there is nothing to score.
    scores = cross_encoder.predict(sentence_pairs) if sentence_pairs else []

    # --- 5. Combine and Sort Results ---
    # Combine scores with the original hit data
    results = [
        {'score': score, 'hit': hit}
        for score, hit in zip(scores, hpo_initialhits)
    ]

    # Sort results by score in descending order (highest score first)
    sorted_results = sorted(results, key=lambda x: x['score'], reverse=True)

    # --- 6. Output Results ---
    print("\n--- Results ---")
    print(f"Query: '{query}'\n")

    print("All Scores (Sorted High to Low):")
    for i, result in enumerate(sorted_results):
        hit = result['hit']
        score = result['score']
        # Print rank, score, label, and original ID
        print(f"  Rank {i+1} | Score: {score:.4f} | Label: {hit['label']} (ID: {hit['id']})")


In [57]:
# Retrieve 20 candidates for the short sentence, then re-rank them with the cross-encoder.
query="The patient experienced progressive muscle cramps, paresis, and wasting beginning in one leg and progressing to the other"
retrieval_hits=find_hpo_terms_byparagraph(query, hpo_data, hpo_combinedtext_embeddings, model, top_k=20)
rerank_crossencoder(query,retrieval_hits)


Calculating scores...

--- Results ---
Query: 'The patient experienced progressive muscle cramps, paresis, and wasting beginning in one leg and progressing to the other'

All Scores (Sorted High to Low):
  Rank 1 | Score: -2.2715 | Label: Exercise-induced leg cramps (ID: HP:0008991)
  Rank 2 | Score: -3.2747 | Label: Progressive muscle weakness (ID: HP:0003323)
  Rank 3 | Score: -3.7511 | Label: Progressive distal muscle weakness (ID: HP:0009063)
  Rank 4 | Score: -4.0665 | Label: Muscle spasm (ID: HP:0003394)
  Rank 5 | Score: -4.4606 | Label: Progressive proximal muscle weakness (ID: HP:0009073)
  Rank 6 | Score: -4.6834 | Label: Exercise-induced muscle cramps (ID: HP:0003710)
  Rank 7 | Score: -5.1151 | Label: Cold-induced muscle cramps (ID: HP:0003449)
  Rank 8 | Score: -6.8404 | Label: Periodic paralysis (ID: HP:0003768)
  Rank 9 | Score: -7.0826 | Label: Cold-induced hand cramps (ID: HP:0003435)
  Rank 10 | Score: -7.2706 | Label: obsolete Muscle spasm (ID: HP:0031988)
  Rank 11

In [58]:
# Same retrieve-then-re-rank flow, using the full case description as the query.
query="at the age of 22, 5 years this German woman experienced muscle cramps, progressive paresis and muscle wasting beginning in one leg and spreading to the other leg without sensory, autonomic or cognitive symptoms. A myopathy was initially suspected but EMG revealed acute and chronic signs of denervation mostly in the legs but also slightly in the upper limbs. Transcortical magnetic evoked potential (MEP) examination was pathological with delayed central conductance time to both lower limbs but not to the upper limbs. Peripheral nerve conduction studies were normal. The medical history was unremarkable except for a voluntarily weight loss of ≈10 kg in the years before appearance of muscle symptoms. Both the parents have diabetes mellitus type 2, but there is no family history of a neuromuscular disease (NMD) or FTD-like condition. The patient eventually received a diagnosis of sALS. The disease has developed slowly, symptoms and signs of upper motor neuron (UMN) and lower motor neuron (LMN) damage only appearing in the upper limbs 3 years after onset in the lower limbs. Nine years after onset of paresis, the patient is alive and still show no bulbar symptoms."
retrieval_hits=find_hpo_terms_byparagraph(query, hpo_data, hpo_combinedtext_embeddings, model, top_k=20)
rerank_crossencoder(query,retrieval_hits)


Calculating scores...

--- Results ---
Query: 'at the age of 22, 5 years this German woman experienced muscle cramps, progressive paresis and muscle wasting beginning in one leg and spreading to the other leg without sensory, autonomic or cognitive symptoms. A myopathy was initially suspected but EMG revealed acute and chronic signs of denervation mostly in the legs but also slightly in the upper limbs. Transcortical magnetic evoked potential (MEP) examination was pathological with delayed central conductance time to both lower limbs but not to the upper limbs. Peripheral nerve conduction studies were normal. The medical history was unremarkable except for a voluntarily weight loss of ≈10 kg in the years before appearance of muscle symptoms. Both the parents have diabetes mellitus type 2, but there is no family history of a neuromuscular disease (NMD) or FTD-like condition. The patient eventually received a diagnosis of sALS. The disease has developed slowly, symptoms and signs of upp