<a href="https://colab.research.google.com/github/matebestek/AI-Scientist/blob/main/Clinical_Coding.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# How to do clinical coding with LLMs

Clinical coding is the task of detecting biomedical concepts (diseases, symptoms, medications, ...) in free text and linking each one to a concept in a biomedical ontology such as SNOMED CT, ICD-10, or any other.

In [None]:
import getpass
import os

import numpy as np
import torch
import torch.nn.functional as F
from torch import Tensor
from transformers import AutoTokenizer, AutoModel

### Data

In [None]:
# Mini test ontology. It is LLM generated, so some of the SNOMED codes could be fake.
# Each (name, snomed_code) row below is expanded into one
# {"name": ..., "snomed_code": ...} record under cdb["diseases"], in this order.
_disease_rows = [
    ("Hypertension", "38341003"),
    ("Type 2 Diabetes Mellitus", "44054006"),
    ("Hyperlipidemia", "55822004"),
    ("Major Depressive Disorder", "370143000"),
    ("Asthma", "195967001"),
    ("Coronary Artery Disease", "53741008"),
    ("Obesity", "414916001"),
    ("Osteoarthritis", "396275006"),
    ("Chronic Obstructive Pulmonary Disease", "13645005"),
    ("Hypothyroidism", "40930008"),
    ("Anxiety Disorder", "197480006"),
    ("Gastroesophageal Reflux Disease", "235595009"),
    ("Migraine", "37796009"),
    ("Atrial Fibrillation", "49436004"),
    ("Chronic Kidney Disease", "709044004"),
    ("Allergic Rhinitis", "61582004"),
    ("Stroke", "230690007"),
    ("Heart Failure", "84114007"),
    ("Insomnia", "73430006"),
    ("Osteoporosis", "64859006"),
    ("Urinary Tract Infection", "68566005"),
    ("Vitamin D Deficiency", "34713006"),
    ("Iron Deficiency Anemia", "87522002"),
    ("Eczema", "43116000"),
    ("Fibromyalgia", "203082005"),
    ("Irritable Bowel Syndrome", "10743008"),
    ("Alzheimer's Disease", "26929004"),
    ("Parkinson's Disease", "49049000"),
    ("Pneumonia", "233604007"),
    ("Influenza", "6142004"),
    ("Rheumatoid Arthritis", "69896004"),
    ("Attention Deficit Hyperactivity Disorder", "406506008"),
    ("Epilepsy", "84757009"),
    ("Psoriasis", "9014002"),
    ("Celiac Disease", "396331005"),
    ("Chronic Sinusitis", "40055000"),
    ("Vertigo", "399153001"),
    ("Gout", "90560007"),
    ("Bipolar Disorder", "13746004"),
    ("Multiple Sclerosis", "24700007"),
    ("Glaucoma", "23986001"),
    ("Cataracts", "193570009"),
    ("Benign Prostatic Hyperplasia", "266569009"),
    ("Schizophrenia", "58214004"),
    ("HIV Infection", "86406008"),
    ("Hepatitis C", "50711007"),
    ("Deep Vein Thrombosis", "128053003"),
    ("Pulmonary Embolism", "59282003"),
    ("Peripheral Vascular Disease", "400047006"),
]

cdb = {
    "diseases": [
        {"name": concept_name, "snomed_code": concept_code}
        for concept_name, concept_code in _disease_rows
    ]
}

In [None]:
# Synthetic clinical progress note (not a real patient record).
# It is the free-text input that is sent to the LLM for entity extraction later in the notebook.
clinical_note = '''PROGRESS NOTE
Date: 03/15/2025
Patient: Johnson, Robert M.
DOB: 05/22/1958
MRN: 763421
Provider: Dr. Michaels

SUBJECTIVE:
67y.o. male presents for f/u of multiple chronic conditions. Pt reports worsening SOB over past 2 wks, especially w/ exertion. C/o intermittent chest discomfort. Has been using rescue inhaler 3-4x/day for COPD sx. Reports poor sleep (only 4-5 hrs/night) w/ early waking. HTN meds "seem to be working ok." Denies changes in BG readings for T2DM. Reports occ. HA but attributes to "usual migraines." Recent weight gain of ~5 lbs, which pt attributes to decreased activity due to knee pain from OA. Continues to have reflux sx despite compliance w/ GERD meds. No new fx episodes per pt, although notes "feeling more down" since last visit. AF reportedly stable per home monitoring. No orthopnea or PND noted per pt.
OBJECTIVE:
VS: BP 148/92, HR 88 (irregularly irregular), RR 18, T 98.4°F, SpO2 93% on RA
Wt: 102.4 kg (+2.2 kg from last visit)
GEN: Alert, appears older than stated age, mild distress w/ breathing
HEENT: NCAT, PERRL, EOMI, dry oral mucosa
CV: Irreg rhythm, no m/r/g appreciated
RESP: Decreased BS bilaterally, prolonged expiratory phase, scattered wheezes
ABD: Soft, NT/ND, (+) BS
MSK: B/L knee crepitus, R>L, limited ROM due to pain
NEURO: A&Ox3, CN II-XII intact, normal gait w/ mild antalgic component
ASSESSMENT/PLAN:

COPD - Exacerbation likely. Increase Symbicort to 160/4.5 BID. Add prednisone 40mg daily x5 days. Consider PFTs at next visit.
HTN - Suboptimally controlled. Increase lisinopril from 20mg to 30mg daily. Continue HCTZ 25mg daily. F/U BP in 2 weeks.
T2DM - Stable. Last A1c 7.3%. Continue metformin 1000mg BID and empagliflozin 10mg daily.
A-fib - Stable per pt. Continue apixaban 5mg BID. EKG today shows rate-controlled AF. Will discuss w/ cards re: rhythm control options vs. rate control.
MDD - Worsening symptoms. Increase sertraline from 50mg to 75mg daily. Refer to psychiatry for evaluation. PHQ-9 score today: 14.
OA - Worsening knee pain. Recommend PT referral. Consider ortho referral for evaluation for possible injection. Increase acetaminophen to 1000mg TID.
GERD - Refractory to current tx. Increase omeprazole to 40mg BID. Reinforce dietary modifications. Consider GI referral if no improvement.
Migraine - Stable. Continue sumatriptan 50mg PRN.
Insomnia - Chronic, worsening. Refer to sleep medicine. Trial of trazodone 50mg QHS. Sleep hygiene education provided.
Obesity - BMI 35.2. Nutritional counseling provided. Consider weight management program.

F/U: 4 weeks

Sarah Michaels, MD
Internal Medicine'''

## NER via Gemini

In [None]:
from google import genai
from pydantic import BaseModel, EmailStr, Field
from typing import List, Optional, Dict

In [None]:
# Never paste the API key into the notebook: it would be saved (and shared) with the file.
# Read it from the GEMINI_API_KEY environment variable, and fall back to an interactive,
# non-echoing prompt when the variable is not set (e.g. in a fresh Colab session).
api_key = os.environ.get('GEMINI_API_KEY') or getpass.getpass('Gemini API key: ')

client = genai.Client(
    api_key=api_key
)

We are asking the LLM to handle two things at once: detecting the entities in the text (the named entity recognition, or NER, part) and standardising their names, which is one of the hardest parts of NER and linking (NER+L).

In [None]:
# Instruction text that is prepended to the clinical note when calling the LLM.
# The field names mentioned here must match the attribute names of the response
# schema (`raw_string`, `standardised_biomedical_name`) so the model is not given
# two different names for the same output field.
system_prompt = '''You are a medical text extraction assistant specialized in converting clinical narratives into structured JSON data.
Your task is to extract all disease mentions from the text.

Output instructions:
- raw_string - this represents the original string of the entity found in the input text
- standardised_biomedical_name - this is the standardised name of the entity found in the input text (e.g. if the raw_string is "T1DM" the standardised_biomedical_name should be "Type 1 Diabetes Mellitus")

Input:
'''

# One extracted mention, used as the element type of the structured LLM response.
#   raw_string                   - the mention as it appears in the input text
#   standardised_biomedical_name - the standardised (expanded) name for that mention
# Both fields default to None even though they are annotated as `str`.
# NOTE(review): `nullable=True` is not a regular pydantic `Field` argument; presumably it is
# meant to end up in the JSON schema so the LLM API treats the field as nullable - confirm
# against the installed pydantic / google-genai versions before changing the annotation.
# `#` comments are used rather than a class docstring so the generated schema is unchanged.
class Entity(BaseModel):
    raw_string: str = Field(default=None, nullable=True)
    standardised_biomedical_name: str = Field(default=None, nullable=True)

# Top-level response schema: a single `entities` list holding one Entity per extracted mention.
# It is passed to the LLM as the response schema and used to validate the returned JSON.
# `#` comments are used rather than a class docstring so the generated schema is unchanged.
class Entities(BaseModel):
    entities: List[Entity]

In [None]:
# Request JSON output that follows the `Entities` schema, so the reply can be
# validated directly instead of being parsed out of free text.
generation_config = {
    'response_mime_type': 'application/json',
    'response_schema': Entities,
}

# The instructions and the clinical note are sent together as one prompt,
# separated by a newline.
response = client.models.generate_content(
    model='gemini-2.0-flash',
    contents=f"{system_prompt}\n{clinical_note}",
    config=generation_config,
)

In [None]:
# Validate the JSON text returned by the LLM against the `Entities` schema.
response_json = response.text
entities = Entities.model_validate_json(response_json)

In [None]:
# One line per extracted mention: "<raw text> - <standardised name>".
for entity in entities.entities:
    print(f"{entity.raw_string} - {entity.standardised_biomedical_name}")

COPD - Chronic Obstructive Pulmonary Disease
HTN - Hypertension
T2DM - Type 2 Diabetes Mellitus
OA - Osteoarthritis
GERD - Gastroesophageal Reflux Disease
AF - Atrial Fibrillation
COPD - Chronic Obstructive Pulmonary Disease
HTN - Hypertension
T2DM - Type 2 Diabetes Mellitus
A-fib - Atrial Fibrillation
MDD - Major Depressive Disorder
OA - Osteoarthritis
GERD - Gastroesophageal Reflux Disease
Migraine - Migraine
Insomnia - Insomnia
Obesity - Obesity


## Linking

In [None]:
def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Pick, for every sequence in the batch, the hidden state of its last real token.

    Args:
        last_hidden_states: hidden states indexed as [batch, position, ...].
        attention_mask: mask indexed as [batch, position]; its per-row sum is used
            as the number of real (non-padding) tokens in that row.

    Returns:
        One hidden-state vector per batch row.
    """
    n_rows = attention_mask.shape[0]
    # If the final position is unmasked in every row, the last column already holds
    # each sequence's last token (this is the case with left padding).
    all_rows_end_on_token = attention_mask[:, -1].sum() == n_rows
    if all_rows_end_on_token:
        return last_hidden_states[:, -1]

    # Otherwise the last real token of a row sits at index (number of real tokens - 1).
    last_positions = attention_mask.sum(dim=1) - 1
    row_ids = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_ids, last_positions]


def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build a two-line prompt: an `Instruct:` line followed by a `Query:` line."""
    instruct_line = f'Instruct: {task_description}'
    query_line = f'Query: {query}'
    return '\n'.join((instruct_line, query_line))

In [None]:
# Both objects are loaded on the CPU by default. Embedding a whole ontology on the CPU
# is slow, but embedding a handful of queries is fine.
embedding_model_name = 'Alibaba-NLP/gte-Qwen2-1.5B-instruct'
tokenizer = AutoTokenizer.from_pretrained(embedding_model_name, trust_remote_code=True)
model = AutoModel.from_pretrained(embedding_model_name, trust_remote_code=True)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [None]:
# Maximum number of tokens kept per name when tokenising concept and entity names;
# it is passed as `max_length` together with `truncation=True`, so longer names are cut.
MAX_CONCEPT_NAME_LENGTH = 64 # Even this is too much

In [None]:
# Use the GPU when one is available and otherwise stay on the CPU, instead of
# crashing with a CUDA error on runtimes without a GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
_ = model.to(device)  # assign to `_` so the cell does not print the model repr

In [None]:
# Parallel lists: codes[i] is the SNOMED code of names[i].
codes = [disease['snomed_code'] for disease in cdb['diseases']]
names = [disease['name'] for disease in cdb['diseases']]

# Let's embed all the names, very fast on GPUs (if you are embedding a big CDB consider batching the input)
# Normally this would be saved somewhere, maybe even a vector database can be used, but I do not see a need for it (even in the case of a full SNOMED/UMLS)
# The batch is moved to whatever device the model currently lives on, so this also works on CPU.
batch_dict = tokenizer(names, max_length=MAX_CONCEPT_NAME_LENGTH, padding=True, truncation=True, return_tensors='pt').to(model.device)

# Inference only: without no_grad() the forward pass records an autograd graph,
# which wastes memory and grows quickly for larger ontologies.
with torch.no_grad():
    outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# normalize embeddings (unit L2 norm per row, so a dot product is the cosine similarity)
concept_embeddings = F.normalize(embeddings, p=2, dim=1)

In [None]:
# Embed the found entities.
# Use the standardised name and fall back to the raw mention when the LLM left the
# standardised name empty; this keeps the list index-aligned with `entities.entities`
# and avoids handing `None` to the tokenizer.
entity_names = [e.standardised_biomedical_name or e.raw_string for e in entities.entities]

# The batch is moved to whatever device the model currently lives on, so this also works on CPU.
batch_dict = tokenizer(entity_names, max_length=MAX_CONCEPT_NAME_LENGTH, padding=True, truncation=True, return_tensors='pt').to(model.device)

# Inference only: skip building the autograd graph to save memory.
with torch.no_grad():
    outputs = model(**batch_dict)
embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])

# normalize embeddings (unit L2 norm per row, so a dot product is the cosine similarity)
entity_embeddings = F.normalize(embeddings, p=2, dim=1)

In [None]:
# Dot products of the L2-normalised embeddings (i.e. cosine similarities), scaled by 100.
# Rows follow the extracted entities, columns follow the ontology concepts.
similarity = torch.matmul(entity_embeddings, concept_embeddings.T)
scores = similarity * 100

In [None]:
# argsort sorts ascending, so sort the negated scores to get, for every entity row,
# the concept indices ordered from best to worst match.
scores_np = scores.detach().cpu().numpy()
arg_scores = np.argsort(-scores_np)

In [None]:
# Print the best-scoring ontology concept (name, code, score) for every extracted entity.
for row, entity in enumerate(entities.entities):
    print(f"Raw text was: `{entity.raw_string}`, top candidate:")
    for rank in range(1):
        concept_idx = arg_scores[row][rank]
        concept_name = names[concept_idx]
        concept_code = codes[concept_idx]
        concept_score = scores[row][concept_idx]
        print(f"  - {concept_name} - ({concept_code}) - score: {concept_score:.2f}")
# The model is so good at standardisation that vector similarity is rarely needed for this small ontology, in case of big ontologies it will be much more important

Raw text was: `COPD`, top candidate:
  - Chronic Obstructive Pulmonary Disease - (13645005) - score: 100.00
Raw text was: `HTN`, top candidate:
  - Hypertension - (38341003) - score: 100.00
Raw text was: `T2DM`, top candidate:
  - Type 2 Diabetes Mellitus - (44054006) - score: 100.00
Raw text was: `OA`, top candidate:
  - Osteoarthritis - (396275006) - score: 100.00
Raw text was: `GERD`, top candidate:
  - Gastroesophageal Reflux Disease - (235595009) - score: 100.00
Raw text was: `AF`, top candidate:
  - Atrial Fibrillation - (49436004) - score: 100.00
Raw text was: `COPD`, top candidate:
  - Chronic Obstructive Pulmonary Disease - (13645005) - score: 100.00
Raw text was: `HTN`, top candidate:
  - Hypertension - (38341003) - score: 100.00
Raw text was: `T2DM`, top candidate:
  - Type 2 Diabetes Mellitus - (44054006) - score: 100.00
Raw text was: `A-fib`, top candidate:
  - Atrial Fibrillation - (49436004) - score: 100.00
Raw text was: `MDD`, top candidate:
  - Major Depressive Disorde

In [None]:
# Print the full clinical note text.
print(clinical_note)

PROGRESS NOTE
Date: 03/15/2025
Patient: Johnson, Robert M.
DOB: 05/22/1958
MRN: 763421
Provider: Dr. Michaels

SUBJECTIVE:
67y.o. male presents for f/u of multiple chronic conditions. Pt reports worsening SOB over past 2 wks, especially w/ exertion. C/o intermittent chest discomfort. Has been using rescue inhaler 3-4x/day for COPD sx. Reports poor sleep (only 4-5 hrs/night) w/ early waking. HTN meds "seem to be working ok." Denies changes in BG readings for T2DM. Reports occ. HA but attributes to "usual migraines." Recent weight gain of ~5 lbs, which pt attributes to decreased activity due to knee pain from OA. Continues to have reflux sx despite compliance w/ GERD meds. No new fx episodes per pt, although notes "feeling more down" since last visit. AF reportedly stable per home monitoring. No orthopnea or PND noted per pt.
OBJECTIVE:
VS: BP 148/92, HR 88 (irregularly irregular), RR 18, T 98.4°F, SpO2 93% on RA
Wt: 102.4 kg (+2.2 kg from last visit)
GEN: Alert, appears older than state