<a href="https://colab.research.google.com/github/louisdennington-design/decision-tree-dissertation/blob/main/medcat_normalisation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [5]:
# Mount Google Drive at /content/drive (later cells read from paths under it).
# Manual reset commands kept for reference (presumably for a stuck mount - run by hand if needed):
#   drive.flush_and_unmount()
#   !rm -rf /content/drive
#   !mkdir -p /content/drive

from google.colab import drive

# force_remount=True re-mounts even when /content/drive is already mounted.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [3]:
!pip install -q medcat
!python -m spacy download en_core_web_md
from medcat.cat import CAT
import os
import json

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/307.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m307.2/307.8 kB[0m [31m9.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m307.8/307.8 kB[0m [31m6.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting en-core-web-md==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_md-3.8.0/en_core_web_md-3.8.0-py3-none-any.whl (33.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m33.5/33.5 MB[0m [31m53.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: en-core-web-md
Successfully installed en-core-web-md-3.8.0
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_md')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
or

In [6]:
# Load the MedCAT model pack from Drive.
# The archive is almost 1GB, so this cell takes a while.

# Where the model pack lives, and which archive to load.
medcat_zipfile_location = "/content/drive/My Drive/Colab Notebooks/Dissertation/MedCAT"
medcat_model_zipfile = 'v2_Snomed2025_MIMIC_IV_bbe806e192df009f.zip'
MEDCAT_LOAD = os.path.join(medcat_zipfile_location, medcat_model_zipfile)

# Create the folder if it is missing (no-op when it already exists).
os.makedirs(medcat_zipfile_location, exist_ok=True)

# `cat` is the loaded model used by the later annotation cells.
cat = CAT.load_model_pack(MEDCAT_LOAD)



In [7]:
# Input locations on the mounted Drive.

# Folder that holds the JSON files.
LOAD_PATH = "/content/drive/My Drive/Colab Notebooks/Dissertation/JSON"
# Full path of the guideline_structured.json file inside that folder.
LOAD_GUIDELINE_STRUCTURED = os.path.join(LOAD_PATH, "guideline_structured.json")

In [8]:
# Load JSON

def load_json(file_path):
    """Read a UTF-8 JSON file and return the parsed content.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the file (list, dict, ...).

    Raises:
        FileNotFoundError: If ``file_path`` does not exist; the message
            names the missing path.
    """
    try:
        handle = open(file_path, "r", encoding="utf-8")
    except FileNotFoundError:
        # Re-raise with a message that spells out which file was expected.
        raise FileNotFoundError(f'JSON file not found: {file_path}')

    # The context manager closes the file once parsing is done (or fails).
    with handle:
        return json.load(handle)

# Load the guideline records and sanity-check what came back.
guideline_structured = load_json(LOAD_GUIDELINE_STRUCTURED)

print(type(guideline_structured))
print(len(guideline_structured))
# Show only the first 1000 characters of the pretty-printed JSON instead of
# dumping every record into the cell output.
print(json.dumps(guideline_structured, ensure_ascii=False, indent=2)[:1000])

<class 'list'>
116


In [10]:
# Smoke test of the loaded model (example taken from the original documentation).

text = "The patient has a history of ectopic pregnancy but no current symptoms of miscarriage."

# get_entities returns a dictionary; the detected entities sit under its 'entities' key.
entities = cat.get_entities(text)

# Only the per-entity records are needed here, so iterate over the values.
for entity_data in entities['entities'].values():
    print(f"Term: {entity_data['source_value']}")
    print(f"SNOMED/CUI: {entity_data['cui']}")
    print(f"Confidence: {entity_data['context_similarity']}")

    # Meta-annotations (Status/Certainty), if the model provides them:
    print(f"Meta-Annotations: {entity_data.get('meta_anns')}")



Term: patient has
SNOMED/CUI: 25609006
Confidence: 1
Meta-Annotations: {}
Term: history of ectopic pregnancy
SNOMED/CUI: 161763005
Confidence: 1
Meta-Annotations: {}
Term: current
SNOMED/CUI: 15240007
Confidence: 1
Meta-Annotations: {}
Term: miscarriage
SNOMED/CUI: 17369002
Confidence: 1
Meta-Annotations: {}


In [12]:
# Source field -> key under which its MedCAT annotations are stored.
# Note that 'diagnoses' maps to the singular 'diagnosis_normed'; the output key
# names are kept exactly as before so downstream readers of the JSON still work.
_NORMED_FIELDS = {
    'diagnoses': 'diagnosis_normed',
    'current_medication': 'current_medication_normed',
    'physical_health_longterm': 'physical_health_longterm_normed',
    'physical_health_recent': 'physical_health_recent_normed',
}

# Default destination of the annotated JSON (same file the function always wrote).
_DEFAULT_NORMED_PATH = '/content/drive/My Drive/Colab Notebooks/Dissertation/JSON/guideline_structured_normed.json'


def _field_text(value):
    """Flatten one field value into the single string handed to the annotator."""
    if isinstance(value, list):
        # Skip None items and stringify the rest so a stray non-string entry
        # does not make str.join raise TypeError.
        return " ".join(str(item) for item in value if item is not None)
    # None / empty values become an empty string; other values pass through.
    return value or ""


def medcat_norm(structured_json_file, output_path=_DEFAULT_NORMED_PATH, annotator=None):
    """Annotate selected fields of every record with MedCAT and save the result.

    For each record, every field listed in ``_NORMED_FIELDS`` is flattened to
    one string and passed to ``get_entities``; the result is stored on the same
    record under the mapped ``*_normed`` key. The records are modified in place.

    Args:
        structured_json_file: Iterable of dict records (the outer JSON list).
        output_path: File the annotated records are written to as UTF-8 JSON.
            Defaults to the notebook's guideline_structured_normed.json path.
        annotator: Object exposing ``get_entities(text)``. When omitted, the
            notebook-level ``cat`` model is used.

    Returns:
        The same ``structured_json_file`` object, now carrying the added keys.
    """
    for entry in structured_json_file:
        # Snapshot the items: keys are added to `entry` while we walk over it.
        for key, value in list(entry.items()):
            target_key = _NORMED_FIELDS.get(key)
            if target_key is None:
                continue
            # Resolve the model lazily so the global is only needed when used.
            model = annotator if annotator is not None else cat
            entry[target_key] = model.get_entities(_field_text(value))

    with open(output_path, "w", encoding="utf-8") as f:
        json.dump(structured_json_file, f, ensure_ascii=False, indent=2)

    return structured_json_file

In [14]:
# medcat_norm adds the *_normed keys to the records in place, writes them to
# disk, and returns the same list it was given.
data = medcat_norm(guideline_structured)

print(len(data))
# Preview the first annotated record (capped at 2000 characters) instead of
# printing every record with all of its entities.
print(json.dumps(data[:1], ensure_ascii=False, indent=2)[:2000])

