In [1]:
import pandas as pd
import re

mimic_df = pd.read_csv("output/mimiciv_subset_cleaned.csv")

with open("revised_input/mayoclinic_symptom_list.txt") as f:
    symptom_list = [line.strip().lower() for line in f if line.strip()]

symptom_pattern = re.compile(r'\b(' + '|'.join(re.escape(sym) for sym in symptom_list) + r')\b', re.IGNORECASE)

def extract_findings(text):
    return list(set(symptom_pattern.findall(text)))

mimic_df["findings"] = mimic_df["text"].apply(extract_findings)

In [None]:
# output_filename = "output/mimiciv_subset_cleaned_w_findings.csv"
# mimic_df.to_csv(output_filename, index=False)

# print(f"✅ Subset exported to {output_filename} with {len(mimic_df)} rows.")

✅ Subset exported to output/mimiciv_subset_cleaned_w_findings.csv with 1000 rows.


In [27]:
# import pandas as pd
# input_filename = "output/mimiciv_subset_cleaned_w_findings.csv"
# mimic_df = pd.read_csv(input_filename)
mimic_df.head(10)

Unnamed: 0,subject_id,hadm_id,icd_code,long_title,text,findings
0,13509314,28507435,78061,Fever presenting with conditions classified el...,name unit no admission date discharge date dat...,"[diarrhea, shortness of breath, fatigue, nause..."
1,15246843,22848512,C787,Secondary malignant neoplasm of liver and intr...,name unit no admission date discharge date dat...,"[fatigue, abdominal pain]"
2,18129598,27958727,70724,"Pressure ulcer, stage IV",name unit no admission date discharge date dat...,"[diarrhea, insomnia, shortness of breath, fati..."
3,18298823,23855380,Z944,Liver transplant status,name unit no admission date discharge date dat...,"[diarrhea, shortness of breath, numbness, vomi..."
4,16639866,29978258,30000,"Anxiety state, unspecified",name unit no admission date discharge date dat...,"[fever, vomiting, shortness of breath, chills]"
5,15783046,20606881,V600,Driver of heavy transport vehicle injured in c...,name unit no admission date discharge date dat...,"[wheezing, cough, shortness of breath, dizziness]"
6,13555772,28677947,2448,Other specified acquired hypothyroidism,name unit no admission date discharge date dat...,"[diarrhea, vomiting, headache, nausea]"
7,16442091,20054838,K8051,Calculus of bile duct without cholangitis or c...,name unit no admission date discharge date dat...,"[shortness of breath, nausea, cough, abdominal..."
8,19695008,24342199,1911,Malignant neoplasm of frontal lobe,name unit no admission date discharge date dat...,"[fatigue, fever, numbness, headache]"
9,19997886,20793010,K3189,Other diseases of stomach and duodenum,name unit no admission date discharge date dat...,"[shortness of breath, cough, dizziness, fever,..."


In [2]:
from collections import defaultdict


kb = defaultdict(lambda: defaultdict(int))

for _, row in mimic_df.iterrows():
    disease = row["long_title"]
    findings = row["findings"]
    
    for finding in list(findings):
        kb[disease][finding] += 1

normalized_kb = {}
for diagnosis, findings in kb.items():
    total = sum(findings.values())
    normalized_kb[diagnosis] = {
        symptom: count / total
        for symptom, count in findings.items()
    }


In [4]:
import json

kb_output_json = "output/mimic_4_kb_w_freq.json"

# === SAVE AS JSON ===
with open(kb_output_json, "w") as f:
    json.dump(normalized_kb, f, indent=2)

# === SAVE AS CSV (flattened) ===
kb_output_csv = "output/mimic_4_kb_w_freq.csv"
flattened = []
for disease, findings in normalized_kb.items():
    for symptom, freq in findings.items():
        flattened.append({"diagnosis": disease, "symptom": symptom, "frequency": freq})

df_kb = pd.DataFrame(flattened)
df_kb.to_csv(kb_output_csv, index=False)

print(f"✅ Knowledge base created with {len(normalized_kb)} diagnoses.")

✅ Knowledge base created with 141 diagnoses.
