In [7]:
import pandas as pd
import re

mimic_df = pd.read_csv("output/mimiciv_subset_cleaned.csv")

with open("revised_input/mayoclinic_symptom_list.txt") as f:
    symptom_list = [line.strip().lower() for line in f if line.strip()]

symptom_pattern = re.compile(r'\b(' + '|'.join(re.escape(sym) for sym in symptom_list) + r')\b', re.IGNORECASE)

def extract_findings(text):
    return list(set(symptom_pattern.findall(text)))

mimic_df["findings"] = mimic_df["text"].apply(extract_findings)

In [8]:
output_filename = "output/mimiciv_subset_cleaned_w_findings.csv"
mimic_df.to_csv(output_filename, index=False)

print(f"✅ Subset exported to {output_filename} with {len(mimic_df)} rows.")

✅ Subset exported to output/mimiciv_subset_cleaned_w_findings.csv with 1000 rows.


In [13]:
import pandas as pd
input_filename = "output/mimiciv_subset_cleaned_w_findings.csv"
mimic_df = pd.read_csv(input_filename)
mimic_df

Unnamed: 0,subject_id,hadm_id,icd_code,long_title,text,findings
0,12468016,20568160,Z7901,Long term (current) use of anticoagulants,name no admission date discharge date date of ...,['nausea']
1,14597501,22057711,V4986,Do not resuscitate status,name unit no admission date discharge date dat...,"['fatigue', 'fever', 'back pain', 'chills']"
2,15175804,29647957,M542,Cervicalgia,name unit no admission date discharge date dat...,"['diarrhea', 'vomiting', 'nausea', 'cough', 'b..."
3,10774186,26379427,80504,Closed fracture of fourth cervical vertebra,name unit no admission date discharge date dat...,"['diarrhea', 'numbness', 'nausea', 'cough', 's..."
4,15295532,26998628,60000,Hypertrophy (benign) of prostate without urina...,name unit no admission date discharge date dat...,"['diarrhea', 'shortness of breath', 'numbness'..."
...,...,...,...,...,...,...
995,18932584,24510713,2841,Pancytopenia,name unit no admission date discharge date dat...,"['diarrhea', 'shortness of breath', 'vomiting'..."
996,14538142,25693064,K219,Gastro-esophageal reflux disease without esoph...,name unit no admission date discharge date dat...,"['diarrhea', 'shortness of breath', 'vomiting'..."
997,16337384,29662980,V5861,Long-term (current) use of anticoagulants,name unit no admission date discharge date dat...,"['diarrhea', 'night sweats', 'shortness of bre..."
998,16339701,27079563,27800,"Obesity, unspecified",name unit no admission date discharge date dat...,"['fatigue', 'insomnia', 'shortness of breath',..."


In [24]:
from collections import defaultdict


kb = defaultdict(lambda: defaultdict(int))

for _, row in mimic_df.iterrows():
    disease = row["long_title"]
    findings = eval(row["findings"])
    
    for finding in list(findings):
        kb[disease][finding] += 1

normalized_kb = {}
for diagnosis, findings in kb.items():
    total = sum(findings.values())
    normalized_kb[diagnosis] = {
        symptom: count / total
        for symptom, count in findings.items()
    }

kb

defaultdict(<function __main__.<lambda>()>,
            {'Long term (current) use of anticoagulants': defaultdict(int,
                         {'nausea': 2,
                          'diarrhea': 2,
                          'insomnia': 1,
                          'shortness of breath': 3,
                          'leg swelling': 2,
                          'cough': 2,
                          'abdominal pain': 2,
                          'fever': 1,
                          'chills': 2,
                          'numbness': 1,
                          'fatigue': 2,
                          'back pain': 2,
                          'vomiting': 1,
                          'foot pain': 1}),
             'Do not resuscitate status': defaultdict(int,
                         {'fatigue': 2,
                          'fever': 1,
                          'back pain': 1,
                          'chills': 2,
                          'headache': 1,
                          'nausea'

In [25]:
import json

kb_output_csv = "output/mimic_4_kb_w_freq.csv"
kb_output_json = "output/mimic_4_kb_w_freq.json"

# === SAVE AS JSON ===
with open(kb_output_json, "w") as f:
    json.dump(normalized_kb, f, indent=2)

# === SAVE AS CSV (flattened) ===
flattened = []
for disease, findings in normalized_kb.items():
    for symptom, freq in findings.items():
        flattened.append({"diagnosis": disease, "symptom": symptom, "frequency": freq})

df_kb = pd.DataFrame(flattened)
df_kb.to_csv(kb_output_csv, index=False)

print(f"✅ Knowledge base created with {len(normalized_kb)} diagnoses.")
print(f"→ Saved to:\n- {kb_output_json}\n- {kb_output_csv}")

✅ Knowledge base created with 567 diagnoses.
→ Saved to:
- output/mimic_4_kb_w_freq.json
- output/mimic_4_kb_w_freq.csv
