In [2]:
import pandas as pd
import re

# Load the note table; later cells read its "text" and "long_title" columns.
mimic_df = pd.read_csv("mimiciv_subset_cleaned.csv")

# Symptom vocabulary, one term per line. Blank lines are dropped: an empty
# alternative in the regex below would match the empty string at every word
# boundary and pollute the findings with ''.
with open("mayoclinic_symptom_list.txt") as f:
    symptom_list = [term for term in (line.strip() for line in f) if term]

if not symptom_list:
    # Fail fast: with no terms the pattern would degenerate to r'\b()\b'.
    raise ValueError("mayoclinic_symptom_list.txt contains no symptom terms")

# Regex alternation takes the FIRST alternative that matches, not the longest,
# so "chest" listed before "chest pain" would hide the longer term. Trying the
# longest terms first fixes that; the secondary sort key keeps the pattern
# deterministic, and set() removes duplicate terms.
symptom_alternatives = sorted(set(symptom_list), key=lambda term: (-len(term), term))
symptom_pattern = re.compile(
    r'\b(' + '|'.join(re.escape(sym) for sym in symptom_alternatives) + r')\b',
    re.IGNORECASE,
)

def extract_findings(text, pattern=None):
    """Return the distinct symptom terms mentioned in ``text``.

    Parameters
    ----------
    text : str
        Free-text note to scan. Any non-string value (for example the float
        NaN pandas uses for a missing cell, or None) yields an empty list
        instead of raising a TypeError.
    pattern : compiled regular expression, optional
        Pattern whose full matches are the findings. Defaults to the
        module-level ``symptom_pattern``.

    Returns
    -------
    list of str
        Unique matches, lower-cased and sorted alphabetically. Lower-casing
        is needed because the matching is case-insensitive: without it
        "Fever" and "fever" would be reported as two different findings.
        Sorting makes the result reproducible across runs (set iteration
        order of strings is not).
    """
    if not isinstance(text, str):
        return []
    if pattern is None:
        pattern = symptom_pattern
    # group(0) is the whole match, so this works regardless of how many
    # capture groups the pattern has; empty matches are never findings.
    found = {match.group(0).lower() for match in pattern.finditer(text) if match.group(0)}
    return sorted(found)

# Attach a "findings" column: the list of symptom terms extracted from each
# row's "text" value.
mimic_df["findings"] = mimic_df["text"].map(extract_findings)

In [3]:
# Write the annotated table (including the "findings" column) to disk without
# the positional index, then report how many rows were written.
output_filename = "mimiciv_subset_cleaned_w_findings.csv"
mimic_df.to_csv(path_or_buf=output_filename, index=False)

export_message = f"✅ Subset exported to {output_filename} with {len(mimic_df)} rows."
print(export_message)

✅ Subset exported to mimiciv_subset_cleaned_w_findings.csv with 1000 rows.


In [6]:
from collections import defaultdict

# kb[diagnosis][finding] -> number of rows with that diagnosis title whose
# findings list contains the finding.
kb = defaultdict(lambda: defaultdict(int))

# Walk the two relevant columns in lockstep instead of materialising a Series
# per row. The kb[disease] entry is only created inside the inner loop, so a
# diagnosis whose rows have no findings never appears in the knowledge base.
for disease, findings in zip(mimic_df["long_title"], mimic_df["findings"]):
    for finding in findings:
        kb[disease][finding] += 1

# Convert raw counts to shares: each count is divided by the sum of all
# finding counts recorded for the same diagnosis, so the values of every
# per-diagnosis dict add up to 1.
normalized_kb = {}
for diagnosis, counts in kb.items():
    total = sum(counts.values())
    shares = {}
    for symptom, count in counts.items():
        shares[symptom] = count / total
    normalized_kb[diagnosis] = shares

In [None]:
kb_output_csv = "mimic_4_kb_w_freq.csv"

# Flatten the nested {diagnosis: {symptom: frequency}} mapping into one record
# per (diagnosis, symptom) pair.
flattened = []
for disease, findings in normalized_kb.items():
    for symptom, freq in findings.items():
        flattened.append({"diagnosis": disease, "symptom": symptom, "frequency": freq})

# Name the columns explicitly: if the knowledge base is empty, a DataFrame
# built from [] would have no columns and the CSV would be written without a
# header row. For non-empty input the result is unchanged.
df_kb = pd.DataFrame(flattened, columns=["diagnosis", "symptom", "frequency"])
df_kb.to_csv(kb_output_csv, index=False)

print(f"✅ Knowledge base created with {len(normalized_kb)} diagnoses.")
print(f"→ Saved to:\n- {kb_output_csv}")

✅ Knowledge base created with 541 diagnoses.
→ Saved to:
- mimic_4_kb_w_freq
