In [None]:
import pandas as pd
from os.path import join

# Notebook display configuration: show up to 200 columns / rows and let wide
# frames wrap across lines instead of being truncated.
pd.options.display.max_columns = 200
pd.options.display.max_rows = 200
pd.options.display.min_rows = 200
pd.options.display.expand_frame_repr = True

# Root folders used throughout the notebook. Both are empty placeholders here
# and have to be filled in before running.
#   base_mimic: root under which 'hosp/ClinicalBenchmark' and 'ed' are addressed below.
#   base_new:   folder the generated pickles and CSV files are written to / read from.
base_mimic = ''
base_new = ""

# Dataset

## Functions

In [None]:
from dataset.dataset import load_data, extract_info, extract_hadm_ids, extract_hadm_ids_filter_cc

## Pathologies

In [None]:
# Load data
#
# reload=True  -> build the nine tables with load_data(base_mimic) and cache each
#                 of them as a CSV file in <base_mimic>/hosp/ClinicalBenchmark.
# reload=False -> skip load_data and read the cached CSV files back instead.
from os import makedirs

reload = True

# Single definition of the cache folder, shared by the write and the read branch.
cache_dir = join(base_mimic, 'hosp', 'ClinicalBenchmark')

if reload:
    (admissions_df, transfers_df, diag_icd, procedures_df, discharge_df,
     radiology_report_df, radiology_report_details_df, lab_events_df,
     microbiology_df) = load_data(base_mimic)

    # Make sure the cache folder exists; otherwise the first to_csv below would
    # fail after the load has already been done.
    makedirs(cache_dir, exist_ok=True)
    for frame, csv_name in [
        (admissions_df, 'admissions.csv'),
        (transfers_df, 'transfers.csv'),
        (diag_icd, 'diagnoses_icd.csv'),
        (procedures_df, 'procedures_icd.csv'),
        (discharge_df, 'discharge_notes.csv'),
        (radiology_report_df, 'radiology_reports.csv'),
        (radiology_report_details_df, 'radiology_report_details.csv'),
        (lab_events_df, 'labevents.csv'),
        (microbiology_df, 'microbiologyevents.csv'),
    ]:
        frame.to_csv(join(cache_dir, csv_name), index=False)
else:
    admissions_df = pd.read_csv(join(cache_dir, 'admissions.csv'))
    transfers_df = pd.read_csv(join(cache_dir, 'transfers.csv'))
    diag_icd = pd.read_csv(join(cache_dir, 'diagnoses_icd.csv'))
    procedures_df = pd.read_csv(join(cache_dir, 'procedures_icd.csv'))
    discharge_df = pd.read_csv(join(cache_dir, 'discharge_notes.csv'))
    radiology_report_df = pd.read_csv(join(cache_dir, 'radiology_reports.csv'))
    radiology_report_details_df = pd.read_csv(join(cache_dir, 'radiology_report_details.csv'))
    lab_events_df = pd.read_csv(join(cache_dir, 'labevents.csv'))
    microbiology_df = pd.read_csv(join(cache_dir, 'microbiologyevents.csv'))

### Appendicitis

In [None]:
# Grab all hadm_ids with appendicitis (search term 'acute appendicitis',
# looked up with the diag_icd and discharge_df tables loaded above).
app_hadm_ids = extract_hadm_ids(
    'acute appendicitis',
    diag_icd,
    discharge_df,
)

In [None]:
# Build the appendicitis cohort; extract_info returns two results that are kept
# as the full cohort and its "_clean" counterpart.
app_hadm_info, app_hadm_info_clean = extract_info(
    app_hadm_ids,
    'appendicitis',
    ['acute appendicitis', 'appendicitis', 'appendectomy'],
    discharge_df,
    admissions_df,
    transfers_df,
    lab_events_df,
    microbiology_df,
    radiology_report_df,
    radiology_report_details_df,
    diag_icd,
    procedures_df,
)

### Cholecystitis

In [None]:
# Grab all hadm_ids for the search term 'acute cholecystitis'.
cholec_hadm_ids = extract_hadm_ids(
    'acute cholecystitis',
    diag_icd,
    discharge_df,
)

In [None]:
# Build the cholecystitis cohort (full and "_clean" results).
# NOTE(review): the appendicitis and pancreatitis calls list an '-ectomy' term
# ('appendectomy', 'pancreatectomy') while this one lists 'cholecystostomy' —
# confirm that this is intentional.
cholec_hadm_info, cholec_hadm_info_clean = extract_info(
    cholec_hadm_ids,
    'cholecystitis',
    ['acute cholecystitis', 'cholecystitis', 'cholecystostomy'],
    discharge_df,
    admissions_df,
    transfers_df,
    lab_events_df,
    microbiology_df,
    radiology_report_df,
    radiology_report_details_df,
    diag_icd,
    procedures_df,
)

### Pancreatitis

In [None]:
# Grab all hadm_ids for the search term 'acute pancreatitis'.
pancr_hadm_ids = extract_hadm_ids(
    'acute pancreatitis',
    diag_icd,
    discharge_df,
)

In [None]:
# Build the pancreatitis cohort (full and "_clean" results).
pancr_hadm_info, pancr_hadm_info_clean = extract_info(
    pancr_hadm_ids,
    'pancreatitis',
    ['acute pancreatitis', 'pancreatitis', 'pancreatectomy'],
    discharge_df,
    admissions_df,
    transfers_df,
    lab_events_df,
    microbiology_df,
    radiology_report_df,
    radiology_report_details_df,
    diag_icd,
    procedures_df,
)

### Diverticulitis

In [None]:
# Grab all hadm_ids for the search term 'diverticulitis'. Unlike the three calls
# above, diag_counts and cc are passed explicitly here.
divert_hadm_ids = extract_hadm_ids(
    'diverticulitis',
    diag_icd,
    discharge_df,
    diag_counts=30,
    cc=10,
)

In [None]:
# Build the diverticulitis cohort (full and "_clean" results).
divert_hadm_info, divert_hadm_info_clean = extract_info(
    divert_hadm_ids,
    'diverticulitis',
    ['acute diverticulitis', 'diverticulitis'],
    discharge_df,
    admissions_df,
    transfers_df,
    lab_events_df,
    microbiology_df,
    radiology_report_df,
    radiology_report_details_df,
    diag_icd,
    procedures_df,
)

### Filter for First-Diagnosis Cases and the Doctor-Evaluation Subset

In [None]:
import pickle
from utils.nlp import extract_primary_diagnosis

# The four target pathologies, in the order used for the cohort dicts below.
all_pathos = ['appendicitis', 'cholecystitis', 'pancreatitis', 'diverticulitis']

dr_eval = {}

# Hand-picked evaluation subset: 20 sampled cases for each of the four target
# pathologies and 5 cases for each of the four other diagnoses.
dr_eval['appendicitis'] = [20414022, 20921058, 21528320, 22360162, 23101737, 23459798, 23472780, 23553042, 24613821, 25579760, 25731420, 26064146, 27022057, 27260340, 28174867, 28466255, 29080331, 29468247, 29646721, 29815898]
dr_eval['cholecystitis'] = [20491815, 22023307, 22386848, 22825632, 23322902, 24642301, 24646115, 25643992, 26014747, 26146550, 26286187, 26354137, 26679345, 26983655, 27286714, 28342261, 28862495, 29573603, 29580001, 29723478]
dr_eval['pancreatitis'] = [20275938, 20464014, 20804346, 21238215, 21285450, 21849575, 22778345, 23507935, 23869693, 24338433, 24571788, 24706695, 25693057, 25706907, 25779570, 26086670, 26351914, 27875265, 29037588, 29413431]
dr_eval['diverticulitis'] = [20348908, 20754081, 21177686, 21233315, 21793374, 21906103, 22631597, 24009412, 24188879, 25568418, 25682814, 26581302, 27371462, 27794752, 27989275, 28678157, 28967154, 29137933, 29270681, 29781321]
dr_eval['gastritis'] = [23541137, 25942424, 27148050, 29661958, 29405818]
dr_eval['urinary_tract_infection'] = [23228674, 21812195, 28441616, 26600738, 27795432]
dr_eval['esophageal_reflux'] = [27209421, 22004397, 27318752, 27297450, 29649502]
dr_eval['hernia'] = [28020857, 24364147, 21309128, 26512162, 26027327]

# Manual corrections after case review. These ids have multiple diagnoses of our
# abdominal pathologies and are thus too nonspecific; they are always skipped.
multi_diag_ids = [26769588, 24309551, 20525915, 23074436]

# id_difficulty[patho] -> {'first_diag': [...ids...], 'dr_eval': [...ids...]}
id_difficulty = {}
for patho, hadm_info in zip(all_pathos,
                            [app_hadm_info_clean, cholec_hadm_info_clean, pancr_hadm_info_clean, divert_hadm_info_clean]):
    first_diag_ids = []
    for p in hadm_info:
        if p in multi_diag_ids:
            continue
        dd = hadm_info[p]['Discharge Diagnosis'].lower()
        first_diag = extract_primary_diagnosis(dd)
        # Keep the admission only when the pathology name appears in the
        # extracted primary diagnosis (a falsy result means nothing was extracted).
        if first_diag and patho in first_diag.lower():
            first_diag_ids.append(p)

    id_difficulty[patho] = {'first_diag': first_diag_ids, 'dr_eval': dr_eval[patho]}
    print(f"There are {len(first_diag_ids)} {patho} cases with first diagnosis out of {len(hadm_info)} total cases")
    print()

# The remaining diagnoses only carry a 'dr_eval' list (no 'first_diag' entry).
for other_patho in ['gastritis', 'urinary_tract_infection', 'esophageal_reflux', 'hernia']:
    id_difficulty[other_patho] = {'dr_eval': dr_eval[other_patho]}

# Use a context manager so the pickle file is flushed and closed deterministically.
with open(join(base_new, 'id_difficulty.pkl'), 'wb') as f:
    pickle.dump(id_difficulty, f)

In [None]:
import pickle
# Reload the id lists written by the previous cell, then store, per pathology,
# the subset of the full (non-"_clean") cohort whose ids are in 'first_diag'.
# All files are opened through context managers so the handles are closed.
with open(join(base_new, "id_difficulty.pkl"), "rb") as f:
    id_difficulty = pickle.load(f)

for patho, hadm_info in zip(['appendicitis', 'cholecystitis', 'pancreatitis', 'diverticulitis'],
                            [app_hadm_info, cholec_hadm_info, pancr_hadm_info, divert_hadm_info]):
    hadm_info_firstdiag = {_id: hadm_info[_id] for _id in id_difficulty[patho]['first_diag']}
    with open(join(base_new, f"{patho}_hadm_info_first_diag.pkl"), "wb") as f:
        pickle.dump(hadm_info_firstdiag, f)

## Full Dataset

In [None]:
from dataset.utils import load_hadm_from_file

# For every pathology load three stored variants from base_new, in this order:
# '<patho>_hadm_info', '<patho>_hadm_info_clean', '<patho>_hadm_info_first_diag'.
app_hadm_info, app_hadm_info_clean, app_hadm_info_firstdiag = (
    load_hadm_from_file(f'appendicitis_hadm_info{variant}', base_new)
    for variant in ('', '_clean', '_first_diag')
)

cholec_hadm_info, cholec_hadm_info_clean, cholec_hadm_info_firstdiag = (
    load_hadm_from_file(f'cholecystitis_hadm_info{variant}', base_new)
    for variant in ('', '_clean', '_first_diag')
)

pancr_hadm_info, pancr_hadm_info_clean, pancr_hadm_info_firstdiag = (
    load_hadm_from_file(f'pancreatitis_hadm_info{variant}', base_new)
    for variant in ('', '_clean', '_first_diag')
)

divert_hadm_info, divert_hadm_info_clean, divert_hadm_info_firstdiag = (
    load_hadm_from_file(f'diverticulitis_hadm_info{variant}', base_new)
    for variant in ('', '_clean', '_first_diag')
)

# From here on the *_hadm_ids lists refer to the first-diagnosis cohorts only.
app_hadm_ids = [*app_hadm_info_firstdiag.keys()]
cholec_hadm_ids = [*cholec_hadm_info_firstdiag.keys()]
pancr_hadm_ids = [*pancr_hadm_info_firstdiag.keys()]
divert_hadm_ids = [*divert_hadm_info_firstdiag.keys()]

import pickle
# Read the per-pathology id lists; the context manager closes the file handle
# that the bare open() call used to leave open.
with open(join(base_new, "id_difficulty.pkl"), "rb") as f:
    id_difficulty = pickle.load(f)

In [None]:
# Sizes of every cohort in its three variants: (all, clean, first diagnosis).
cohort_sizes = {
    'appendicitis': (len(app_hadm_info), len(app_hadm_info_clean), len(app_hadm_info_firstdiag)),
    'cholecystitis': (len(cholec_hadm_info), len(cholec_hadm_info_clean), len(cholec_hadm_info_firstdiag)),
    'pancreatitis': (len(pancr_hadm_info), len(pancr_hadm_info_clean), len(pancr_hadm_info_firstdiag)),
    'diverticulitis': (len(divert_hadm_info), len(divert_hadm_info_clean), len(divert_hadm_info_firstdiag)),
}

# One report section per variant: four per-pathology lines and a total,
# with a '---' separator between sections.
for variant_idx, suffix in enumerate(['', ' with clean data', ' with first diagnosis']):
    if variant_idx > 0:
        print('---')
    for cohort_name, sizes in cohort_sizes.items():
        print(f"There are {sizes[variant_idx]} {cohort_name} patients{suffix}")
    print(f"There are {sum(sizes[variant_idx] for sizes in cohort_sizes.values())} patients{suffix}")

#### Create the dataset for PhysioNet upload and sharing

In [None]:
def write_csv(hadm_info, filename, fields):
    """Append one CSV row per admission to ``filename`` inside ``base_new``.

    A row consists of the admission id (written unquoted) followed by the
    requested ``fields`` of that admission. Each field is converted with
    ``str``, embedded double quotes are doubled, surrounding whitespace is
    stripped and the result is wrapped in double quotes.

    The file is opened in append mode, so a header has to be written
    separately; ``base_new`` is read from the notebook's global scope.
    """
    target = join(base_new, filename)
    with open(target, 'a') as out:
        for hadm_id, record in hadm_info.items():
            out.write(f'{hadm_id},"')
            # Replace internal double quotes with two double quotes
            cells = [str(record[name]).replace('"', '""').strip() for name in fields]
            out.write('","'.join(cells) + '"\n')

# (Re)create every export file with its header row. Opening with 'w' truncates
# the file, so re-running this cell does not duplicate rows.
# The icd_procedures header lists the columns in the order the rows are written
# below (code, title, version); the previous header had title and version swapped,
# which mislabelled both columns for any reader that addresses them by name.
csv_headers = {
    'history_of_present_illness.csv': 'hadm_id,hpi\n',
    'physical_examination.csv': 'hadm_id,pe\n',
    'laboratory_tests.csv': 'hadm_id,itemid,valuestr,ref_range_lower,ref_range_upper\n',
    'microbiology.csv': 'hadm_id,test_itemid,valuestr,spec_itemid\n',
    'radiology_reports.csv': 'hadm_id,note_id,modality,region,exam_name,text\n',
    'discharge_diagnosis.csv': 'hadm_id,discharge_diagnosis\n',
    'icd_diagnosis.csv': 'hadm_id,icd_diagnosis\n',
    'discharge_procedures.csv': 'hadm_id,discharge_procedure\n',
    'icd_procedures.csv': 'hadm_id,icd_code,icd_title,icd_version\n',
}
for csv_name, header in csv_headers.items():
    with open(join(base_new, csv_name), "w") as f:
        f.write(header)

# Append the rows of each first-diagnosis cohort to the files created above.
# Text values are wrapped in double quotes with embedded quotes doubled.
for hadm_info in [app_hadm_info_firstdiag, cholec_hadm_info_firstdiag, pancr_hadm_info_firstdiag, divert_hadm_info_firstdiag]:
    write_csv(hadm_info, 'history_of_present_illness.csv', ["Patient History"])

    write_csv(hadm_info, 'physical_examination.csv', ["Physical Examination"])

    with open(join(base_new, 'laboratory_tests.csv'), 'a') as f:
        for _id in hadm_info.keys():
            for itemid, value in hadm_info[_id]["Laboratory Tests"].items():
                rr_lower = hadm_info[_id]["Reference Range Lower"][itemid]
                rr_upper = hadm_info[_id]["Reference Range Upper"][itemid]
                # Missing reference ranges are written as empty cells.
                if pd.isna(rr_lower):
                    rr_lower = ''
                if pd.isna(rr_upper):
                    rr_upper = ''
                value = value.replace('"', '""')
                f.write(f'{_id},{itemid},"{value}",{rr_lower},{rr_upper}\n')

    with open(join(base_new, 'microbiology.csv'), 'a') as f:
        for _id in hadm_info.keys():
            for itemid, value in hadm_info[_id]["Microbiology"].items():
                value = value.replace('"', '""')
                f.write(f'{_id},{itemid},"{value.strip()}",{hadm_info[_id]["Microbiology Spec"][itemid]}\n')

    with open(join(base_new, 'radiology_reports.csv'), 'a') as f:
        for _id in hadm_info.keys():
            for item in hadm_info[_id]["Radiology"]:
                # Escape every quoted cell, not only the report text, so a quote
                # in the metadata cannot break the row.
                modality = str(item["Modality"]).replace('"', '""')
                region = str(item["Region"]).replace('"', '""')
                exam_name = str(item["Exam Name"]).replace('"', '""')
                report = item["Report"].replace('"', '""')
                f.write(f'{_id},{item["Note ID"]},"{modality}","{region}","{exam_name}","{report}"\n')

    write_csv(hadm_info, 'discharge_diagnosis.csv', ["Discharge Diagnosis"])

    with open(join(base_new, 'icd_diagnosis.csv'), 'a') as f:
        for _id in hadm_info.keys():
            for diagnosis in hadm_info[_id]["ICD Diagnosis"]:
                diagnosis = diagnosis.replace('"', '""')
                f.write(f'{_id},"{diagnosis}"\n')

    with open(join(base_new, 'discharge_procedures.csv'), 'a') as f:
        for _id in hadm_info.keys():
            for procedure in hadm_info[_id]["Procedures Discharge"]:
                procedure = procedure.replace('"', '""')
                # Skip placeholder entries ("none", "___", "n/a", single characters).
                if procedure.lower() == "none" or procedure == "___" or procedure.lower() == "n/a" or len(procedure) == 1:
                    continue
                f.write(f'{_id},"{procedure}"\n')

    with open(join(base_new, 'icd_procedures.csv'), 'a') as f:
        for _id in hadm_info.keys():
            # All ICD-9 rows of an admission first, then its ICD-10 rows.
            for icd_version in (9, 10):
                procedure_codes = hadm_info[_id][f"Procedures ICD{icd_version}"]
                procedure_titles = hadm_info[_id][f"Procedures ICD{icd_version} Title"]
                for indx, procedure_code in enumerate(procedure_codes):
                    procedure = procedure_titles[indx].replace('"', '""')
                    f.write(f'{_id},{procedure_code},"{procedure}",{icd_version}\n')

In [None]:
from dataset.labs import generate_lab_test_mapping

# Placeholder for the MIMIC 'hosp' folder; must be filled in before running.
MIMIC_hosp_base = ""

# Presumably writes 'lab_test_mapping.pkl' into MIMIC_hosp_base (the next
# statement reads it from there) — confirm against dataset.labs.
generate_lab_test_mapping(MIMIC_hosp_base)

# Load the pickled mapping (closing the file afterwards) and export it as CSV
# next to the other dataset files.
with open(join(MIMIC_hosp_base, 'lab_test_mapping.pkl'), 'rb') as f:
    lab_test_mapping_df = pickle.load(f)
lab_test_mapping_df.to_csv(join(base_new, 'lab_test_mapping.csv'), index=False)

### Convert the PhysioNet dataset back into the project format

In [None]:
import ast
import pickle

# NOTE: this rebinds base_new for every cell that runs after this one.
base_new = "./hosp"

# The CSV stores 'corresponding_ids' as the text form of a list; parse it back
# and make sure every id is an int.
lab_test_mapping_df = pd.read_csv(join(base_new, 'lab_test_mapping.csv'))
lab_test_mapping_df['corresponding_ids'] = lab_test_mapping_df['corresponding_ids'].apply(
    lambda raw_ids: [int(i) for i in ast.literal_eval(raw_ids)]
)

# Write the pickle through a context manager so the file handle is closed.
with open(join(base_new, 'lab_test_mapping.pkl'), 'wb') as f:
    pickle.dump(lab_test_mapping_df, f)

In [None]:
import pandas as pd
from os.path import join

def update_hadm(base_new, filename, hadm_info, key, hadm_name, _list=False):
    """Copy one column of a CSV file into the per-admission dictionaries.

    Reads ``filename`` from ``base_new`` and, for every row, stores the value of
    column ``key`` under ``hadm_info[row['hadm_id']][hadm_name]``.

    With ``_list=False`` the value is assigned directly, so the last row of an
    admission wins. With ``_list=True`` the values are appended to a list that
    is created on first use.

    ``hadm_info`` is modified in place and also returned. Every hadm_id in the
    file must already be a key of ``hadm_info``.
    """
    table = pd.read_csv(join(base_new, filename))
    for _, record in table.iterrows():
        entry = hadm_info[record['hadm_id']]
        if not _list:
            # For single value fields
            entry[hadm_name] = record[key]
            continue
        entry.setdefault(hadm_name, []).append(record[key])
    return hadm_info

# Rebuild the per-admission dictionaries from the exported CSV files.
hadm_info = {}

# Create entries for all hadm_ids. Every container is created up front: the
# export writes no row for an empty container, so creating them lazily (only
# when a row is found) would leave admissions without e.g. microbiology or
# procedures lacking those keys, and later cells index them unconditionally.
hpi_df = pd.read_csv(join(base_new, 'history_of_present_illness.csv'))
hadm_ids = hpi_df["hadm_id"].to_list()
for _id in hadm_ids:
    hadm_info[_id] = {
        "Laboratory Tests": {},
        "Reference Range Lower": {},
        "Reference Range Upper": {},
        "Microbiology": {},
        "Microbiology Spec": {},
        "Radiology": [],
        "ICD Diagnosis": [],
        "Procedures Discharge": [],
        "Procedures ICD9": [],
        "Procedures ICD9 Title": [],
        "Procedures ICD10": [],
        "Procedures ICD10 Title": [],
    }

hadm_info = update_hadm(base_new, 'history_of_present_illness.csv', hadm_info, 'hpi', 'Patient History')

hadm_info = update_hadm(base_new, 'physical_examination.csv', hadm_info, 'pe', 'Physical Examination')

# Laboratory values and their reference ranges, keyed by itemid.
lab_events_df = pd.read_csv(join(base_new, 'laboratory_tests.csv'))
for _, row in lab_events_df.iterrows():
    _id = row['hadm_id']
    hadm_info[_id]["Laboratory Tests"][row['itemid']] = row['valuestr']
    hadm_info[_id]["Reference Range Lower"][row['itemid']] = row['ref_range_lower']
    hadm_info[_id]["Reference Range Upper"][row['itemid']] = row['ref_range_upper']

# Microbiology results and specimen ids, keyed by test_itemid.
microbiology_df = pd.read_csv(join(base_new, 'microbiology.csv'))
for _, row in microbiology_df.iterrows():
    _id = row['hadm_id']
    hadm_info[_id]["Microbiology"][row['test_itemid']] = row['valuestr']
    hadm_info[_id]["Microbiology Spec"][row['test_itemid']] = row['spec_itemid']

# Radiology reports, one dict per report in file order.
radiology_df = pd.read_csv(join(base_new, 'radiology_reports.csv'))
for _, row in radiology_df.iterrows():
    _id = row['hadm_id']
    hadm_info[_id]["Radiology"].append({"Note ID": row['note_id'], "Modality": row['modality'], "Region": row['region'], "Exam Name": row['exam_name'], "Report": row['text']})

hadm_info = update_hadm(base_new, 'discharge_diagnosis.csv', hadm_info, 'discharge_diagnosis', 'Discharge Diagnosis')

hadm_info = update_hadm(base_new, 'icd_diagnosis.csv', hadm_info, 'icd_diagnosis', 'ICD Diagnosis', _list=True)

hadm_info = update_hadm(base_new, 'discharge_procedures.csv', hadm_info, 'discharge_procedure', 'Procedures Discharge', _list=True)

# ICD procedures: rows with icd_version 9 go to the ICD9 lists, all others to ICD10.
icd_procedures_df = pd.read_csv(join(base_new, 'icd_procedures.csv'))
for _, row in icd_procedures_df.iterrows():
    _id = row['hadm_id']
    if row['icd_version'] == 9:
        hadm_info[_id]["Procedures ICD9"].append(row['icd_code'])
        hadm_info[_id]["Procedures ICD9 Title"].append(row['icd_title'])
    else:
        hadm_info[_id]["Procedures ICD10"].append(row['icd_code'])
        hadm_info[_id]["Procedures ICD10 Title"].append(row['icd_title'])

In [None]:
from utils.nlp import extract_primary_diagnosis

# Assign every admission to the pathology that is mentioned first in its
# primary discharge diagnosis.
patho_ids = {"appendicitis": [], "cholecystitis": [], "pancreatitis": [], "diverticulitis": []}

for _id, hadm in hadm_info.items():
    # The primary diagnosis does not depend on the pathology, so extract it once
    # per admission. A falsy result is treated as "no text".
    primary_diag = extract_primary_diagnosis(hadm["Discharge Diagnosis"].lower()) or ""
    primary_diag_lower = primary_diag.lower()

    possible_patho = ""
    possible_patho_index = float("Inf")
    for patho in patho_ids.keys():
        new_patho_index = primary_diag_lower.find(patho)
        if new_patho_index == -1:
            continue
        if new_patho_index < possible_patho_index:
            # Sanity check: an earlier mention replaces a pathology found before.
            if possible_patho:
                # Report which previously loaded first-diagnosis cohort holds this
                # id. Names and dicts are paired in matching order here. The
                # cohort dicts are only looked up in this branch, so the cell
                # still runs without them when no admission is ambiguous.
                for patho_hadm_info, fd_hadm_info in zip(
                    ["appendicitis", "cholecystitis", "pancreatitis", "diverticulitis"],
                    [app_hadm_info_firstdiag, cholec_hadm_info_firstdiag, pancr_hadm_info_firstdiag, divert_hadm_info_firstdiag],
                ):
                    if _id in fd_hadm_info:
                        print(patho_hadm_info)
                print(_id)
                print(f'Found multiple pathologies: {possible_patho} and {patho}')
                print(primary_diag)
                print("-----")
            possible_patho = patho
            possible_patho_index = new_patho_index
    # Sanity check: report admissions without any pathology and leave them
    # unassigned instead of failing on patho_ids[""].
    if not possible_patho:
        print(_id)
        print(f'Could not find pathology in {primary_diag}')
        print("-----")
        continue
    patho_ids[possible_patho].append(_id)

In [None]:
# Sanity check
# Sanity check: number of admissions assigned to each pathology, plus the total.
patho_counts = {patho_name: len(assigned_ids) for patho_name, assigned_ids in patho_ids.items()}
for patho_name, n_assigned in patho_counts.items():
    print(f'{patho_name}: {n_assigned}')
print(f'Total: {sum(patho_counts.values())}')

In [None]:
# Split the rebuilt hadm_info into one first-diagnosis dict per pathology,
# using the id lists collected in patho_ids.
app_hadm_info_firstdiag = {hadm_id: hadm_info[hadm_id] for hadm_id in patho_ids["appendicitis"]}
cholec_hadm_info_firstdiag = {hadm_id: hadm_info[hadm_id] for hadm_id in patho_ids["cholecystitis"]}
pancr_hadm_info_firstdiag = {hadm_id: hadm_info[hadm_id] for hadm_id in patho_ids["pancreatitis"]}
divert_hadm_info_firstdiag = {hadm_id: hadm_info[hadm_id] for hadm_id in patho_ids["diverticulitis"]}

In [None]:
from dataset.utils import write_hadm_to_file

# Store each first-diagnosis cohort as '<pathology>_hadm_info_first_diag' in base_new.
for cohort, patho_name in [
    (app_hadm_info_firstdiag, 'appendicitis'),
    (cholec_hadm_info_firstdiag, 'cholecystitis'),
    (pancr_hadm_info_firstdiag, 'pancreatitis'),
    (divert_hadm_info_firstdiag, 'diverticulitis'),
]:
    write_hadm_to_file(cohort, f'{patho_name}_hadm_info_first_diag', base_new)

### Dataset Stats

#### Lab Events and Radiology

In [None]:
# Re-read the cached lab events table written by the "Load data" cell.
labevents_csv = join(base_mimic, 'hosp', 'ClinicalBenchmark', 'labevents.csv')
lab_events_df = pd.read_csv(labevents_csv)

In [None]:
# Get average number of lab_events per hadm_id
lab_events_subset = lab_events_df[lab_events_df['hadm_id'].isin(app_hadm_ids) | lab_events_df['hadm_id'].isin(cholec_hadm_ids) | lab_events_df['hadm_id'].isin(pancr_hadm_ids) | lab_events_df['hadm_id'].isin(divert_hadm_ids)]
lab_events_subset.groupby('hadm_id').count().mean()

In [None]:
# Re-read the two cached radiology tables written by the "Load data" cell.
radiology_cache_dir = join(base_mimic, 'hosp', 'ClinicalBenchmark')
radiology_report_df = pd.read_csv(join(radiology_cache_dir, 'radiology_reports.csv'))
radiology_report_details_df = pd.read_csv(join(radiology_cache_dir, 'radiology_report_details.csv'))

In [None]:
# Get average number of radiology reports per hadm_id
radiology_report_subset = radiology_report_df[radiology_report_df['hadm_id'].isin(app_hadm_ids) | radiology_report_df['hadm_id'].isin(cholec_hadm_ids) | radiology_report_df['hadm_id'].isin(pancr_hadm_ids) | radiology_report_df['hadm_id'].isin(divert_hadm_ids)]
radiology_report_subset.groupby('hadm_id').count().mean()

#### Dilated Appendix

In [None]:
# Search the radiology reports of the appendicitis cohort for sentences that
# mention the appendix together with one of the terms in `actions`.
#   context: hadm_id -> last matching sentence (lower-cased)
#   count:   number of matching sentences over all reports
count = 0
context = {}
actions = ['dilat', 'enlarg', 'perforat', 'rupt', 'distend', 'fluid-filled', 'fluid filled', 'fluid collection']
for _id, hadm in app_hadm_info_firstdiag.items():
    for rad in hadm['Radiology']:
        # Flatten the report to one line, then split it into sentences on '. '.
        flat_report = rad['Report'].replace('\n', ' ')
        for sentence in (part.lower() for part in flat_report.split('. ')):
            if 'appendi' not in sentence:
                continue
            if any(action in sentence for action in actions):
                context[_id] = sentence
                count += 1
    if _id in context:
        continue
    # No match for this admission: print all of its reports for manual review.
    for rad in hadm['Radiology']:
        print(rad['Report'])
        print('---')
print(len(context))
print(len(app_hadm_info_firstdiag))
display(context)

#### Dataset Numbers

In [None]:
# Overall dataset numbers for the first-diagnosis cohorts: cohort sizes, lab and
# microbiology tests, radiology reports per "modality region", ICD procedures.
first_diag_sizes = {
    patho: len(id_difficulty[patho]['first_diag'])
    for patho in ['appendicitis', 'pancreatitis', 'cholecystitis', 'diverticulitis']
}
print(sum(first_diag_sizes.values()))

unique_lab, unique_microbio, unique_procedures = set(), set(), set()
total_lab = total_microbio = total_procedures = 0
radiology = {}
for patho, hadm_info in zip(['appendicitis', 'pancreatitis', 'cholecystitis', 'diverticulitis'], [app_hadm_info_firstdiag, pancr_hadm_info_firstdiag, cholec_hadm_info_firstdiag, divert_hadm_info_firstdiag]):
    for patient_id in id_difficulty[patho]['first_diag']:
        patient = hadm_info[patient_id]

        lab_item_ids = patient['Laboratory Tests'].keys()
        unique_lab.update(lab_item_ids)
        total_lab += len(lab_item_ids)

        microbio_item_ids = patient['Microbiology'].keys()
        unique_microbio.update(microbio_item_ids)
        total_microbio += len(microbio_item_ids)

        for r in patient['Radiology']:
            rad = r['Modality'] + ' ' + r['Region']
            radiology[rad] = radiology.get(rad, 0) + 1

        for procedures_key in ('Procedures ICD10', 'Procedures ICD9'):
            unique_procedures.update(patient[procedures_key])
            total_procedures += len(patient[procedures_key])
    print(f'{patho} {first_diag_sizes[patho]}')


print(f'{len(unique_lab)} unique lab tests')
print(f'{total_lab} total lab tests')

print(f'{len(unique_microbio)} unique microbiology tests')
print(f'{total_microbio} total microbiology tests')

# sort radiology by count
radiology = dict(sorted(radiology.items(), key=lambda item: item[1], reverse=True))
display(radiology)
# print total radiology count
total_rad = sum(radiology.values())
print(f'{total_rad} total radiology reports')

print(f'{len(unique_procedures)} unique procedures')
print(f'{total_procedures} total procedures')

#### Age and Sex

In [None]:
import pandas as pd
from os.path import join
# Placeholder for the folder that holds admissions.csv and patients.csv.
MIMIC_hosp_base = ''

# NOTE: admissions_df is rebound here to the table read from MIMIC_hosp_base.
admissions_csv = join(MIMIC_hosp_base, 'admissions.csv')
patients_csv = join(MIMIC_hosp_base, 'patients.csv')
admissions_df = pd.read_csv(admissions_csv)
patients_df = pd.read_csv(patients_csv)

In [None]:
import pandas as pd
from collections import Counter

def calculate_statistics(data):
    """Summarise age, gender and race over the records of one cohort.

    ``data`` maps an id to a dict with the keys 'Age', 'Gender' and 'Race'.

    Returns a 4-tuple:
      * median age,
      * ``(min_age, max_age)``,
      * dict gender -> share of records in percent,
      * dict race -> share of records in percent.
    """
    records = list(data.values())
    ages = [record['Age'] for record in records]
    n_records = len(records)

    def _percentages(attribute):
        # Share (in percent) of every distinct value of `attribute`.
        tally = Counter(record[attribute] for record in records)
        return {label: hits / n_records * 100 for label, hits in tally.items()}

    return (
        pd.Series(ages).median(),
        (min(ages), max(ages)),
        _percentages('Gender'),
        _percentages('Race'),
    )


# Define the order of genders and races
gender_order = ['F', 'M']
race_order = ['WHITE', 'BLACK', 'HISPANIC', 'ASIAN', 'OTHER']

# Maps the leading part of the admissions 'race' value (text before the first
# '-' or '/') to one of the five reported groups. The lookup below is strict:
# a value that is not listed here raises a KeyError.
race_dict = {
    "WHITE": "WHITE",
    "BLACK": "BLACK",
    "ASIAN": "ASIAN",
    "HISPANIC": "HISPANIC",
    "OTHER": "OTHER",
    "HISPANIC OR LATINO": "HISPANIC",
    "UNKNOWN": "OTHER",
    "UNABLE TO OBTAIN": "OTHER",
    "MULTIPLE RACE": "OTHER",
    "SOUTH AMERICAN": "OTHER",
    "AMERICAN INDIAN": "OTHER",
    "PORTUGUESE": "OTHER",
    "PATIENT DECLINED TO ANSWER": "OTHER",
}
dataset_stats = {}

full_dataset = [app_hadm_info_firstdiag, cholec_hadm_info_firstdiag, pancr_hadm_info_firstdiag, divert_hadm_info_firstdiag]

# Doctor-evaluation subsets: the 'dr_eval' ids looked up in the first-diagnosis cohorts.
app_hadm_info_dreval = {_id: app_hadm_info_firstdiag[_id] for _id in id_difficulty['appendicitis']['dr_eval']}
cholec_hadm_info_dreval = {_id: cholec_hadm_info_firstdiag[_id] for _id in id_difficulty['cholecystitis']['dr_eval']}
pancr_hadm_info_dreval = {_id: pancr_hadm_info_firstdiag[_id] for _id in id_difficulty['pancreatitis']['dr_eval']}
divert_hadm_info_dreval = {_id: divert_hadm_info_firstdiag[_id] for _id in id_difficulty['diverticulitis']['dr_eval']}

dreval_dataset = [app_hadm_info_dreval, cholec_hadm_info_dreval, pancr_hadm_info_dreval, divert_hadm_info_dreval]

# One LaTeX table block per group: first the full dataset, then the dr_eval subset.
for group in [full_dataset, dreval_dataset]:
    for patho, hadm_info in zip(['appendicitis', 'cholecystitis', 'pancreatitis', 'diverticulitis'],
                                group):
        dataset_stats[patho] = {}
        for _id in hadm_info.keys():
            # Select the admission row and the patient row once each instead of
            # re-scanning the tables for every single field.
            admission = admissions_df.loc[admissions_df['hadm_id'] == _id].iloc[0]
            subject_id = admission['subject_id']
            patient = patients_df.loc[patients_df['subject_id'] == subject_id].iloc[0]

            # Age at admission = anchor_age shifted by (admission year - anchor_year).
            admityear = admission['admittime'].split('-')[0]
            hadm_age = int(admityear) - int(patient['anchor_year']) + int(patient['anchor_age'])
            subject_gender = patient['gender']
            subject_race = race_dict[admission['race'].split('-')[0].split('/')[0].strip()]
            dataset_stats[patho][_id] = {
                "Age": hadm_age,
                "Gender": subject_gender,
                "Race": subject_race
            }

    # Calculate statistics for each disease
    stats = {disease: calculate_statistics(data) for disease, data in dataset_stats.items()}

    # Format for LaTeX table
    for disease, (age_median, age_range, gender_percentages, race_percentages) in stats.items():
        # Sort and format gender distribution ('\\%' emits the LaTeX-escaped percent
        # sign; the unescaped '\%' form is an invalid Python escape sequence).
        gender_distribution = ", ".join([f"{gender} ({gender_percentages.get(gender, 0):.1f}\\%)" for gender in gender_order])

        # Sort and format race distribution
        race_distribution = " \\newline ".join([f"{race} ({race_percentages.get(race, 0):.1f}\\%)" for race in race_order])

        print(f"{disease.capitalize()} & {age_median} & {age_range[0]}-{age_range[1]} & {gender_distribution} & {race_distribution} \\\\ \\hline")
    print()

## Other

#### Get diagnoses of patients with abdominal pain

In [None]:
# Find patients with abdominal pain in note text as chief complaint.
# The pattern is a raw string so that '\s' reaches the regex engine unchanged,
# and na=False treats notes without text as "no match" instead of producing a
# mask that contains missing values.
abd_complaint_df = discharge_df[
    discharge_df['text'].str.contains(r'chief complaint:\s*abdominal pain', case=False, na=False)
]

# Get all unique diagnoses for these patients
abd_complaint_diagnoses_df = diag_icd[diag_icd['hadm_id'].isin(abd_complaint_df['hadm_id'])]
abd_complaint_diagnoses_df.value_counts('long_title')

In [None]:
# Show the diagnosis rows whose long_title contains 'Inguinal hernia, with obstruction'.
hernia_title_mask = abd_complaint_diagnoses_df['long_title'].str.contains('Inguinal hernia, with obstruction')
abd_complaint_diagnoses_df[hernia_title_mask]

#### Create dataset of patients with abdominal pain but different final diagnosis

In [None]:
# Acute gastritis
gastritis_hadm_ids = extract_hadm_ids(
    'Acute gastritis',
    diag_icd,
    discharge_df,
    diag_counts=30,
    cc=10,
)
gastritis_hadm_info, gastritis_hadm_info_clean = extract_info(
    gastritis_hadm_ids,
    'gastritis',
    ['acute gastritis', 'gastritis'],
    discharge_df,
    admissions_df,
    transfers_df,
    lab_events_df,
    microbiology_df,
    radiology_report_df,
    radiology_report_details_df,
    diag_icd,
    procedures_df,
)

In [None]:
# Urinary tract infection
# (ids come from extract_hadm_ids_filter_cc here, not from extract_hadm_ids)
uti_hadm_ids = extract_hadm_ids_filter_cc(
    'Urinary tract infection',
    diag_icd,
    discharge_df,
    diag_counts=30,
    cc=10,
)
uti_hadm_info, uti_hadm_info_clean = extract_info(
    uti_hadm_ids,
    'urinary tract infection',
    ['urinary tract infection', 'uti'],
    discharge_df,
    admissions_df,
    transfers_df,
    lab_events_df,
    microbiology_df,
    radiology_report_df,
    radiology_report_details_df,
    diag_icd,
    procedures_df,
)

In [None]:
# Esophageal reflux
# (ids come from extract_hadm_ids_filter_cc here, not from extract_hadm_ids)
esophageal_reflux_hadm_ids = extract_hadm_ids_filter_cc(
    'Esophageal reflux',
    diag_icd,
    discharge_df,
    diag_counts=30,
    cc=10,
)
esophageal_reflux_hadm_info, esophageal_reflux_hadm_info_clean = extract_info(
    esophageal_reflux_hadm_ids,
    'esophageal reflux',
    ['esophageal reflux'],
    discharge_df,
    admissions_df,
    transfers_df,
    lab_events_df,
    microbiology_df,
    radiology_report_df,
    radiology_report_details_df,
    diag_icd,
    procedures_df,
)

In [None]:
# Inguinal hernia, with obstruction
hernia_hadm_ids = extract_hadm_ids(
    'Inguinal hernia, with obstruction',
    diag_icd,
    discharge_df,
    diag_counts=30,
    cc=10,
)
# The term list passed as third argument is empty for this cohort.
hernia_hadm_info, hernia_hadm_info_clean = extract_info(
    hernia_hadm_ids,
    'hernia',
    [],
    discharge_df,
    admissions_df,
    transfers_df,
    lab_events_df,
    microbiology_df,
    radiology_report_df,
    radiology_report_details_df,
    diag_icd,
    procedures_df,
)

#### Find patients in ED with abd pain but sent home

In [None]:
# Find patients with abdominal pain but sent home
base_ed = join(base_mimic, 'ed')
ed_diagnosis_df = pd.read_csv(join(base_ed, 'diagnosis.csv'))

# ED stays with a diagnosis code starting with '789' or 'R10'.
icd_code_text = ed_diagnosis_df['icd_code'].str
abdominal_pain_code_mask = icd_code_text.startswith('789') | icd_code_text.startswith('R10')
abdominal_pain_stays_ids = ed_diagnosis_df.loc[abdominal_pain_code_mask, 'stay_id'].unique()

edstays_df = pd.read_csv(join(base_ed, 'edstays.csv'))
abdominal_pain_stays_df = edstays_df[edstays_df['stay_id'].isin(abdominal_pain_stays_ids)]

# Of those stays, keep the ones with disposition 'HOME'.
sent_home_mask = abdominal_pain_stays_df['disposition'] == 'HOME'
abd_pain_home_stay_ids = abdominal_pain_stays_df.loc[sent_home_mask, 'stay_id']

# The number reported is the count of distinct stay_ids.
print('{} patients had abdominal pain but were sent home'.format(abd_pain_home_stay_ids.nunique()))

# Find supplementary info for patients with abdominal pain but sent home
triage_df = pd.read_csv(join(base_ed, 'triage.csv'))
triage_df = triage_df[triage_df['stay_id'].isin(abd_pain_home_stay_ids)]
triage_df

## Generate Lab Test Mapping

In [None]:
from dataset.labs import generate_lab_test_mapping
# NOTE(review): an earlier cell of this notebook calls
# generate_lab_test_mapping(MIMIC_hosp_base) with a base path, while this call
# passes no argument — confirm that the function provides a default.
generate_lab_test_mapping()