In [1]:
import pandas as pd
import sys
import os
import re 

sys.path.append("../scripts/mimic")

from mimic import get_stay_dict

In [2]:
# Root directory holding the MIMIC-IV tables and the ICD code tables.
# Can be overridden with the LLMTREE_DATA_DIR environment variable so the
# notebook also runs on machines where the data lives somewhere else; the
# default is the original hard-coded location.
data_dir = os.environ.get("LLMTREE_DATA_DIR", "/root/.data/llmtree/")
save_dir = "."
# Disabled call to the helper imported from ../scripts/mimic
# (presumably a one-off preprocessing step -- TODO confirm before re-enabling).
# get_stay_dict(os.path.join(data_dir,"mimic-iv-2.2", "hosp"), save_dir)

In [3]:
# load icd9/10 codes

# ICD-9 diagnoses: spreadsheet -> {diagnosis_code: long_description}
icd9_diagnosis = pd.read_excel(
    os.path.join(data_dir, "icdcodes", "icd9_diagnosis.xlsx")
)
icd9_diagnosis.columns = ["diagnosis_code", "long_description", "short_description"]
icd9_diagnosis_dict = dict(
    zip(icd9_diagnosis["diagnosis_code"], icd9_diagnosis["long_description"])
)

# ICD-9 procedures: spreadsheet -> {procedure_code: long_description}
icd9_procedure = pd.read_excel(
    os.path.join(data_dir, "icdcodes", "icd9_procedure.xlsx")
)
icd9_procedure.columns = ["procedure_code", "long_description", "short_description"]
# the codes are used as string keys, so cast the column to str
icd9_procedure["procedure_code"] = icd9_procedure["procedure_code"].astype(str)
icd9_procedure_dict = dict(
    zip(icd9_procedure["procedure_code"], icd9_procedure["long_description"])
)

# ICD-10 diagnoses: fixed-width text file -> {code: description}
icd10_diagnosis_dict = {}
with open(os.path.join(data_dir, "icdcodes", "icd10_diagnosis.txt"), "r") as file:
    # iterate over the file lazily instead of materialising every line first
    for line in file:
        # a blank line (e.g. at the end of the file) carries no code
        if not line.strip():
            continue
        # only up to the first 8 characters in the line are the codes
        code = line[:8].strip()
        # strip all surrounding whitespace at once, so trailing spaces that
        # sit in front of the newline do not end up in the description
        description = line[8:].strip()
        icd10_diagnosis_dict[code] = description

# ICD-10 procedures: fixed-width text file -> {code: long description}

# Leading part of every line: a 5-digit number, the code, a 0/1 flag and the
# start of the description text. Raw string so that \d and \s reach the regex
# engine unchanged; compiled once instead of once per line.
_ICD10_PROCEDURE_PREFIX = re.compile(r"^\d{5}\s+([A-Za-z0-9]+)\s+[01]\s+.+")

icd10_procedure_dict = {}
with open(os.path.join(data_dir, "icdcodes", "icd10_procedure.txt"), "r") as file:
    for line_number, line in enumerate(file, start=1):
        # a blank line (e.g. at the end of the file) carries no code
        if not line.strip():
            continue
        match = _ICD10_PROCEDURE_PREFIX.match(line)
        if match is None:
            # fail with the offending line instead of an AttributeError on None
            raise ValueError(
                f"icd10_procedure.txt line {line_number} is not in the expected format: {line!r}"
            )
        code = match.group(1)
        # there is a strict separation between descriptions after 77 characters,
        # so take the second description by position; unlike splitting on runs
        # of whitespace this cannot break inside a description that itself
        # contains consecutive spaces
        description = line[77:].strip()
        icd10_procedure_dict[code] = description

In [4]:
# load data and match icd codes to descriptions
processed_mimic_df = pd.read_csv(
    os.path.join(data_dir, "mimic-iv-2.2", "hosp", "processed_mimic_data.csv")
)

# filled by diagnosis_to_description with every code it finds no description for
untracked_codes = set()
def _lookup_icd_description(icd_dict, code_number):
    """Return the description stored for ``code_number`` in ``icd_dict``.

    The exact code is tried first, then the code with '0' appended, then the
    code with '1' appended. Raises ``KeyError`` if none of the three is a key.
    """
    for candidate in (code_number, code_number + '0', code_number + '1'):
        if candidate in icd_dict:
            return icd_dict[candidate]
    raise KeyError(code_number)


def diagnosis_to_description(codes, code_type='diagnosis'):
    """Map a ' <sep> '-separated string of ICD codes to their long descriptions.

    Parameters
    ----------
    codes : str
        Codes of the form ``ICD9_<code>`` / ``ICD10_<code>`` joined by
        ``' <sep> '``. ``None``, NaN and empty strings yield an empty list.
    code_type : {'diagnosis', 'procedure'}
        Selects which pair of module-level lookup dictionaries is used.

    Returns
    -------
    list
        One description per code that could be resolved, in input order.
        Codes without a description are skipped and recorded in the
        module-level ``untracked_codes`` set.

    Raises
    ------
    ValueError
        If ``code_type`` is unknown, a code cannot be parsed, or a code names
        an ICD version other than 9 or 10.
    """
    if code_type == 'diagnosis':
        dict_by_version = {9: icd9_diagnosis_dict, 10: icd10_diagnosis_dict}
    elif code_type == 'procedure':
        dict_by_version = {9: icd9_procedure_dict, 10: icd10_procedure_dict}
    else:
        raise ValueError(f"code type {code_type} not understood")

    # a missing cell arrives as None or as a float NaN (NaN != NaN)
    if codes is None or (isinstance(codes, float) and codes != codes):
        return []

    descriptions = []
    for icd_code in codes.split(' <sep> '):
        # an empty input string (or a dangling separator) yields empty pieces
        if not icd_code.strip():
            continue

        parsed = re.search(r"ICD([0-9]+)_(.*)", icd_code)
        if parsed is None:
            raise ValueError(f"could not parse ICD code {icd_code!r}")
        # kept separate from the ``code_type`` argument on purpose: this is the
        # ICD revision (9 / 10), not the diagnosis / procedure switch
        icd_version = parsed.group(1)
        code_number = parsed.group(2)

        if int(icd_version) not in dict_by_version:
            print(icd_code)
            print(icd_version, code_number)
            raise ValueError("code type not understood")

        try:
            description = _lookup_icd_description(
                dict_by_version[int(icd_version)], code_number
            )
        except KeyError:
            # no description under any of the tried spellings
            untracked_codes.add(icd_code)
            continue
        descriptions.append(description)

    return descriptions

# one list of long descriptions per row, for the diagnosis and the procedure codes
processed_mimic_df["diagnoses_long_description"] = processed_mimic_df["diagnoses"].apply(
    diagnosis_to_description, code_type='diagnosis'
)
processed_mimic_df["procedure_long_description"] = processed_mimic_df["procedure"].apply(
    diagnosis_to_description, code_type='procedure'
)

In [7]:
def find_empty_rows(row):
    """Return 1 if the row's ``procedure_long_description`` is empty, else 0."""
    has_no_descriptions = len(row.procedure_long_description) == 0
    return 1 if has_no_descriptions else 0

# fraction of rows with no procedure description
# (mean of the 0/1 flags == number of flagged rows / number of rows)
processed_mimic_df.apply(find_empty_rows, axis=1).mean()

0.027761917393122237

In [5]:
# add in discharge notes
# Only the admission id and the note text are used, so parse just those two
# columns instead of the whole file, and build the renamed frame in one chain
# rather than mutating it in place (re-running the cell gives the same result).
discharge_df = (
    pd.read_csv(
        os.path.join(data_dir, "mimic-iv-note-2.2", "note", "discharge.csv"),
        usecols=["hadm_id", "text"],
    )[["hadm_id", "text"]]  # usecols keeps file order; fix the column order explicitly
    .rename(columns={"text": "discharge_notes"})
)

In [6]:
# inner join on hadm_id: only rows whose hadm_id occurs in both frames are kept
df = pd.merge(processed_mimic_df, discharge_df, on='hadm_id', how='inner')

In [7]:
# preview the first five merged rows
df.head(n=5)

Unnamed: 0,subject_id,hadm_id,admittime,real_admit_year,age,gender,mortality,readmission,diagnoses,procedure,diagnoses_long_description,procedure_long_description,discharge_notes
0,10000032,22595853,2180-05-06,2014,52,F,0,0,ICD9_5723 <sep> ICD9_78959 <sep> ICD9_5715 <se...,ICD9_5491,"[Portal hypertension, Other ascites, Cirrhosis...",[Percutaneous abdominal drainage],\nName: ___ Unit No: _...
1,10000032,22841357,2180-06-26,2014,52,F,0,0,ICD9_07071 <sep> ICD9_78959 <sep> ICD9_2875 <s...,ICD9_5491,[Unspecified viral hepatitis C with hepatic co...,[Percutaneous abdominal drainage],\nName: ___ Unit No: _...
2,10000032,25742920,2180-08-05,2014,52,F,0,1,ICD9_07054 <sep> ICD9_78959 <sep> ICD9_V462 <s...,ICD9_5491,[Chronic hepatitis C without mention of hepati...,[Percutaneous abdominal drainage],\nName: ___ Unit No: _...
3,10000117,27988844,2183-09-18,2017,57,F,0,0,ICD10_S72012A <sep> ICD10_W010XXA <sep> ICD10_...,ICD10_0QS734Z,[Unspecified intracapsular fracture of left fe...,[Reposition Left Upper Femur with Internal Fix...,\nName: ___ Unit No: ___\n...
4,10000560,28979390,2189-10-15,2008,53,F,0,0,ICD9_1890 <sep> ICD9_V1582 <sep> ICD9_V1201,ICD9_5551,"[Malignant neoplasm of kidney, except pelvis, ...",[Nephroureterectomy],\nName: ___ Unit No: _...


In [9]:
# show the full discharge note attached to the first merged row
print(df["discharge_notes"].iloc[0])

 
Name:  ___                     Unit No:   ___
 
Admission Date:  ___              Discharge Date:   ___
 
Date of Birth:  ___             Sex:   F
 
Service: MEDICINE
 
Allergies: 
No Known Allergies / Adverse Drug Reactions
 
Attending: ___
 
Chief Complaint:
Worsening ABD distension and pain 
 
Major Surgical or Invasive Procedure:
Paracentesis

 
History of Present Illness:
___ HCV cirrhosis c/b ascites, hiv on ART, h/o IVDU, COPD, 
bioplar, PTSD, presented from OSH ED with worsening abd 
distension over past week.  
Pt reports self-discontinuing lasix and spirnolactone ___ weeks 
ago, because she feels like "they don't do anything" and that 
she "doesn't want to put more chemicals in her." She does not 
follow Na-restricted diets. In the past week, she notes that she 
has been having worsening abd distension and discomfort. She 
denies ___ edema, or SOB, or orthopnea. She denies f/c/n/v, d/c, 
dysuria. She had food poisoning a week ago from eating stale 
cake (n/v 20 min after fo