In [13]:
import pandas as pd
from scipy import stats
DATA_DIR = 'data/mimic/'
CODELIST_DIR = 'codelists/output/omop/'

In [14]:
# Specify which diagnoses we're looking for in the MIMIC dataset
# [codelist name, ICD version, ICD code]
target_diagnoses = [['colorectal', 9, '153'],
                    ['colorectal', 10, 'C18'],
                    ['lung',       9, '162'],
                    ['lung',       10, 'C33'],
                    ['lung',       10, 'C34'] ]

# Create a dictionary mapping ICD codes to descriptions
icd_dictionary = pd.read_csv(DATA_DIR + 'd_icd_procedures.csv')
icd_dictionary

icd_to_text = dict(zip(icd_dictionary['icd_code'],icd_dictionary['long_title']))

In [15]:
diagnoses = pd.read_csv(DATA_DIR + 'diagnoses_icd.csv')

In [36]:
# Specify which codelists to compare
codelist_names=['colorectal','lung']

# Read in MIMIC procedures
procedures = pd.read_csv(DATA_DIR + 'procedures_icd.csv')
# Remove any cases of the same ICD code twice for one patient
procedures = procedures[ ~ procedures[['subject_id', 'icd_code']].duplicated() ]
procedures.head()

all_patients = list( procedures.subject_id.unique() )


for c in codelist_names:
    # Read in ICD9 and ICD10 codelists
    icd9_codelist = pd.read_csv(CODELIST_DIR + 'ukquery_'+c+'_0_snomed_icd9.csv')
    icd10_codelist = pd.read_csv(CODELIST_DIR + 'ukquery_'+c+'_0_snomed_icd10.csv')
    
    # remove dots from icd9 codes to match MIMIC format
    icd9_codelist['icd'] = [str(i).replace('.','') for i in icd9_codelist['icd']]

    print('Codelist:',c)
    print('  Imported',len(icd9_codelist),' ICD-9 Vol3 codes')
    print('  Imported',len(icd10_codelist),' ICD-10-PCS codes')


    # Find patients in the MIMIC diagnoses table with our target diagnoses
    found_diagnoses_ix = diagnoses['icd_code']==False
    for d in target_diagnoses:
        if d[0]==c:
            found_diagnoses_ix = found_diagnoses_ix | ( (diagnoses['icd_version']==d[1]) & (diagnoses['icd_code'].str.startswith(d[2])) ) 

    found_diagnoses_ix
    found_diagnoses = diagnoses[found_diagnoses_ix]
    case_patients = list( found_diagnoses['subject_id'].unique() )
    print('  Found',len(case_patients),'patients in MIMIC')






    case_patient_procedures = procedures[ procedures['subject_id'].isin(case_patients) ]
    print('   having',len(case_patient_procedures),'procedures')

    #icd_code_frequency = colon_patient_procedures['icd_code'].value_counts()


    # Make a list of control patients (being any patient without the target diagnoses) and their procedures
    control_patients = [p for p in all_patients if p not in case_patients]
    control_patient_procedures = procedures[procedures['subject_id'].isin(control_patients)]


    # Get the number of patients with each code in the case and control cohorts
    # We removed duplicate codes within each patient earlier so number of patients with code X == code X's original count
    case_counts    = pd.DataFrame( case_patient_procedures['icd_code'].value_counts() ).reset_index()
    control_counts = pd.DataFrame( control_patient_procedures['icd_code'].value_counts() ).reset_index()

    freq_table = control_counts.merge(case_counts, on='icd_code', suffixes=('_control','_case'), how='outer').fillna(0)

    freq_table['description'] = [icd_to_text.get(i) for i in freq_table['icd_code']]

    # Get the number of patients without each code
    freq_table['control_without'] = len(control_patients) - freq_table['count_control']
    freq_table['case_without'] = len(case_patients) - freq_table['count_case']

    # Calculate prevalence (% of patients with code X)
    freq_table['control_prevalence'] = ( freq_table['count_control'] / len(control_patients) ) *100
    freq_table['case_prevalence'] = ( freq_table['count_case'] / len(case_patients) ) *100

    # Calculate an odds ratio
    freq_table['odds_ratio'] = ( freq_table['count_case'] * freq_table['control_without'] ) / ( freq_table['case_without'] * freq_table['count_control'] )
    
    # Check if this code is in our generated codelist
    freq_table['in_codelist'] = freq_table['icd_code'].isin(icd9_codelist['icd']) | freq_table['icd_code'].isin(icd10_codelist['icd'])

    # Perform Fisher's exact test for each code
    test_results = dict()
    for index,row in freq_table.iterrows():
        table = [[row.count_case, row.case_without], [row.count_control, row.control_without]]

        fisher = stats.fisher_exact(table, alternative='two-sided')

        test_results[index] = fisher
    freq_table['fisher_exact'] = [i[1] for i in test_results.values()]

    display(freq_table.head())

    # Apply filters
    correlated_codes = freq_table
    correlated_codes = freq_table[ freq_table['odds_ratio']!=float('inf') ] # Ignore nans
    correlated_codes = correlated_codes[ correlated_codes['odds_ratio']>1 ] # Only codes that are more common in the case cohort
    correlated_codes = correlated_codes[ correlated_codes['count_case']>10 ] # and occur 10+ times in the case cohort
    correlated_codes = correlated_codes[ correlated_codes['in_codelist']==False ] # and aren't in the original codelist
    correlated_codes = correlated_codes[ correlated_codes['fisher_exact']<0.05 ] # and p<0.05

    # Save to csv
    correlated_codes.to_csv('correlated_codes_'+c+'.csv')

Codelist: colorectal
  Imported 121  ICD-9 Vol3 codes
  Imported 3151  ICD-10-PCS codes
  Found 1046 patients in MIMIC
   having 6419 procedures


Unnamed: 0,icd_code,count_control,count_case,description,control_without,case_without,control_prevalence,case_prevalence,odds_ratio,in_codelist,fisher_exact
0,3893,9574.0,97.0,"Venous catheterization, not elsewhere classified",111332.0,949.0,7.918548,9.273423,1.18859,False,0.107567
1,8938,8350.0,38.0,Other nonoperative respiratory measurements,112556.0,1008.0,6.906192,3.632887,0.508166,False,9e-06
2,02HV33Z,7769.0,88.0,Insertion of Infusion Device into Superior Ven...,113137.0,958.0,6.425653,8.413002,1.337694,False,0.011306
3,8856,7398.0,41.0,Coronary arteriography using two catheters,113508.0,1005.0,6.118803,3.919694,0.625936,False,0.002231
4,3897,6952.0,73.0,Central venous catheter placement with guidance,113954.0,973.0,5.749921,6.978967,1.229787,False,0.095265


Codelist: lung
  Imported 70  ICD-9 Vol3 codes
  Imported 3368  ICD-10-PCS codes
  Found 3207 patients in MIMIC
   having 17684 procedures


Unnamed: 0,icd_code,count_control,count_case,description,control_without,case_without,control_prevalence,case_prevalence,odds_ratio,in_codelist,fisher_exact
0,3893,9386.0,285.0,"Venous catheterization, not elsewhere classified",109613.0,2922.0,7.887461,8.88681,1.139059,False,0.03987583
1,8938,8268.0,120.0,Other nonoperative respiratory measurements,110731.0,3087.0,6.947958,3.741815,0.520611,False,2.089221e-14
2,02HV33Z,7633.0,224.0,Insertion of Infusion Device into Superior Ven...,111366.0,2983.0,6.41434,6.984721,1.0956,False,0.201502
3,8856,7320.0,119.0,Coronary arteriography using two catheters,111679.0,3088.0,6.151312,3.710633,0.587936,False,1.483287e-09
4,3897,6778.0,247.0,Central venous catheter placement with guidance,112221.0,2960.0,5.695846,7.701902,1.381586,False,3.822346e-06
