In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the three raw tables used to build the cohort: admissions, per-admission
# ICD-9 diagnoses, and free-text clinical notes.
df_admissions = pd.read_csv('./data/ADMISSIONS.csv')

# ICD-9 codes are identifiers, not numbers. Force them to be read as strings so
# that type inference can never turn e.g. '4280' into the integer 4280 (or strip
# a leading zero), which would make the later string-based `isin` filter
# silently miss rows.
df_diagnosis = pd.read_csv('./data/DIAGNOSES_ICD.csv', dtype={'ICD9_CODE': str})

# Only these four columns of the notes table are used downstream (the next cell
# keeps exactly this subset), so skip the others at parse time to cut memory.
# Everything is read as text; the ID columns are cast to integers afterwards,
# once rows without an admission ID have been dropped.
df_notes = pd.read_csv(
    './data/NOTEEVENTS.csv',
    dtype=str,
    usecols=['SUBJECT_ID', 'HADM_ID', 'CATEGORY', 'TEXT'],
)

In [3]:
# Keep only notes that are attached to a hospital admission, restrict the frame
# to the columns needed later, and turn the two ID columns (read as text) into
# 64-bit integers so they can be used as merge keys.
df_notes = (
    df_notes
    .loc[df_notes['HADM_ID'].notna(), ['SUBJECT_ID', 'HADM_ID', 'CATEGORY', 'TEXT']]
    .astype({'HADM_ID': np.int64, 'SUBJECT_ID': np.int64})
)

In [4]:
# Reduce the admissions table to the merge keys and the two timestamps that the
# readmission labels are computed from.
admission_columns = ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME']
df_admissions = df_admissions[admission_columns]

In [5]:
# ICD-9 codes (written without the decimal point) that define the cohort.
# NOTE(review): these look like the heart-failure code set (the result is named
# `df_diagnosis_hf`) - confirm against the study protocol.
include_codes = [
    '39891',
    '40201', '40211', '40291',
    '40401', '40403', '40411', '40413', '40491', '40493',
    '4280', '4281',
    '42820', '42821', '42822', '42823',
    '42830', '42831', '42832', '42833',
    '42840', '42841', '42842', '42843',
    '4289',
]

# One row per (admission, matching code); an admission with several matching
# codes therefore appears more than once here.
is_included_code = df_diagnosis['ICD9_CODE'].isin(include_codes)
df_diagnosis_hf = df_diagnosis.loc[is_included_code, ['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']]

In [6]:
# Build the modelling table: one row per cohort admission, carrying its longest
# discharge summary plus three binary readmission labels.
#
# Labels describe the *next* admission of the same patient within this cohort
# table (not any admission in the hospital):
#   GEN_RE         - the patient has a later admission in the table
#   30_RE_BYADMIT  - ... whose admit time is within the window of this admit time
#   30_RE_BYDISCH  - ... whose admit time is within the window of this discharge
READMISSION_WINDOW_DAYS = 30

# Filter to discharge summaries *before* merging so the join does not have to
# materialise every other note category first.
discharge_notes = df_notes.loc[df_notes['CATEGORY'] == 'Discharge summary']

# An admission may carry several cohort codes; collapsing to one row per
# admission avoids multiplying every note by the number of matching codes.
cohort_admissions = df_diagnosis_hf.drop_duplicates(subset=['SUBJECT_ID', 'HADM_ID'])

df_data = cohort_admissions.merge(discharge_notes, how='inner', on=['HADM_ID', 'SUBJECT_ID'])

# When an admission has more than one discharge summary, keep the longest one.
df_data = df_data.assign(TEXT_LEN=df_data['TEXT'].str.len())
df_data = df_data.loc[df_data.groupby('HADM_ID')['TEXT_LEN'].idxmax(), :]

df_data = df_data.merge(df_admissions, how='inner', on=['HADM_ID', 'SUBJECT_ID'])

# Parse the timestamps first so the chronological sort below operates on real
# datetimes rather than on their string representation.
df_data = df_data.assign(
    ADMITTIME=pd.to_datetime(df_data['ADMITTIME']),
    DISCHTIME=pd.to_datetime(df_data['DISCHTIME']),
)
df_data = df_data.sort_values(by=['SUBJECT_ID', 'ADMITTIME']).reset_index(drop=True)

# After sorting, a row's successor belongs to the same patient exactly when it
# is that patient's next admission; the last row has no successor (NaN != id).
has_next_admission = df_data['SUBJECT_ID'].eq(df_data['SUBJECT_ID'].shift(-1))
next_admit_time = df_data['ADMITTIME'].shift(-1).where(has_next_admission)

# `.dt.days` floors to whole days, so a gap of e.g. 30 days 23 hours still
# counts as 30. Rows without a next admission yield NaN, and NaN <= window is
# False, which leaves their labels at 0.
days_re_admit = (next_admit_time - df_data['ADMITTIME']).dt.days
days_re_disch = (next_admit_time - df_data['DISCHTIME']).dt.days

df_data['GEN_RE'] = has_next_admission.astype(np.int64)
df_data['30_RE_BYADMIT'] = (days_re_admit <= READMISSION_WINDOW_DAYS).astype(np.int64)
df_data['30_RE_BYDISCH'] = (days_re_disch <= READMISSION_WINDOW_DAYS).astype(np.int64)

df_data = df_data.loc[:, ['SUBJECT_ID', 'HADM_ID', 'TEXT', 'ADMITTIME', 'DISCHTIME', 'GEN_RE', '30_RE_BYADMIT', '30_RE_BYDISCH']]

In [8]:
# Report the cohort size and how many rows carry each readmission label.
# The 30-day figure uses the discharge-based label (`30_RE_BYDISCH`).
summary_lines = [
    ('Total number of datapoints: {}', df_data.shape[0]),
    ('Total number of general readmission samples: {}', df_data['GEN_RE'].sum()),
    ('Total number of 30-day readmission samples: {}', df_data['30_RE_BYDISCH'].sum()),
]
for template, value in summary_lines:
    print(template.format(value))

Total number of datapoints: 13755
Total number of general readmission samples: 3503
Total number of 30-day readmission samples: 942
