In [1]:
import pandas as pd
import numpy as np
from gensim.parsing.preprocessing import remove_stopwords
import nltk

# Fetch NLTK's "punkt_tab" resource into the local nltk_data directory. The
# captured log below shows the call is a no-op when the package is already
# up to date.
# NOTE(review): no cell visible in this notebook uses nltk after this call —
# confirm a later step actually needs the resource, otherwise drop it.
nltk.download("punkt_tab")

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\Steven\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


True

In [2]:
# Root folder for every CSV this notebook reads or writes. The trailing slash
# is required because file paths are built by plain string concatenation.
data_path = './data/'

admissions_csv = data_path + 'ADMISSIONS.csv'
diagnoses_csv = data_path + 'DIAGNOSES_ICD.csv'
notes_csv = data_path + 'NOTEEVENTS.csv'

df_admissions = pd.read_csv(admissions_csv)
df_diagnosis = pd.read_csv(diagnoses_csv)
# Every NOTEEVENTS column is read as text; the ID columns are cast to integers
# in a later cell once rows without an admission ID have been dropped.
df_notes = pd.read_csv(notes_csv, dtype='unicode')

In [3]:
# ICD-9 codes (kept as strings) that qualify an admission for the cohort.
# Presumably the heart-failure code set, judging by the `_hf` suffix below.
include_codes = [
    '39891',
    '40201', '40211', '40291',
    '40401', '40403', '40411', '40413', '40491', '40493',
    '4280', '4281',
    '42820', '42821', '42822', '42823',
    '42830', '42831', '42832', '42833',
    '42840', '42841', '42842', '42843',
    '4289',
]

has_included_code = df_diagnosis['ICD9_CODE'].isin(include_codes)

# Keep a single row per (patient, admission): when an admission carries several
# qualifying codes only the first one encountered is retained.
df_diagnosis_hf = (
    df_diagnosis
    .loc[has_included_code, ['SUBJECT_ID', 'HADM_ID', 'ICD9_CODE']]
    .drop_duplicates(subset=['SUBJECT_ID', 'HADM_ID'])
)

In [4]:
# Notes without an admission ID cannot be joined to an admission, so drop them
# first; only then can the (text-typed) ID columns be cast to integers.
linked_to_admission = df_notes['HADM_ID'].notna()
df_notes = (
    df_notes
    .loc[linked_to_admission, ['SUBJECT_ID', 'HADM_ID', 'CATEGORY', 'TEXT']]
    .astype({'HADM_ID': np.int64, 'SUBJECT_ID': np.int64})
)

# Restrict to discharge summaries and record each note's character count.
df_notes = (
    df_notes[df_notes['CATEGORY'].eq('Discharge summary')]
    .assign(TEXT_LEN=lambda notes: notes['TEXT'].map(len))
)

# An admission may have several discharge summaries; keep only the longest one.
longest_note_idx = df_notes.groupby(['SUBJECT_ID', 'HADM_ID'])['TEXT_LEN'].idxmax()
df_notes = df_notes.loc[longest_note_idx, :]

In [5]:
# Keep only the join keys and the two timestamps, parsing the timestamps into
# datetimes so they can be sorted and subtracted later on.
df_admissions = (
    df_admissions
    .loc[:, ['SUBJECT_ID', 'HADM_ID', 'ADMITTIME', 'DISCHTIME']]
    .assign(
        ADMITTIME=lambda adm: pd.to_datetime(adm['ADMITTIME']),
        DISCHTIME=lambda adm: pd.to_datetime(adm['DISCHTIME']),
    )
)

In [6]:
# One row per cohort admission: the diagnosis row, its discharge summary (if
# any) and the admission/discharge timestamps. Left joins keep admissions that
# have no matching note; those rows simply carry a missing TEXT.
# Rows are then ordered chronologically within each patient so that "the next
# row" is the patient's next cohort admission.
df_data = (
    df_diagnosis_hf
    .merge(df_notes, how='left', on=['HADM_ID', 'SUBJECT_ID'])
    .merge(df_admissions, how='left', on=['HADM_ID', 'SUBJECT_ID'])
    .sort_values(by=['SUBJECT_ID', 'ADMITTIME'])
    .reset_index(drop=True)
)

# Vectorised replacement for a row-by-row loop: compare every row with the one
# that follows it. The last row has no successor (shift yields NaN/NaT), so it
# can never be flagged.
next_subject = df_data['SUBJECT_ID'].shift(-1)
next_admit = df_data['ADMITTIME'].shift(-1)
has_later_admission = df_data['SUBJECT_ID'].eq(next_subject)

# Whole days between this discharge and the next admission. `.dt.days` floors
# the gap, so e.g. 30 days 23 hours still counts as 30. Missing timestamps give
# NaN, which compares False below.
# NOTE(review): a negative gap (next admission starts before this discharge)
# also satisfies `<= 30` and is therefore labelled a 30-day readmission —
# confirm that overlapping stays should be counted this way.
days_to_next = (next_admit - df_data['DISCHTIME']).dt.days
readmitted_within_30 = has_later_admission & days_to_next.le(30)

# GEN_RE: the patient has any later cohort admission.
# 30_RE:  that later admission starts within 30 days of this discharge.
df_data['GEN_RE'] = has_later_admission.astype('int64')
df_data['30_RE'] = readmitted_within_30.astype('int64')

print('Total number of admissions: {}'.format(df_data.shape[0]))
print('Total number of general readmissions: {}'.format(df_data['GEN_RE'].sum()))
print('Total number of 30-day readmissions: {}'.format(df_data['30_RE'].sum()))

Total number of admissions: 14040
Total number of general readmissions: 3604
Total number of 30-day readmissions: 969


In [7]:
# The readmission labels were computed on every cohort admission; only now are
# admissions without a discharge summary dropped, along with the helper columns.
has_summary = df_data['TEXT'].notna()
kept_columns = ['SUBJECT_ID', 'HADM_ID', 'TEXT', 'GEN_RE', '30_RE']
df_data = df_data.loc[has_summary, kept_columns]

n_admissions = df_data.shape[0]
n_general_readmissions = df_data['GEN_RE'].sum()
n_30_day_readmissions = df_data['30_RE'].sum()

print('Total number of admissions with discharge summaries: {}'.format(n_admissions))
print('Total number of general readmissions with discharge summaries: {}'.format(n_general_readmissions))
print('Total number of 30-day readmissions with discharge summaries: {}'.format(n_30_day_readmissions))

Total number of admissions with discharge summaries: 13755
Total number of general readmissions with discharge summaries: 3544
Total number of 30-day readmissions with discharge summaries: 963


In [8]:
# Text normalisation pipeline; the steps run in this exact order:
#   1. delete every character that is not an ASCII letter, a digit or whitespace
#   2. delete the remaining runs of digits
#   3. lower-case
#   4. strip stopwords with gensim's `remove_stopwords`
# Deleted characters are replaced by nothing (not by a space), so tokens joined
# only by punctuation are fused together.
clean_text = (
    df_data['TEXT']
    .str.replace(r'[^A-Za-z0-9\s]', '', regex=True)
    .str.replace(r'\d+', '', regex=True)
    .str.lower()
    .apply(remove_stopwords)
)
df_data['CLEAN_TEXT'] = clean_text

In [9]:
# Persist the labelled, cleaned dataset next to the inputs. The row index is
# not written, so the gaps left by earlier filtering do not reach the file.
processed_csv = data_path + 'data_processed.csv'
df_data.to_csv(processed_csv, index=False)

# Preview the first rows as the cell's displayed output.
df_data.head(20)

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,GEN_RE,30_RE,CLEAN_TEXT
0,3,145834,Admission Date: [**2101-10-20**] Discharg...,0,0,admission date discharge date date birth sex m...
1,9,150750,Admission Date: [**2149-11-9**] Dischar...,0,0,admission date discharge date date birth sex m...
2,21,109451,Admission Date: [**2134-9-11**] ...,0,0,admission date discharge date service medicine...
3,26,197661,Admission Date: [**2126-5-6**] Discharge ...,0,0,admission date discharge date date birth sex m...
4,30,104557,Admission Date: [**2172-10-14**] Discha...,0,0,admission date discharge date service hospital...
6,34,144319,Admission Date: [**2191-2-23**] ...,0,0,admission date discharge date service medicine...
7,37,188670,Admission Date: [**2183-8-21**] Discharge...,0,0,admission date discharge date date birth sex m...
8,38,185910,Admission Date: [**2166-8-10**] ...,0,0,admission date discharge date date birth sex m...
9,42,119203,Admission Date: [**2116-4-29**] Dischar...,0,0,admission date discharge date date birth sex m...
10,49,190539,Admission Date: [**2186-11-21**] ...,0,0,admission date discharge date service cardioth...
