In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pprint
import os
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn import metrics

# Raise both display limits to 100000 so long cell values and long frames are
# not truncated in rendered output; see
# https://stackoverflow.com/questions/54692405/output-truncation-in-google-colab
for display_option in ('display.max_colwidth', 'display.max_rows'):
    pd.set_option(display_option, 100000)

# Below imports are used to print out pretty pandas dataframes
from IPython.display import display, HTML

In [None]:
# Directory holding the input tables, resolved relative to the notebook's
# working directory. Change it here instead of editing every read call.
DATA_DIR = 'Downloads'

hosp_mort = pd.read_csv(os.path.join(DATA_DIR, 'hosp_mort.csv'))
lomv = pd.read_csv(os.path.join(DATA_DIR, 'lomv.csv'))
# NOTE(review): unpickling can execute arbitrary code stored in the file; only
# load basic_table.pkl when it comes from a trusted source.
basic_table = pd.read_pickle(os.path.join(DATA_DIR, 'basic_table.pkl'))
icu_detail = pd.read_csv(os.path.join(DATA_DIR, 'icu_detail.csv'))
diagnoses = pd.read_csv(os.path.join(DATA_DIR, 'diagnoses.csv'))

In [None]:
from datetime import datetime
def convert_time(x):
  """Parse a 'YYYY-MM-DD HH:MM:SS' timestamp string into a datetime.

  A string that ends in a dangling ':' (seconds missing, e.g.
  '2150-03-01 08:15:') is completed with '00' seconds before parsing.
  A value that is already a datetime instance is returned unchanged, so
  applying the function to an already-converted column is a no-op.

  Args:
    x: timestamp string, or a datetime instance.

  Returns:
    The parsed (or passed-through) datetime.

  Raises:
    TypeError: if x is neither a str nor a datetime (e.g. a missing value).
    ValueError: if the string, after the optional fix-up, does not match
      '%Y-%m-%d %H:%M:%S'; this includes the empty string.
  """
  if isinstance(x, datetime):
    return x
  if not isinstance(x, str):
    raise TypeError(
        'convert_time expects a str or datetime, got ' + type(x).__name__)
  date_time_str = x
  # endswith() is safe on '' whereas indexing with [-1] raised IndexError.
  if date_time_str.endswith(':'):
    date_time_str += '00'
  return datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')

In [None]:
basic_table.columns.values

In [None]:
hosp_mort.columns.values

In [None]:
lomv.columns.values

In [None]:
hosp_mort = hosp_mort.drop(['Unnamed: 0'], axis=1)

In [None]:
mort = basic_table.merge(hosp_mort, how='inner', on='hadm_id')
print(len(mort))

In [None]:
lomv = lomv.drop(['Unnamed: 0'], axis=1)
lomv.starttime = lomv.starttime.apply(convert_time)
lomv.endtime = lomv.endtime.apply(convert_time)
lomv['lomv'] = lomv['endtime'] - lomv['starttime']
lomv.head(5)

In [None]:
lomv['lomv'] = lomv['lomv'] / pd.Timedelta(minutes=1)
lomv.head(5)

In [None]:
lomv = lomv.drop(['endtime'], axis=1)
time = pd.DataFrame({'stay_id':basic_table['stay_id'], 'pO2_time':basic_table['pO2_time']})
print(type(time['pO2_time'].iloc[0]))

In [None]:
time.pO2_time = time.pO2_time.apply(convert_time)

In [None]:
lmo = lomv.merge(time, how='inner', on='stay_id')
lmo = lmo.drop(['starttime', 'pO2_time'], axis=1)
print(len(lmo))
lmo.head(5)

In [None]:
lmo = lmo.drop_duplicates()
print(len(lmo))
lmo.head(100)

In [None]:
print(lmo['stay_id'].nunique())

In [None]:
lmo['lomv'] = lmo.groupby('stay_id')['lomv'].transform('sum')

In [None]:
print(len(lmo))
lmo.head(100)

In [None]:
lmo = lmo.drop_duplicates()
lmo = lmo.reset_index()

In [None]:
print(len(lmo))
lmo.head(100)

In [None]:
lmo = lmo.drop(['index'], axis=1)

In [None]:
outcomes = mort.merge(lmo, how='left', on='stay_id')
print(len(outcomes))
outcomes.head(100)

In [None]:
outcomes = outcomes.drop(['subject_id'], axis=1)


In [None]:
outcomes = outcomes.sort_values(by=['label'])

In [None]:
outcomes['lomv_days'] = (outcomes['lomv']/60) /24

In [None]:
# outcomes.to_excel('Downloads/outcomes.xlsx')

In [None]:
print(diagnoses['hadm_id'].nunique())
print(len(diagnoses))

In [None]:
demo2 = basic_table.merge(icu_detail, how='left', on='stay_id')
demo2.head(5)

In [None]:
print(len(demo2))

In [None]:
demo2.columns.values

In [None]:
gendertab = pd.crosstab(demo2['label'], demo2['gender'])
racetab = pd.crosstab(demo2['label'], demo2['race'])
morttab = pd.crosstab(demo2['label'], demo2['hospital_expire_flag'])

In [None]:
gendertab.head(7)

In [None]:
racetab.head(7)

In [None]:
morttab.head(7)

In [None]:
# Left join of basic_table with diagnoses on hadm_id, followed by removal of
# the 'Unnamed: 0' column from the merged frame.
diag = (
    pd.merge(basic_table, diagnoses, how='left', on='hadm_id')
    .drop(columns=['Unnamed: 0'])
)

diag.head(50)

In [None]:
# Count the rows of diag whose long_title mentions sepsis.
# case=False also matches titles that start with a capital ("Sepsis ...").
# na=False counts rows without a title as non-matches; the left merge that
# builds diag can leave long_title as NaN.
# The variable keeps its original name in case other cells read it.
mel_count = diag['long_title'].str.contains('sepsis', case=False, na=False).sum()
if mel_count > 0:
    print("There are {m} sepsis diagnosis rows".format(m=mel_count))

In [None]:
pneumonia = pd.DataFrame({'dicom':diag['dicom'], 'label':diag['label'], 'long_title': diag['long_title']})
pneumonia = pneumonia.where(pneumonia['long_title'].str.contains('eart failure'))
pneumonia = pneumonia.where(pneumonia['long_title'].str.contains('cute'))
pneumonia = pneumonia.dropna()
pneumonia = pneumonia.drop_duplicates(subset='dicom')
print(len(pneumonia))

In [None]:
print(pneumonia['label'].value_counts(normalize=False))

In [None]:
pneumonia['long_title'].head(20)

In [None]:
# demo2.to_excel('Downloads/icu_detail.xlsx')
# diag.to_excel('Downloads/diag.xlsx')