In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pprint
import os
from sklearn.decomposition import PCA
from sklearn.mixture import GaussianMixture
from sklearn import metrics

# Raise pandas' display limits so wide cells and long frames are not truncated.
# https://stackoverflow.com/questions/54692405/output-truncation-in-google-colab
pd.options.display.max_colwidth = 100000
pd.options.display.max_rows = 100000

# Below imports are used to print out pretty pandas dataframes
from IPython.display import display, HTML

In [None]:
# The base table is stored as a pickle; every other input is a CSV.
# NOTE(review): unpickling can execute arbitrary code - only load files you trust.
base_table = pd.read_pickle('Downloads/base_table.pkl')

# Read the CSV inputs in a fixed order and bind each one to its own name.
stay_id, final_adult_patients, height, weight, comorb, icu_detail = (
    pd.read_csv(csv_path)
    for csv_path in (
        'Downloads/stay_id.csv',
        'Downloads/final_adult_patients.csv',
        'Downloads/height.csv',
        'Downloads/weight.csv',
        'Downloads/comorb.csv',
        'Downloads/icu_detail.csv',
    )
)

In [None]:
from datetime import datetime
def convert_time(x):
  """Parse a 'YYYY-MM-DD HH:MM:SS' timestamp string into a datetime.

  Args:
    x: Timestamp text in '%Y-%m-%d %H:%M:%S' layout. A value that stops
      right after the minutes' colon ('... HH:MM:') is completed with
      '00' seconds before parsing. A value that is already a datetime
      (pandas Timestamp included, as it subclasses datetime) is
      returned unchanged.

  Returns:
    The corresponding datetime object.

  Raises:
    ValueError: if the text does not match the expected layout.
  """
  # Values that were already parsed pass straight through, so a cell that
  # applies this function to a column can be re-run without a TypeError.
  if isinstance(x, datetime):
    return x
  date_time_str = x
  # endswith() rather than [-1] so an empty string reaches strptime and
  # fails with a descriptive ValueError instead of an IndexError.
  if date_time_str.endswith(':'):
    date_time_str += '00'
  return datetime.strptime(date_time_str, '%Y-%m-%d %H:%M:%S')

In [None]:
# List the column names available in the base table.
base_table.columns.to_numpy()

In [None]:
# Discard the 'Unnamed: 0' column that came in with the CSV.
stay_id = stay_id.drop(columns='Unnamed: 0')

In [None]:
# Turn the textual timestamps into datetime objects, column by column.
for _time_col in ('intime', 'outtime'):
    stay_id[_time_col] = stay_id[_time_col].apply(convert_time)
base_table['charttime_pO2'] = base_table['charttime_pO2'].apply(convert_time)

In [None]:
# Sanity check: element type of the parsed pO2 chart time.
print(type(base_table['charttime_pO2'].iat[0]))

In [None]:
# Attach the stay records to each base-table row sharing the same hadm_id.
basic_table = pd.merge(base_table, stay_id, how='inner', on='hadm_id')
print(basic_table.shape[0])

In [None]:

# Absolute gap, in minutes, between the pO2 chart time and the stay's outtime.
basic_table['time_diff'] = (
    (basic_table['charttime_pO2'] - basic_table['outtime']).abs()
    / pd.Timedelta(minutes=1)
)
# Smallest gap observed within each dicom group.
basic_table['min_time_diff'] = (
    basic_table.groupby('dicom')['time_diff'].transform('min')
)

# where() keeps the frame's shape: rows that are not the closest match are
# blanked out to NaN rather than removed.
basic_table = basic_table.where(
    basic_table['time_diff'].eq(basic_table['min_time_diff'])
)

In [None]:
# Remove every row that holds a missing value in any column.
basic_table = basic_table.dropna(axis=0, how='any')

In [None]:
# basic_table = basic_table.where(basic_table['intime'] < basic_table['charttime_pO2'])
# basic_table = basic_table.dropna()
# basic_table = basic_table.reset_index()

In [None]:
# Row count after filtering.
print(basic_table.shape[0])

In [None]:
# Preview the first 100 rows.
basic_table.head(n=100)

In [None]:
# Drop the matching helpers and stay timestamps, then fix the column order.
basic_table = basic_table.drop(
    columns=['intime', 'outtime', 'time_diff', 'min_time_diff', 'subject_id']
)
basic_table = basic_table.reindex(
    columns=[
        'subject_id_x', 'hadm_id', 'stay_id', 'dicom', 'PF_ratio',
        'charttime_pO2', 'pO2_time', 'pO2_date', 'Xray_date', 'Xray_time',
        'admittime', 'label', 'tsne_features_xs', 'tsne_features_ys',
        'tsne_features', 'los',
    ]
)

In [None]:
# basic_table.to_pickle('Downloads/basic_table.pkl')

In [None]:
# Remove 'los' and give the subject identifier its plain name back.
basic_table = (
    basic_table
    .drop(columns=['los'])
    .rename(columns={'subject_id_x': 'subject_id'})
)
basic_table.head()

In [None]:
# Load the BMI records and the gender/ethnicity table.
bmi, gender_ethnicity = (
    pd.read_csv(csv_path)
    for csv_path in ('Downloads/bmi.csv', 'Downloads/gender_ethnicity.csv')
)

In [None]:
# Evaluated but not rendered: only a cell's last expression is displayed.
gender_ethnicity.columns.values
# Discard the 'Unnamed: 0' column that came in with the CSV.
gender_ethnicity = gender_ethnicity.drop(columns='Unnamed: 0')

In [None]:
# Add gender/ethnicity columns to every row whose subject_id has a match.
G_E = pd.merge(basic_table, gender_ethnicity, how='inner', on='subject_id')
G_E.head(n=100)

In [None]:
# Keep the first row per dicom and renumber the index from zero.
G_E = G_E.drop_duplicates(subset=['dicom'], keep='first', ignore_index=True)
print(G_E.shape[0])

In [None]:
# Element type of chartdate as loaded, before any conversion.
print(type(bmi['chartdate'].iat[0]))
# Drop the CSV's 'Unnamed: 0' column and name the measurement column 'bmi'.
bmi = (
    bmi
    .drop(columns='Unnamed: 0')
    .rename(columns={'result_value': 'bmi'})
)
bmi.head()

In [None]:
# Strip the dashes from the date text and store the remaining digits as int.
bmi['chartdate'] = bmi['chartdate'].str.replace('-','').astype(int)

In [None]:
# Two-column working copy: each subject together with its pO2 date.
bmi_date = G_E[['subject_id', 'pO2_date']].copy()
print(type(bmi_date['pO2_date'].iat[0]))

In [None]:
# Pair each (subject, pO2 date) row with all BMI records of that subject.
bmi_date = pd.merge(bmi_date, bmi, how='inner', on='subject_id')
print(bmi_date.shape[0])

In [None]:
# chartdate holds dates as YYYYMMDD integers (dashes removed, cast to int).
# Subtracting such integers is NOT a distance in days: 20200101 - 20191231
# gives 8870 for dates one day apart, so the "closest" BMI record was picked
# wrongly across month/year boundaries. Parse both columns into real dates
# and compare in whole days instead.
# NOTE(review): assumes pO2_date uses the same numeric YYYYMMDD layout (the
# original integer subtraction only makes sense if it does) - confirm.
pO2_day = pd.to_datetime(
    bmi_date['pO2_date'].astype('int64').astype(str), format='%Y%m%d'
)
bmi_day = pd.to_datetime(
    bmi_date['chartdate'].astype('int64').astype(str), format='%Y%m%d'
)
bmi_date['time_diff'] = (pO2_day - bmi_day).dt.days.abs()

# Smallest gap (in days) found for each subject.
bmi_date['min_time_diff'] = bmi_date.groupby('subject_id')['time_diff'].transform('min')

# Blank out every row that is not at the minimum gap, remove the blanked rows,
# then keep a single row per subject.
bmi_date = bmi_date.where(bmi_date['time_diff'] == bmi_date['min_time_diff'])
bmi_date = bmi_date[bmi_date['subject_id'].notna()]
bmi_date = bmi_date.drop_duplicates(subset='subject_id', ignore_index=True)

In [None]:
# Drop the date and gap-matching helper columns.
bmi_date = bmi_date.drop(
    columns=['pO2_date', 'chartdate', 'time_diff', 'min_time_diff']
)
bmi_date.head()

In [None]:
# Combine the gender/ethnicity rows with the per-subject BMI rows.
demographics = pd.merge(G_E, bmi_date, how='inner', on='subject_id')
demographics.head(n=50)

In [None]:
# Number of distinct subjects (missing ids are not counted).
print(demographics['subject_id'].nunique(dropna=True))

In [None]:
# List the column names of the adult-patient table.
final_adult_patients.columns.to_numpy()

In [None]:
# Age lookup keyed by admission; the source column is called 'hadm_id_y'.
age = (
    final_adult_patients[['hadm_id_y', 'age']]
    .rename(columns={'hadm_id_y': 'hadm_id'})
)
gea = pd.merge(G_E, age, how='inner', on='hadm_id')
print(gea.shape[0])

In [None]:
# Keep only the last row seen for each dicom.
gea = gea.drop_duplicates(subset=['dicom'], keep='last')
print(gea.shape[0])

In [None]:
# Discard the 'Unnamed: 0' column that came in with each CSV.
height = height.drop(columns='Unnamed: 0')
weight = weight.drop(columns='Unnamed: 0')

# Show what the weight table contains.
weight.columns.to_numpy()

In [None]:
# Show what the height table contains.
height.columns.to_numpy()

In [None]:
# Pair weight and height records belonging to the same stay.
hw = pd.merge(weight, height, how='inner', on='stay_id')
print(height.shape[0])
print(hw.shape[0])

In [None]:
# Preview the first 100 weight/height pairs.
hw.head(n=100)

In [None]:
# Turn both timestamp columns of the weight/height table into datetimes.
for _time_col in ('starttime', 'charttime'):
    hw[_time_col] = hw[_time_col].apply(convert_time)

In [None]:
# Absolute gap, in minutes, between starttime and charttime of each pair.
hw['time_diff'] = (
    (hw['starttime'] - hw['charttime']).abs() / pd.Timedelta(minutes=1)
)
# Smallest gap observed within each stay.
hw['min_time_diff'] = hw.groupby('stay_id')['time_diff'].transform('min')

# Blank out pairs that are not at the stay's minimum gap ...
hw = hw.where(hw['time_diff'].eq(hw['min_time_diff']))

# ... and remove the blanked rows (their stay_id is NaN after where()).
hw = hw.loc[hw['stay_id'].notna()]

In [None]:
# Row count after keeping the closest pairs.
print(hw.shape[0])

In [None]:
# Preview the first five remaining pairs.
hw.head()

In [None]:
# Number of distinct stays that still have a weight/height pair.
print(hw['stay_id'].nunique(dropna=True))

In [None]:
# Two-column working copy: each stay together with its pO2 time.
ti = basic_table[['stay_id', 'pO2_time']].copy()
ti['pO2_time'] = ti['pO2_time'].apply(convert_time)

In [None]:
# Keep only weight/height rows whose stay also appears in the cohort table.
tihw = pd.merge(hw, ti, how='inner', on='stay_id')
print(tihw.shape[0])

In [None]:
# Number of distinct stays after the merge.
print(tihw['stay_id'].nunique(dropna=True))

In [None]:
# List the columns produced by the merge.
tihw.columns.to_numpy()

In [None]:
# Drop the timestamp, weight-type and gap-matching helper columns.
tihw = tihw.drop(
    columns=[
        'starttime', 'weight_type', 'charttime',
        'time_diff', 'min_time_diff', 'pO2_time',
    ]
)
tihw.head(n=100)

In [None]:
# Confirm which columns are left.
tihw.columns.to_numpy()

In [None]:
# Remove exact duplicate rows; reset_index() moves the old index into a
# column of its own.
tihw = tihw.drop_duplicates().reset_index()

In [None]:
# Remove the index columns left behind by reset_index(). 'level_0' is only
# created when an 'index' column already existed at reset time (for example
# when the reset cell has been executed twice), so demanding both columns
# raised KeyError on a clean top-to-bottom run. errors='ignore' drops
# whichever of the two is present and skips the rest.
tihw = tihw.drop(columns=['level_0', 'index'], errors='ignore')
tihw.head(100)

In [None]:
# Scale height by 1/100 (presumably centimetres to metres - confirm units),
# then BMI = weight / height squared.
# Note: this rescales 'height' in place, so running the cell twice divides
# the column by 100 twice.
tihw['height'] = tihw['height'].div(100)
tihw['bmi'] = tihw['weight'].div(tihw['height'].mul(tihw['height']))
tihw.head(n=10)

In [None]:
# Left join: every gea row is kept, with height/weight/BMI where the stay has them.
geab = pd.merge(gea, tihw, how='left', on='stay_id')
geab.head()

In [None]:
# Row count, dicom entry count and distinct dicom count; differing numbers
# indicate duplicated dicoms after the join.
print(geab.shape[0])
print(geab['dicom'].size)
print(geab['dicom'].nunique(dropna=True))

In [None]:
# Discard the 'Unnamed: 0' column that came in with the CSV.
comorb = comorb.drop(columns='Unnamed: 0')
comorb.head(n=10)

In [None]:
# Row count versus distinct admissions in the comorbidity table.
print(comorb.shape[0])
print(comorb['hadm_id'].nunique(dropna=True))

In [None]:
# Left join: every geab row is kept, with comorbidity columns where the
# admission has them.
demographics = pd.merge(geab, comorb, how='left', on='hadm_id')
# Row count, dicom entry count and distinct dicom count after the join.
print(demographics.shape[0])
print(demographics['dicom'].size)
print(demographics['dicom'].nunique(dropna=True))
demographics.columns.to_numpy()

In [None]:
# Remove two of the subject-id columns accumulated through the merges.
demographics = demographics.drop(columns=['subject_id', 'subject_id_y'])

In [None]:
# Order the rows by label, ascending.
demographics = demographics.sort_values('label')


In [None]:
# demographics.to_excel('Downloads/demographics.xlsx')

In [None]:
# Distinct subjects, stays and dicoms in the cohort table, in that order.
for _id_col in ('subject_id', 'stay_id', 'dicom'):
    print(basic_table[_id_col].nunique())

In [None]:
# Contingency table: rows are labels, columns are gender values.
gender = pd.crosstab(
    index=demographics['label'],
    columns=demographics['gender'],
)
gender.head(n=7)

In [None]:
# Contingency table: rows are labels, columns are race values.
ethnicity = pd.crosstab(
    index=demographics['label'],
    columns=demographics['race'],
)
ethnicity.head(n=7)