Notebook to ascertain whether we can incorporate procedures into the context data for generating text.

In [30]:
import pandas as pd
import numpy as np
import os
import psycopg2
import sqlalchemy
import string
from dateutil import parser

In [38]:
dbschema = 'public'

# Database credentials must never be stored in the notebook: they end up in
# version control and in every shared copy. Supply the full SQLAlchemy URL via
# the environment instead, e.g.
#   export MIMIC_DATABASE_URL='postgresql+psycopg2://<user>:<password>@<host>/<dbname>'
# Point the variable at either the demo database or the full database.
database_url = os.environ.get('MIMIC_DATABASE_URL')
if not database_url:
    raise RuntimeError(
        'Set the MIMIC_DATABASE_URL environment variable to the SQLAlchemy '
        'connection URL before running this notebook.'
    )

# The search_path option makes unqualified table names resolve to `dbschema`.
cnx = sqlalchemy.create_engine(
    database_url,
    connect_args={'options': '-csearch_path={}'.format(dbschema)},
)

In [39]:
# Procedures data: the ICD-9 procedure codes recorded per admission, plus the
# dictionary table that maps each code to its short and long title.
procedure_ICD_DF = pd.read_sql_query('''
  SELECT "SUBJECT_ID", "HADM_ID", "SEQ_NUM", "ICD9_CODE"
  FROM "PROCEDURES_ICD"
''', cnx)

D_procedure_ICD_DF = pd.read_sql_query('''
  SELECT "ICD9_CODE", "SHORT_TITLE", "LONG_TITLE"
  FROM "D_ICD_PROCEDURES"
''', cnx)

# Inner join on the code: procedure rows whose ICD9_CODE has no dictionary
# entry are dropped from the result.
procedure_DF_dup = pd.merge(procedure_ICD_DF, D_procedure_ICD_DF, on=['ICD9_CODE'])

# HADM_ID is part of the sort key so that each admission's procedures stay
# together. SEQ_NUM appears to restart at 1 for every admission (see the rows
# displayed below), so sorting on SUBJECT_ID and SEQ_NUM alone would interleave
# the procedures of a patient's different admissions.
procedure_DF_dup = procedure_DF_dup.sort_values(by=['SUBJECT_ID', 'HADM_ID', 'SEQ_NUM'])
procedure_DF_dup

Unnamed: 0,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE,SHORT_TITLE,LONG_TITLE
164656,2,163353,1,9955,Vaccination NEC,Prophylactic administration of vaccine against...
85187,3,145834,1,9604,Insert endotracheal tube,Insertion of endotracheal tube
108800,3,145834,2,9962,Heart countershock NEC,Other electric countershock of heart
170768,3,145834,3,8964,Pulmon art wedge monitor,Pulmonary artery wedge monitoring
26725,3,145834,4,9672,Cont inv mec ven 96+ hrs,Continuous invasive mechanical ventilation for...
...,...,...,...,...,...,...
141679,99999,113369,1,8108,Lumb/lmbsac fus ant/post,Lumbar and lumbosacral fusion of the anterior ...
175368,99999,113369,2,8051,Excision intervert disc,Excision of intervertebral disc
223902,99999,113369,3,8162,Fus/refus 2-3 vertebrae,Fusion or refusion of 2-3 vertebrae
242539,99999,113369,4,9979,Other therapeu apheresis,Other therapeutic apheresis


In [40]:
# Number of recorded procedures per (patient, admission) pair, smallest first.
procedure_count = (
    procedure_ICD_DF
    .groupby(['SUBJECT_ID', 'HADM_ID'])['SEQ_NUM']
    .count()
    .rename('COUNT')
    .to_frame()
    .sort_values(by=['COUNT'])
)
procedure_count

Unnamed: 0_level_0,Unnamed: 1_level_0,COUNT
SUBJECT_ID,HADM_ID,Unnamed: 2_level_1
2,163353,1
27991,116541,1
27990,138903,1
27989,164239,1
8706,124790,1
...,...,...
27755,155889,38
24810,143994,40
62795,173748,40
29467,194819,40


In [41]:
# Average number of procedures per admission.
procedure_count['COUNT'].mean()

4.595735313821947

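We note that, on average, each hospital admission has about 4.60 procedures recorded. The counts are computed per `SUBJECT_ID`/`HADM_ID` pair, so this is a per-admission figure (rather than per-patient) over admissions that have at least one recorded procedure.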

In [51]:
# Latest discharge-summary chart date for every (patient, admission) pair.
discharge_summary_sql = '''
            SELECT "SUBJECT_ID", "HADM_ID",MAX("CHARTDATE") AS "CHARTDATE"
            FROM "NOTEEVENTS" 
            WHERE "CATEGORY" = 'Discharge summary'
            GROUP BY "SUBJECT_ID", "HADM_ID"
            '''
ne_table = pd.read_sql_query(discharge_summary_sql, cnx)

# Latest discharge time for every (patient, admission) pair.
discharge_time_sql = '''
            SELECT "SUBJECT_ID", "HADM_ID", MAX("DISCHTIME") AS "DISCHTIME" 
            FROM "ADMISSIONS"
            GROUP BY "SUBJECT_ID", "HADM_ID"
            '''
adm_table = pd.read_sql_query(discharge_time_sql, cnx)

# Inner join: keeps only admissions that have both a discharge summary and an
# ADMISSIONS row.
ne_adm = ne_table.merge(adm_table, on=["SUBJECT_ID", "HADM_ID"])

In [53]:
# Parse both timestamp columns with pandas' vectorised parser instead of a
# per-row dateutil call. Unlike dateutil's parser, pd.to_datetime also accepts
# values that are already datetimes, so this cell can be re-run safely.
ne_adm["CHARTDATE"] = pd.to_datetime(ne_adm["CHARTDATE"])
ne_adm["DISCHTIME"] = pd.to_datetime(ne_adm["DISCHTIME"])

# Whole days from the discharge-summary chart date to the discharge time.
# `.dt.days` floors, so a summary charted on the day after discharge gives -1.
ne_adm["DIFF"] = (ne_adm["DISCHTIME"] - ne_adm["CHARTDATE"]).dt.days.astype(int)

# Largest gaps first.
ne_adm = ne_adm.sort_values(by=['DIFF'], ascending=False)
ne_adm

Unnamed: 0,SUBJECT_ID,HADM_ID,CHARTDATE,DISCHTIME,DIFF
6408,6145,163196.0,2110-06-13,2110-09-02 15:35:00,81
16166,15482,178068.0,2125-01-12,2125-02-20 13:45:00,39
23524,22560,137561.0,2164-04-10,2164-05-18 19:00:00,38
7030,6764,138049.0,2159-01-30,2159-03-08 14:00:00,37
27638,26446,150657.0,2152-04-07,2152-05-08 12:00:00,31
...,...,...,...,...,...
5721,5525,133906.0,2116-06-27,2116-06-26 13:40:00,-1
7671,7362,173044.0,2187-01-09,2187-01-08 16:30:00,-1
12750,12121,121131.0,2160-10-11,2160-10-10 14:04:00,-1
41353,62933,161258.0,2166-02-17,2166-02-16 21:44:00,-1


In [54]:
# Treat summaries charted after the discharge time (negative gap) as a gap of 0.
ne_adm['DIFF'] = ne_adm['DIFF'].clip(lower=0)

In [56]:
# Average gap in days across all admissions, after clamping negatives to 0.
ne_adm['DIFF'].mean()

0.029435193263285666

We note that the gap between the discharge-summary chart date and the hospital discharge time is very small on average (about 0.03 days), i.e. patients do not stay long after their discharge summary has been written. Next, we try to reduce this average further by removing children below the age of 18. We believe this will also reduce the skewness in our data.

In [80]:
# One row per patient with date of birth and (if any) date of death.
patients_sql = '''
            SELECT DISTINCT "SUBJECT_ID", "DOB", "DOD"
            FROM "PATIENTS"
            '''
patients_table = pd.read_sql_query(patients_sql, cnx)

# Attach each patient's demographics to their discharge-summary rows
# (one row per admission that has a discharge summary).
patients_ne = patients_table.merge(ne_table, on=["SUBJECT_ID"])
patients_ne

Unnamed: 0,SUBJECT_ID,DOB,DOD,HADM_ID,CHARTDATE
0,97316,2083-05-18 00:00:00,2171-10-30 00:00:00,169543.0,2171-09-17
1,10281,2107-08-03 00:00:00,,184968.0,2170-06-05
2,10484,2151-10-24 00:00:00,2190-09-24 00:00:00,113233.0,2190-09-24
3,97345,2112-06-21 00:00:00,,186013.0,2138-08-29
4,65358,2021-01-14 00:00:00,,190204.0,2102-03-28
...,...,...,...,...,...
52721,96491,2129-09-27 00:00:00,,118054.0,2174-01-12
52722,11570,2068-05-19 00:00:00,,158118.0,2146-01-15
52723,13464,2147-09-05 00:00:00,,159360.0,2201-05-28
52724,25066,2137-03-20 00:00:00,,113064.0,2191-01-22


In [81]:
# Convert both columns to plain `datetime.date` objects. pd.to_datetime parses
# the whole column at once and also accepts values that are already dates, so
# this cell can be re-run (dateutil's parser rejects non-string input).
patients_ne["CHARTDATE"] = pd.to_datetime(patients_ne["CHARTDATE"]).dt.date
patients_ne["DOB"] = pd.to_datetime(patients_ne["DOB"]).dt.date

# Age in years at the discharge-summary chart date, using 365-day years.
# The subtraction is done on Python date objects, element by element.
# NOTE(review): presumably this avoids pandas Timedelta overflow for very large
# DOB-to-CHARTDATE gaps; confirm before vectorising the subtraction.
patients_ne['AGE'] = [
    (chart_date - birth_date).days / 365
    for chart_date, birth_date in zip(patients_ne['CHARTDATE'], patients_ne['DOB'])
]

# Keep only rows where the patient was at least 18 at the chart date.
patients_ne = patients_ne[patients_ne['AGE'] >= 18]
patients_ne


Unnamed: 0,SUBJECT_ID,DOB,DOD,HADM_ID,CHARTDATE,AGE
0,97316,2083-05-18,2171-10-30 00:00:00,169543.0,2171-09-17,88.391781
1,10281,2107-08-03,,184968.0,2170-06-05,62.882192
2,10484,2151-10-24,2190-09-24 00:00:00,113233.0,2190-09-24,38.945205
3,97345,2112-06-21,,186013.0,2138-08-29,26.205479
4,65358,2021-01-14,,190204.0,2102-03-28,81.252055
...,...,...,...,...,...,...
52721,96491,2129-09-27,,118054.0,2174-01-12,44.323288
52722,11570,2068-05-19,,158118.0,2146-01-15,77.709589
52723,13464,2147-09-05,,159360.0,2201-05-28,53.761644
52724,25066,2137-03-20,,113064.0,2191-01-22,53.879452


In [82]:
# SUBJECT_IDs of the remaining (adult) rows; may contain repeats.
adults = patients_ne['SUBJECT_ID'].tolist()

In [86]:
# AGE in `patients_ne` is computed per admission (from that admission's
# CHARTDATE), so adult admissions are selected by (SUBJECT_ID, HADM_ID).
# Filtering on SUBJECT_ID alone would keep an under-18 admission whenever the
# same patient also has an adult admission; the subject-level `adults` list is
# therefore deliberately not used here.
adult_admission_keys = set(zip(patients_ne['SUBJECT_ID'], patients_ne['HADM_ID']))
is_adult_admission = np.array(
    [key in adult_admission_keys
     for key in zip(ne_adm['SUBJECT_ID'], ne_adm['HADM_ID'])],
    dtype=bool,
)
# A boolean mask keeps the original row order and index of `ne_adm`.
ne_adm_adult = ne_adm.loc[is_adult_admission]
print(ne_adm_adult.shape)
ne_adm_adult.head()

(48821, 5)


Unnamed: 0,SUBJECT_ID,HADM_ID,CHARTDATE,DISCHTIME,DIFF
6408,6145,163196.0,2110-06-13,2110-09-02 15:35:00,81
23524,22560,137561.0,2164-04-10,2164-05-18 19:00:00,38
27638,26446,150657.0,2152-04-07,2152-05-08 12:00:00,31
7973,7648,118565.0,2130-09-27,2130-10-24 11:42:00,27
17462,16727,157755.0,2158-09-01,2158-09-27 16:11:00,26


In [87]:
# Average gap in days for the adult subset.
ne_adm_adult['DIFF'].mean()

0.017123778701788165

In [88]:
# Keep only admissions whose discharge time is at most one day after the
# discharge-summary chart date.
within_one_day = ne_adm_adult['DIFF'].le(1)
ne_adm_adult = ne_adm_adult.loc[within_one_day]
print(ne_adm_adult.shape)
ne_adm_adult.head()

(48722, 5)


Unnamed: 0,SUBJECT_ID,HADM_ID,CHARTDATE,DISCHTIME,DIFF
18492,17707,102227.0,2154-03-20,2154-03-21 16:00:00,1
44112,71869,157803.0,2143-02-28,2143-03-01 06:37:00,1
5625,5428,163297.0,2143-12-19,2143-12-20 11:41:00,1
22453,21497,127399.0,2107-05-23,2107-05-24 10:00:00,1
42570,66850,104490.0,2118-03-25,2118-03-26 15:50:00,1


We will export all the admission ids required for this project to a CSV file.

In [89]:
# Write only the HADM_ID column, without the DataFrame index.
# NOTE(review): HADM_ID is displayed as a float above (e.g. 102227.0), so the
# ids are likely written with a trailing ".0" - confirm consumers expect that.
ne_adm_adult.to_csv('../data/required_hadm_ids.csv', columns=['HADM_ID'], index=False)