Notebook to ascertain whether we can incorporate procedures into the context data for generating text.

In [8]:
import pandas as pd
import numpy as np
import os
import psycopg2
import sqlalchemy
import string

In [11]:
# Database connection.
# The connection URL carries a username and password, so it is read from the
# DATABASE_URL environment variable rather than being written into the notebook.
# Expected form: postgresql+psycopg2://USER:PASSWORD@HOST/DBNAME
dbschema = 'public'

db_url = os.environ.get('DATABASE_URL')
if not db_url:
    raise RuntimeError(
        'Set the DATABASE_URL environment variable to the PostgreSQL '
        'connection URL before running this notebook.'
    )

# Some hosting providers hand out URLs with the legacy "postgres://" scheme,
# which SQLAlchemy 1.4+ no longer accepts; normalise it to the psycopg2 dialect.
if db_url.startswith('postgres://'):
    db_url = 'postgresql+psycopg2://' + db_url[len('postgres://'):]

# search_path makes unqualified table names resolve against `dbschema`.
cnx = sqlalchemy.create_engine(
    db_url,
    connect_args={'options': '-csearch_path={}'.format(dbschema)},
)

In [12]:
# Load every coded procedure together with its ICD-9 short and long titles,
# ordered by subject and then by the sequence number of the procedure.
procedures_query = """
  SELECT p.subject_id, p.hadm_id, p.seq_num, p.icd9_code, icd.short_title, icd.long_title
  FROM "PROCEDURES_ICD" p
  INNER JOIN "D_ICD_PROCEDURES" icd 
  USING (icd9_code)
  ORDER BY p.subject_id, p.seq_num
  --LIMIT 10000;
"""
df_procedures = pd.read_sql_query(sql=procedures_query, con=cnx)

# Row/column count first, then a preview of the first 20 rows.
print(df_procedures.shape)
df_procedures.head(20)

(516, 6)


Unnamed: 0,subject_id,hadm_id,seq_num,icd9_code,short_title,long_title
0,10006,142345,1,9749,Remov thor ther dev NEC,Removal of other device from thorax
1,10006,142345,2,5491,Percu abdominal drainage,Percutaneous abdominal drainage
2,10006,142345,3,3895,Ven cath renal dialysis,Venous catheterization for renal dialysis
3,10006,142345,4,3995,Hemodialysis,Hemodialysis
4,10006,142345,5,3893,Venous cath NEC,"Venous catheterization, not elsewhere classified"
5,10006,142345,6,9907,Serum transfusion NEC,Transfusion of other serum
6,10006,142345,7,14,Injection oxazolidinone,Injection or infusion of oxazolidinone class o...
7,10011,105331,1,9915,Parent infus nutrit sub,Parenteral infusion of concentrated nutritiona...
8,10011,105331,2,3893,Venous cath NEC,"Venous catheterization, not elsewhere classified"
9,10013,165520,1,3891,Arterial catheterization,Arterial catheterization


In [13]:
# Count the procedure rows recorded for each (subject, admission) pair,
# with the admissions that have the most procedures listed first.
procedure_count_query = """
  SELECT subject_id, hadm_id, COUNT(seq_num)
  FROM "PROCEDURES_ICD" p
  INNER JOIN "D_ICD_PROCEDURES" icd 
  USING (icd9_code)
  GROUP BY subject_id, hadm_id
  ORDER BY COUNT(seq_num) DESC
  --LIMIT 10000;
"""
df_proc_sum = pd.read_sql_query(sql=procedure_count_query, con=cnx)

# Row/column count first, then a preview of the first 20 rows.
print(df_proc_sum.shape)
df_proc_sum.head(20)

(113, 3)


Unnamed: 0,subject_id,hadm_id,count
0,10126,160445,23
1,10127,182839,20
2,42075,151323,18
3,40310,186361,14
4,10045,126949,14
5,40595,116518,13
6,10027,199395,12
7,42281,195911,12
8,42367,139932,11
9,41914,101361,11


In [14]:
# Mean number of procedure rows per (subject, admission) group.
df_proc_sum['count'].mean()

4.566371681415929

On average, each hospital admission has about 4.6 procedures recorded.

In [18]:
# For every admission that has a discharge summary, fetch the latest note
# chartdate, the latest hospital discharge time, and the date difference
# between the two (`diff`), largest difference first.
admission_gap_query = """
  SELECT n.subject_id, n.hadm_id, MAX(n.chartdate) AS chartdate, MAX(a.dischtime) AS dischtime,
  (cast(MAX(a.dischtime) as date) - cast(MAX(n.chartdate) as date)) AS diff
  FROM "NOTEEVENTS2" n
  JOIN "ADMISSIONS" a
  ON n.subject_id = a.subject_id AND n.hadm_id = a.hadm_id 
  WHERE n.category = 'Discharge summary'
  GROUP BY n.subject_id, n.hadm_id
  ORDER BY diff DESC
  --LIMIT 10000;
"""
df_adm = pd.read_sql_query(sql=admission_gap_query, con=cnx)

# Row/column count first, then a preview of the first 20 rows.
print(df_adm.shape)
df_adm.head(20)

(118, 5)


Unnamed: 0,subject_id,hadm_id,chartdate,dischtime,diff
0,10006,142345.0,2164-11-01 00:00:00,2164-11-01 17:15:00,0
1,10011,105331.0,2126-08-28 00:00:00,2126-08-28 18:59:00,0
2,10013,165520.0,2125-10-07 00:00:00,2125-10-07 15:13:00,0
3,10017,199207.0,2149-06-03 00:00:00,2149-06-03 18:42:00,0
4,10019,177759.0,2163-05-15 00:00:00,2163-05-15 12:00:00,0
5,10026,103770.0,2195-05-24 00:00:00,2195-05-24 11:45:00,0
6,10027,199395.0,2190-07-25 00:00:00,2190-07-25 14:00:00,0
7,10029,132349.0,2139-10-02 00:00:00,2139-10-02 14:29:00,0
8,10032,140372.0,2138-04-15 00:00:00,2138-04-15 14:35:00,0
9,10033,157235.0,2132-12-08 00:00:00,2132-12-08 15:15:00,0


In [19]:
# Treat negative gaps (note charted after the recorded discharge) as zero;
# all other values, including missing ones, are left untouched.
df_adm['diff'] = df_adm['diff'].mask(df_adm['diff'] < 0, 0)

In [20]:
# Mean gap across all admissions with a discharge summary.
df_adm['diff'].mean()

0.0

Clearly there are a lot of patients who stay in hospital for a long time after being discharged from the ICU. However, the average stay post ICU discharge is only 0.03 days, meaning almost all patients leave hospital after being discharged from the ICU. What about if we discount patients who died in the ICU?

In [22]:
# Date of death (`dod`) for every patient, ordered by subject id.
death_query = """
  SELECT subject_id, dod
  FROM "PATIENTS"
  ORDER BY subject_id
  --LIMIT 10000;
"""
df_death = pd.read_sql_query(sql=death_query, con=cnx)

# Row/column count first, then a preview of the first rows.
print(df_death.shape)
df_death.head()

(100, 2)


Unnamed: 0,subject_id,dod
0,10006,2165-08-12 00:00:00
1,10011,2126-08-28 00:00:00
2,10013,2125-10-07 00:00:00
3,10017,2152-09-12 00:00:00
4,10019,2163-05-15 00:00:00


In [23]:
# Attach each patient's date of death to the admission rows; the left join
# keeps every admission even when no matching patient row is found.
df_temp = df_adm.merge(df_death, how='left', on='subject_id')
df_temp.head()

Unnamed: 0,subject_id,hadm_id,chartdate,dischtime,diff,dod
0,10006,142345.0,2164-11-01 00:00:00,2164-11-01 17:15:00,0,2165-08-12 00:00:00
1,10011,105331.0,2126-08-28 00:00:00,2126-08-28 18:59:00,0,2126-08-28 00:00:00
2,10013,165520.0,2125-10-07 00:00:00,2125-10-07 15:13:00,0,2125-10-07 00:00:00
3,10017,199207.0,2149-06-03 00:00:00,2149-06-03 18:42:00,0,2152-09-12 00:00:00
4,10019,177759.0,2163-05-15 00:00:00,2163-05-15 12:00:00,0,2163-05-15 00:00:00


In [24]:
# Flag admissions where the date of death is on or before the note's
# chartdate (1 = dead by then, 0 = otherwise, including a missing dod).
df_temp['dead'] = (df_temp['dod'] <= df_temp['chartdate']).astype(int)

In [25]:
# Keep only the admissions that were not flagged as dead.
not_dead = df_temp['dead'].eq(0)
df_temp2 = df_temp[not_dead]
df_temp2.shape

(87, 7)

In [26]:
# Mean gap after excluding the admissions flagged as dead.
df_temp2['diff'].mean()

0.0

It turns out this only barely makes a difference. Still, most patients are discharged immediately after their ICU stay. I have now realised that we are including neonates in this. Let's remove them to look at only adults.

In [27]:
# Subjects with at least one discharge summary charted when their rounded
# age (days between dob and chartdate, divided by 365.242) was above 14.
sql = '''
  SELECT DISTINCT p.subject_id
  FROM "PATIENTS" p 
  INNER JOIN "NOTEEVENTS2" n 
  ON p.subject_id = n.subject_id
  WHERE ROUND((cast(chartdate as date) - cast(dob as date)) / 365.242,0) > 14
  AND n.category = 'Discharge summary'
  ORDER BY subject_id
  --LIMIT 100;
'''

adult_query = sqlalchemy.text(sql)
df = pd.read_sql_query(sql=adult_query, con=cnx)

# Plain list of qualifying subject ids, used for membership filtering below.
adults = list(df['subject_id'])

In [28]:
# Restrict the admissions to subjects in the `adults` list.
# NOTE(review): this filters df_temp (dead-flagged rows included), not
# df_temp2 - confirm that keeping those rows is intended.
is_adult = df_temp['subject_id'].isin(adults)
df_temp3 = df_temp[is_adult]
print(df_temp3.shape)
df_temp3.head()

(118, 7)


Unnamed: 0,subject_id,hadm_id,chartdate,dischtime,diff,dod,dead
0,10006,142345.0,2164-11-01 00:00:00,2164-11-01 17:15:00,0,2165-08-12 00:00:00,0
1,10011,105331.0,2126-08-28 00:00:00,2126-08-28 18:59:00,0,2126-08-28 00:00:00,1
2,10013,165520.0,2125-10-07 00:00:00,2125-10-07 15:13:00,0,2125-10-07 00:00:00,1
3,10017,199207.0,2149-06-03 00:00:00,2149-06-03 18:42:00,0,2152-09-12 00:00:00,0
4,10019,177759.0,2163-05-15 00:00:00,2163-05-15 12:00:00,0,2163-05-15 00:00:00,1


In [29]:
# Mean gap for the adult-only subset.
df_temp3['diff'].mean()

0.0

As predicted, the average stay post ICU discharge drops even further after removing neonates. Let's take a closer look at some of the patients who have very long stays in hospital.

In [30]:
# Show the procedures recorded for one specific admission (hadm_id 150657).
df_procedures[df_procedures['hadm_id'].eq(150657)]

Unnamed: 0,subject_id,hadm_id,seq_num,icd9_code,short_title,long_title


There aren't that many procedures, but to be safe we will include procedures only for patients who leave hospital within a day of discharge.

In [31]:
# Keep the admissions whose gap is at most one day.
within_one_day = df_temp3['diff'].le(1)
df_final = df_temp3[within_one_day]
print(df_final.shape)
df_final.head()

(118, 7)


Unnamed: 0,subject_id,hadm_id,chartdate,dischtime,diff,dod,dead
0,10006,142345.0,2164-11-01 00:00:00,2164-11-01 17:15:00,0,2165-08-12 00:00:00,0
1,10011,105331.0,2126-08-28 00:00:00,2126-08-28 18:59:00,0,2126-08-28 00:00:00,1
2,10013,165520.0,2125-10-07 00:00:00,2125-10-07 15:13:00,0,2125-10-07 00:00:00,1
3,10017,199207.0,2149-06-03 00:00:00,2149-06-03 18:42:00,0,2152-09-12 00:00:00,0
4,10019,177759.0,2163-05-15 00:00:00,2163-05-15 12:00:00,0,2163-05-15 00:00:00,1


In [32]:
# Number of distinct admission ids that survive the filters.
len(df_final['hadm_id'].unique())

118

The total number of unique hospital admission ids in our discharge summary dataset is 48,902, so we are only losing out on 99 discharge summaries for context data - not the end of the world by any means. Let's export the hospital admission ids to use in our preprocessing. However, we'll still use them only for the last discharge note of an admission - just to be safe.

In [34]:
# Export the selected admission ids for use in preprocessing.
out_path = 'safal_data/df_proc_hadm_ids.csv'

# Create the output directory if needed so the export does not fail on a
# fresh checkout.
os.makedirs(os.path.dirname(out_path), exist_ok=True)

# hadm_id comes back from the query as a float column (e.g. 142345.0); cast it
# to a nullable integer so the ids are written as "142345", not "142345.0".
(
    df_final[['hadm_id']]
    .astype({'hadm_id': 'Int64'})
    .to_csv(out_path, index=False)
)