In [1]:
import pandas as pd
import re
import transformers as ppb
import numpy as np
from scipy import sparse

In [2]:
DATA_DIR = "./data"
admissions_file = DATA_DIR + '/ADMISSIONS.csv'
diagnoses_file = DATA_DIR + '/DIAGNOSES_ICD.csv'

First use admissions records to create READMIT label and STAY_LENGTH

In [7]:
full_admissions = pd.read_csv(admissions_file)
print(full_admissions.shape)
print(full_admissions.dtypes)
full_admissions.head()

(58976, 19)
ROW_ID                   int64
SUBJECT_ID               int64
HADM_ID                  int64
ADMITTIME               object
DISCHTIME               object
DEATHTIME               object
ADMISSION_TYPE          object
ADMISSION_LOCATION      object
DISCHARGE_LOCATION      object
INSURANCE               object
LANGUAGE                object
RELIGION                object
MARITAL_STATUS          object
ETHNICITY               object
EDREGTIME               object
EDOUTTIME               object
DIAGNOSIS               object
HOSPITAL_EXPIRE_FLAG     int64
HAS_CHARTEVENTS_DATA     int64
dtype: object


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1


In [8]:
# convert all to datetime
full_admissions['ADMITTIME'] = pd.to_datetime(full_admissions['ADMITTIME'])
full_admissions['DISCHTIME'] = pd.to_datetime(full_admissions['DISCHTIME'])
full_admissions['DEATHTIME'] = pd.to_datetime(full_admissions['DEATHTIME'])
full_admissions.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,ADMISSION_LOCATION,DISCHARGE_LOCATION,INSURANCE,LANGUAGE,RELIGION,MARITAL_STATUS,ETHNICITY,EDREGTIME,EDOUTTIME,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
0,21,22,165315,2196-04-09 12:26:00,2196-04-10 15:54:00,NaT,EMERGENCY,EMERGENCY ROOM ADMIT,DISC-TRAN CANCER/CHLDRN H,Private,,UNOBTAINABLE,MARRIED,WHITE,2196-04-09 10:06:00,2196-04-09 13:24:00,BENZODIAZEPINE OVERDOSE,0,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,NaT,ELECTIVE,PHYS REFERRAL/NORMAL DELI,HOME HEALTH CARE,Medicare,,CATHOLIC,MARRIED,WHITE,,,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1
2,23,23,124321,2157-10-18 19:34:00,2157-10-25 14:00:00,NaT,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME HEALTH CARE,Medicare,ENGL,CATHOLIC,MARRIED,WHITE,,,BRAIN MASS,0,1
3,24,24,161859,2139-06-06 16:14:00,2139-06-09 12:48:00,NaT,EMERGENCY,TRANSFER FROM HOSP/EXTRAM,HOME,Private,,PROTESTANT QUAKER,SINGLE,WHITE,,,INTERIOR MYOCARDIAL INFARCTION,0,1
4,25,25,129635,2160-11-02 02:06:00,2160-11-05 14:55:00,NaT,EMERGENCY,EMERGENCY ROOM ADMIT,HOME,Private,,UNOBTAINABLE,MARRIED,WHITE,2160-11-02 01:01:00,2160-11-02 04:27:00,ACUTE CORONARY SYNDROME,0,1


In [9]:
# order by subject and discharge 
admits_patient_df = full_admissions.sort_values(by=['SUBJECT_ID','DISCHTIME']).loc[:,('ROW_ID','SUBJECT_ID','HADM_ID','ADMITTIME',\
    'DISCHTIME','DEATHTIME','ADMISSION_TYPE','DIAGNOSIS','HOSPITAL_EXPIRE_FLAG','HAS_CHARTEVENTS_DATA')]
admits_patient_df.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
211,1,2,163353,2138-07-17 19:04:00,2138-07-21 15:48:00,NaT,NEWBORN,NEWBORN,0,1
212,2,3,145834,2101-10-20 19:08:00,2101-10-31 13:58:00,NaT,EMERGENCY,HYPOTENSION,0,1
213,3,4,185777,2191-03-16 00:28:00,2191-03-23 18:41:00,NaT,EMERGENCY,"FEVER,DEHYDRATION,FAILURE TO THRIVE",0,1
214,4,5,178980,2103-02-02 04:31:00,2103-02-04 12:15:00,NaT,NEWBORN,NEWBORN,0,1
215,5,6,107064,2175-05-30 07:15:00,2175-06-15 16:00:00,NaT,ELECTIVE,CHRONIC RENAL FAILURE/SDA,0,1


In [10]:
# separate only patients with more than one admission to create READMIT flag
mult_admits = admits_patient_df[admits_patient_df.duplicated(subset=['SUBJECT_ID'],keep=False)]
print(mult_admits.shape)
mult_admits.head()

(19993, 10)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA
224,14,17,194023,2134-12-27 07:15:00,2134-12-31 16:05:00,NaT,ELECTIVE,PATIENT FORAMEN OVALE\ PATENT FORAMEN OVALE MI...,0,1
225,15,17,161087,2135-05-09 14:11:00,2135-05-13 14:40:00,NaT,EMERGENCY,PERICARDIAL EFFUSION,0,1
229,19,21,109451,2134-09-11 12:17:00,2134-09-24 16:15:00,NaT,EMERGENCY,CONGESTIVE HEART FAILURE,0,1
230,20,21,111970,2135-01-30 20:50:00,2135-02-08 02:08:00,2135-02-08 02:08:00,EMERGENCY,SEPSIS,1,1
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,NaT,ELECTIVE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1


In [84]:
# showing what the rolling function does on a single example with 3 admissions
mult_admits[mult_admits['SUBJECT_ID']==36].loc[:,('DISCHTIME','READMIT')].rolling(window='30D', on='DISCHTIME',closed='left').sum()

Unnamed: 0,DISCHTIME,READMIT
15,2131-05-08 14:00:00,
16,2131-05-25 13:30:00,1.0
17,2134-05-20 13:16:00,


In [11]:
# identify readmits within 30 days
subjects = mult_admits['SUBJECT_ID'].unique()
mult_admits.loc[:,'READMIT_2'] = 1

# assign flag for each patient if they were re-admitted
for subject in subjects:
    readmitted = mult_admits[mult_admits['SUBJECT_ID'] == subject].loc[:,('DISCHTIME','READMIT_2')].rolling(window='30D', on='DISCHTIME',closed='left').sum()
    #print(readmitted)
    mult_admits.loc[readmitted.index, 'READMIT_2'] = readmitted['READMIT_2']
    #print(subject)

# assign READMIT flag to the occurrence PRIOR to the readmit
mult_admits['READMIT'] = mult_admits['READMIT_2'].shift(-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = value
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_column(ilocs[0], value, pi)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mult_admits['READMIT'] = mult_admits['READMIT_2'].shift(-1)


In [109]:
# confirm READMIT working as expected
mult_admits[mult_admits['SUBJECT_ID'] == 36]

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,READMIT_2,READMIT
15,36,36,182104,2131-04-30 07:15:00,2131-05-08 14:00:00,NaT,EMERGENCY,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1,,1.0
16,37,36,122659,2131-05-12 19:49:00,2131-05-25 13:30:00,NaT,EMERGENCY,CHEST PAIN/SHORTNESS OF BREATH,0,1,1.0,
17,38,36,165660,2134-05-10 11:30:00,2134-05-20 13:16:00,NaT,ELECTIVE,VENTRAL HERNIA/SDA,0,1,,


In [12]:
single_admits = admits_patient_df[~admits_patient_df.duplicated(subset=['SUBJECT_ID'],keep=False)]
print(single_admits.shape)
print('total rows in single and mult admits tables:',mult_admits.shape[0] + single_admits.shape[0])

(38983, 10)
total rows in single and mult admits tables: 58976


In [13]:
# concatenate individuals with multiple admissions and single admissions back to full dataset
full_admits_labels = pd.concat([mult_admits, single_admits])
full_admits_labels['READMIT'] = full_admits_labels['READMIT'].fillna(0)
full_admits_labels['READMIT'] = np.where(full_admits_labels['READMIT'] > 0, 1, 0)
full_admits_labels.drop(columns=['READMIT_2'],inplace=True)
print(full_admits_labels.shape)
full_admits_labels.head()

(58976, 11)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,READMIT
224,14,17,194023,2134-12-27 07:15:00,2134-12-31 16:05:00,NaT,ELECTIVE,PATIENT FORAMEN OVALE\ PATENT FORAMEN OVALE MI...,0,1,0
225,15,17,161087,2135-05-09 14:11:00,2135-05-13 14:40:00,NaT,EMERGENCY,PERICARDIAL EFFUSION,0,1,0
229,19,21,109451,2134-09-11 12:17:00,2134-09-24 16:15:00,NaT,EMERGENCY,CONGESTIVE HEART FAILURE,0,1,0
230,20,21,111970,2135-01-30 20:50:00,2135-02-08 02:08:00,2135-02-08 02:08:00,EMERGENCY,SEPSIS,1,1,0
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,NaT,ELECTIVE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1,0


Next, create Stay Length both as a timedelta and as an integer of seconds

In [24]:
full_admits_labels.loc[:,('STAY_LENGTH_TD')] = full_admits_labels.loc[:,('DISCHTIME')] - full_admits_labels.loc[:,('ADMITTIME')]
full_admits_labels.loc[:,('STAY_LENGTH_SEC')] = full_admits_labels.loc[:,('STAY_LENGTH_TD')].dt.total_seconds()
full_admits_labels.loc[:,('STAY_LENGTH_SEC')] = np.where(full_admits_labels.loc[:,('STAY_LENGTH_SEC')] < 1, None, full_admits_labels.loc[:,('STAY_LENGTH_SEC')])
# impute negative values (98 records) with mean
full_admits_labels.loc[:,('STAY_LENGTH_SEC')].fillna(full_admits_labels.loc[:,('STAY_LENGTH_SEC')].mean(),inplace=True)
full_admits_labels.head()

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,READMIT,STAY_LENGTH_TD,STAY_LENGTH_SEC
224,14,17,194023,2134-12-27 07:15:00,2134-12-31 16:05:00,NaT,ELECTIVE,PATIENT FORAMEN OVALE\ PATENT FORAMEN OVALE MI...,0,1,0,4 days 08:50:00,377400.0
225,15,17,161087,2135-05-09 14:11:00,2135-05-13 14:40:00,NaT,EMERGENCY,PERICARDIAL EFFUSION,0,1,0,4 days 00:29:00,347340.0
229,19,21,109451,2134-09-11 12:17:00,2134-09-24 16:15:00,NaT,EMERGENCY,CONGESTIVE HEART FAILURE,0,1,0,13 days 03:58:00,1137480.0
230,20,21,111970,2135-01-30 20:50:00,2135-02-08 02:08:00,2135-02-08 02:08:00,EMERGENCY,SEPSIS,1,1,0,8 days 05:18:00,710280.0
1,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,NaT,ELECTIVE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1,0,5 days 11:55:00,474900.0


In [25]:
full_admits_labels[full_admits_labels['STAY_LENGTH_SEC'].isna()]

Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,READMIT,STAY_LENGTH_TD,STAY_LENGTH_SEC


Finally, attach diagnosis ICD-9 code from DIAGNOSES_ICD table

In [26]:
diagnoses = pd.read_csv(diagnoses_file)
print(diagnoses.shape)
print(diagnoses.dtypes)
diagnoses.head()

(651047, 5)
ROW_ID          int64
SUBJECT_ID      int64
HADM_ID         int64
SEQ_NUM       float64
ICD9_CODE      object
dtype: object


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,SEQ_NUM,ICD9_CODE
0,1297,109,172335,1.0,40301
1,1298,109,172335,2.0,486
2,1299,109,172335,3.0,58281
3,1300,109,172335,4.0,5855
4,1301,109,172335,5.0,4254


In [137]:
diagnoses['ICD9_CODE'].sort_values()

304974    0030
505370    0030
519876    0031
594655    0038
414953    0038
          ... 
333526     NaN
356738     NaN
360813     NaN
386777     NaN
549256     NaN
Name: ICD9_CODE, Length: 651047, dtype: object

In [27]:
# only select the top priority ICD diagnosis to attempt to predict
diagnoses_priority = diagnoses[diagnoses['SEQ_NUM'] == 1]
diagnoses_priority.drop(columns=['ROW_ID','SEQ_NUM'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [28]:
full_labels = full_admits_labels.merge(diagnoses_priority, on=['SUBJECT_ID','HADM_ID'])
print(full_labels.shape)
full_labels.head()

(58929, 14)


Unnamed: 0,ROW_ID,SUBJECT_ID,HADM_ID,ADMITTIME,DISCHTIME,DEATHTIME,ADMISSION_TYPE,DIAGNOSIS,HOSPITAL_EXPIRE_FLAG,HAS_CHARTEVENTS_DATA,READMIT,STAY_LENGTH_TD,STAY_LENGTH_SEC,ICD9_CODE
0,14,17,194023,2134-12-27 07:15:00,2134-12-31 16:05:00,NaT,ELECTIVE,PATIENT FORAMEN OVALE\ PATENT FORAMEN OVALE MI...,0,1,0,4 days 08:50:00,377400.0,7455
1,15,17,161087,2135-05-09 14:11:00,2135-05-13 14:40:00,NaT,EMERGENCY,PERICARDIAL EFFUSION,0,1,0,4 days 00:29:00,347340.0,4239
2,19,21,109451,2134-09-11 12:17:00,2134-09-24 16:15:00,NaT,EMERGENCY,CONGESTIVE HEART FAILURE,0,1,0,13 days 03:58:00,1137480.0,41071
3,20,21,111970,2135-01-30 20:50:00,2135-02-08 02:08:00,2135-02-08 02:08:00,EMERGENCY,SEPSIS,1,1,0,8 days 05:18:00,710280.0,388
4,22,23,152223,2153-09-03 07:15:00,2153-09-08 19:10:00,NaT,ELECTIVE,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...,0,1,0,5 days 11:55:00,474900.0,41401


In [29]:
labels_final = full_labels.loc[:,('SUBJECT_ID','HADM_ID','READMIT','STAY_LENGTH_SEC','ICD9_CODE')]
labels_final.to_pickle('./data/labels_final_df.pkl')

Finally finally, create 3 arrays of labels to feed into models

In [141]:
# readmit array
readmit_np = np.array(labels_final['READMIT'])
sparse_readmit_labels = sparse.csr_matrix(readmit_np)
readmit_labels_file = DATA_DIR + '/sparse_readmit_labels'
sparse.save_npz(readmit_labels_file, sparse_readmit_labels)

In [142]:
# stay length array
staylen_np = np.array(labels_final['STAY_LENGTH_SEC'])
sparse_staylen_labels = sparse.csr_matrix(staylen_np)
staylen_labels_file = DATA_DIR + '/sparse_staylen_labels'
sparse.save_npz(staylen_labels_file, sparse_staylen_labels)

In [None]:
# diagnoses array - CURRENTLY DOESN'T WORK B/C ICD-9 CODES NOT ALL NUMERIC!
#diagnoses_np = np.array(labels_final['ICD9_CODE'])
#sparse_diagnoses_labels = sparse.csr_matrix(diagnoses_np)
#diagnoses_labels_file = DATA_DIR + '/sparse_diagnoses_labels'
#sparse.save_npz(diagnoses_labels_file, sparse_diagnoses_labels)

In [3]:
best_note_df = pd.read_pickle('./data/best_note_df.pkl')

In [155]:
notes_labels = best_note_df.merge(labels_final, on=['SUBJECT_ID','HADM_ID'])
notes_labels.head()

Unnamed: 0,SUBJECT_ID,HADM_ID,TEXT,IS_DISCHARGE,IS_NURSING,IS_OTHER,READMIT,STAY_LENGTH_SEC,ICD9_CODE
0,22532,167853,['Admission Date: [**2151-7-16**] Disch...,1,0,0,0,16860,1193
1,13702,107527,['Admission Date: [**2118-6-2**] Discha...,1,0,0,0,71820,5191
2,13702,167118,['Admission Date: [**2119-5-4**] ...,1,0,0,0,22500,5191
3,13702,196489,"[""Admission Date: [**2124-7-21**] ...",1,0,0,0,1020,51884
4,26880,135453,['Admission Date: [**2162-3-3**] ...,1,0,0,0,70140,80506


In [156]:
readmit_np = np.array(notes_labels['READMIT'])
sparse_readmit_labels = sparse.csr_matrix(readmit_np)
readmit_labels_file = DATA_DIR + '/sparse_readmit_labels'
sparse.save_npz(readmit_labels_file, sparse_readmit_labels)

In [157]:
staylen_np = np.array(notes_labels['STAY_LENGTH_SEC'])
sparse_staylen_labels = sparse.csr_matrix(staylen_np)
staylen_labels_file = DATA_DIR + '/sparse_staylen_labels'
sparse.save_npz(staylen_labels_file, sparse_staylen_labels)

In [158]:
notes_labels.to_pickle('./data/notes_labels.pkl')

Creating label and feature files from los_read tables

In [52]:
train_roberta_x = pd.read_pickle('./data/roberta_los_read/roberta_los_read_x_train.pkl')
train_roberta_y = pd.read_pickle('./data/roberta_los_read/roberta_los_read_y_train.pkl')
val_roberta_x = pd.read_pickle('./data/roberta_los_read/roberta_los_read_x_val.pkl')
val_roberta_y = pd.read_pickle('./data/roberta_los_read/roberta_los_read_y_val.pkl')
test_roberta_x = pd.read_pickle('./data/roberta_los_read/roberta_los_read_x_test.pkl')
test_roberta_y = pd.read_pickle('./data/roberta_los_read/roberta_los_read_y_test.pkl')

In [196]:
train_roberta_x.select_dtypes('object')

Unnamed: 0,roberta,icd9_code,diagnosis
6249,"[0, 49329, 9167, 12478, 10566, 35, 1437, 646, ...",27801,MORBID OBESITY/SDA
38563,"[0, 48759, 9167, 12478, 10566, 35, 1437, 646, ...",8208,UPPER GI BLEED
36134,"[0, 49329, 9167, 12478, 10566, 35, 1437, 646, ...",51881,RESPIRATORY ARREST;TELEMETRY
53051,"[0, 48759, 10980, 4, 646, 12606, 47186, 94, 13...",9351,FOREIGN BODY ESOPHAGUS
21180,"[0, 48759, 9167, 12478, 10566, 35, 1437, 646, ...",4329,INTRACRANIAL HEMORRHAGE
...,...,...,...
34058,"[0, 49329, 9167, 12478, 10566, 35, 1437, 646, ...",51884,PNEUMONIA
46834,"[0, 49329, 9167, 12478, 10566, 35, 1437, 646, ...",99931,FEVER
17909,"[0, 48759, 9167, 12478, 10566, 35, 1437, 646, ...",V3000,NEWBORN
42301,"[0, 48759, 9167, 12478, 10566, 35, 1437, 646, ...",41401,CORONARY ARTERY DISEASE\CORONARY ARTERY BYPASS...


In [161]:
print(train_roberta_x.columns)
train_roberta_x.head()

Index(['subject_id', 'hadm_id', 'is_discharge', 'is_nursing', 'is_other',
       'roberta', 'icd9_code', 'unnamed: 0', 'tsurg', 'med',
       ...
       'ethnicity_patient declined to answer', 'ethnicity_portuguese',
       'ethnicity_south american', 'ethnicity_unable to obtain',
       'ethnicity_unknown/not specified', 'ethnicity_white',
       'ethnicity_white - brazilian', 'ethnicity_white - eastern european',
       'ethnicity_white - other european', 'ethnicity_white - russian'],
      dtype='object', length=258)


Unnamed: 0,subject_id,hadm_id,is_discharge,is_nursing,is_other,roberta,icd9_code,unnamed: 0,tsurg,med,...,ethnicity_patient declined to answer,ethnicity_portuguese,ethnicity_south american,ethnicity_unable to obtain,ethnicity_unknown/not specified,ethnicity_white,ethnicity_white - brazilian,ethnicity_white - eastern european,ethnicity_white - other european,ethnicity_white - russian
6249,8477,154902,1,0,0,"[0, 49329, 9167, 12478, 10566, 35, 1437, 646, ...",27801,10135,0,0,...,0,0,0,0,0,1,0,0,0,0
38563,81212,130156,1,0,0,"[0, 48759, 9167, 12478, 10566, 35, 1437, 646, ...",8208,51868,0,1,...,0,0,0,0,0,1,0,0,0,0
36134,94049,183737,1,0,0,"[0, 49329, 9167, 12478, 10566, 35, 1437, 646, ...",51881,55875,0,1,...,0,0,0,0,0,1,0,0,0,0
53051,49434,146834,0,1,0,"[0, 48759, 10980, 4, 646, 12606, 47186, 94, 13...",9351,41817,0,1,...,0,0,0,1,0,0,0,0,0,0
21180,16590,151831,1,0,0,"[0, 48759, 9167, 12478, 10566, 35, 1437, 646, ...",4329,19796,0,0,...,0,0,0,0,0,1,0,0,0,0


In [199]:
train_roberta_features_df = train_roberta_x.drop(columns=['unnamed: 0','subject_id','hadm_id','diagnosis','icd9_code'])
train_roberta_features = np.array(train_roberta_features_df['roberta'].values.tolist())
print(train_roberta_features.shape)
train_roberta_features

(34416, 512)


array([[    0, 49329,  9167, ...,   398, 12606,     2],
       [    0, 48759,  9167, ..., 10831,  3023,     2],
       [    0, 49329,  9167, ...,     4,  8432,     2],
       ...,
       [    0, 48759,  9167, ...,  1437,   590,     2],
       [    0, 48759,  9167, ..., 15671,  1638,     2],
       [    0, 48759,  9167, ...,   396,   143,     2]])

In [201]:
df_array = train_roberta_features_df.drop(columns=['roberta']).values
print(df_array.shape)
df_array

(34416, 252)


array([[1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.]])

In [173]:
train_roberta_features_full = np.concatenate((train_roberta_features,df_array),axis=1)
train_roberta_features_full.shape

(34416, 766)

In [186]:
train_roberta_features_full

array([[0, 49329, 9167, ..., 0, 0, 0],
       [0, 48759, 9167, ..., 0, 0, 0],
       [0, 49329, 9167, ..., 0, 0, 0],
       ...,
       [0, 48759, 9167, ..., 0, 0, 0],
       [0, 48759, 9167, ..., 0, 0, 0],
       [0, 48759, 9167, ..., 0, 0, 0]], dtype=object)

In [211]:
def create_npz_features(df, bert_type, file_name):
    """
    Create sparse npz feature arrays from pandas dataframes
    df (DataFrame): Pandas DataFrame with features
    """
    clean_df = df.drop(columns=['unnamed: 0','subject_id','hadm_id','diagnosis','icd9_code'])
    bert = np.array(clean_df[bert_type].values.tolist())
    df_array = clean_df.drop(columns=[bert_type]).values
    full_features = np.concatenate((bert,df_array),axis=1)
    sparse_features = sparse.csr_matrix(full_features)
    features_file = DATA_DIR + file_name
    #print(features_file)
    sparse.save_npz(features_file, sparse_features)
    return full_features


In [213]:
train_roberta_features = create_npz_features(train_roberta_x, 'roberta','/train_roberta_features')
print(train_roberta_features.shape)

./data/train_roberta_features
(34416, 764)


In [215]:
dev_roberta_features = create_npz_features(val_roberta_x, 'roberta','/dev_roberta_features')
print(dev_roberta_features.shape)

./data/dev_roberta_features
(11473, 764)


In [216]:
test_roberta_features = create_npz_features(test_roberta_x, 'roberta','/test_roberta_features')
print(test_roberta_features.shape)

./data/test_roberta_features
(11473, 764)


In [162]:
train_roberta_y.head()

Unnamed: 0,readmit,stay_length_sec
6249,0,10620
38563,0,39000
36134,0,3180
53051,0,50460
21180,0,80040


In [56]:
def create_npz_features_nobert(df, bert_type, file_name):
    """
    Create sparse npz feature arrays from pandas dataframes
    df (DataFrame): Pandas DataFrame with features
    """
    clean_df = df.drop(columns=['unnamed: 0','subject_id','hadm_id','diagnosis','icd9_code',bert_type]).values
    sparse_features = sparse.csr_matrix(clean_df)
    features_file = DATA_DIR + file_name
    #print(features_file)
    sparse.save_npz(features_file, sparse_features)
    return clean_df

In [58]:
train_nobert_features = create_npz_features_nobert(train_roberta_x, 'roberta','/train_nobert_features')
dev_nobert_features = create_npz_features_nobert(val_roberta_x, 'roberta','/dev_nobert_features')
test_nobert_features = create_npz_features_nobert(test_roberta_x, 'roberta','/test_nobert_features')
test_nobert_features[0:5,0:5]

array([[1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 1.],
       [1., 0., 0., 0., 0.]])

In [59]:
test_nobert_features.shape

(11473, 252)

In [43]:
def create_npz_labels(df, label, file_name):
    """
    Create sparse npz label arrays from pandas dataframes
    df (DataFrame): Pandas DataFrame with labels
    """
    label = np.array(df[label].values.tolist())

    sparse_label = sparse.csr_matrix(label)
    label_file = DATA_DIR + file_name
    #print(features_file)
    sparse.save_npz(label_file, sparse_label)
    return label

In [218]:
train_readmit_labels = create_npz_labels(train_roberta_y, 'readmit','/train_readmit_labels')
train_readmit_labels.shape

(34416,)

In [219]:
dev_readmit_labels = create_npz_labels(val_roberta_y, 'readmit','/dev_readmit_labels')
dev_readmit_labels.shape

(11473,)

In [220]:
test_readmit_labels = create_npz_labels(test_roberta_y, 'readmit','/test_readmit_labels')
test_readmit_labels.shape

(11473,)

Create BioClinicalBERT feature and label files

In [4]:
train_bc_512_x = pd.read_pickle('./data/bc_512_los_read/bc_512_los_read_x_train.pkl')
train_bc_512_y = pd.read_pickle('./data/bc_512_los_read/bc_512_los_read_y_train.pkl')
val_bc_512_x = pd.read_pickle('./data/bc_512_los_read/bc_512_los_read_x_val.pkl')
val_bc_512_y = pd.read_pickle('./data/bc_512_los_read/bc_512_los_read_y_val.pkl')
test_bc_512_x = pd.read_pickle('./data/bc_512_los_read/bc_512_los_read_x_test.pkl')
test_bc_512_y = pd.read_pickle('./data/bc_512_los_read/bc_512_los_read_y_test.pkl')

In [225]:
train_bc_512_features = create_npz_features(train_bc_512_x, 'bc_512','/train_bc_512_features')
dev_bc_512_features = create_npz_features(val_bc_512_x, 'bc_512','/dev_bc_512_features')
test_bc_512_features = create_npz_features(test_bc_512_x, 'bc_512','/test_bc_512_features')


./data/train_bc_512_features
./data/dev_bc_512_features
./data/test_bc_512_features


In [33]:
print(train_bc_512_y.shape)
train_bc_512_y.head()

(34416, 2)


Unnamed: 0,readmit,stay_length_sec
6249,0,10620
38563,0,39000
36134,0,3180
53051,0,50460
21180,0,80040


In [36]:
# fix stay_length labels I messed up
train_hadmids = train_bc_512_x.loc[:,('hadm_id','subject_id')].reset_index()
train_hadmids = train_hadmids.merge(labels_final, left_on=['hadm_id','subject_id'],\
    right_on=['HADM_ID','SUBJECT_ID'])
print(train_hadmids.shape)
train_hadmids.head()

(34416, 8)


Unnamed: 0,index,hadm_id,subject_id,SUBJECT_ID,HADM_ID,READMIT,STAY_LENGTH_SEC,ICD9_CODE
0,6249,154902,8477,8477,154902,0,529020.0,27801
1,38563,130156,81212,81212,130156,0,903000.0,8208
2,36134,183737,94049,94049,183737,0,175980.0,51881
3,53051,146834,49434,49434,146834,0,50460.0,9351
4,21180,151831,16590,16590,151831,0,425640.0,4329


In [37]:
train_labels = train_hadmids.merge(train_bc_512_y.reset_index(), on=['index'])
print(train_labels.shape)
train_labels.head()

(34416, 10)


Unnamed: 0,index,hadm_id,subject_id,SUBJECT_ID,HADM_ID,READMIT,STAY_LENGTH_SEC,ICD9_CODE,readmit,stay_length_sec
0,6249,154902,8477,8477,154902,0,529020.0,27801,0,10620
1,38563,130156,81212,81212,130156,0,903000.0,8208,0,39000
2,36134,183737,94049,94049,183737,0,175980.0,51881,0,3180
3,53051,146834,49434,49434,146834,0,50460.0,9351,0,50460
4,21180,151831,16590,16590,151831,0,425640.0,4329,0,80040


In [38]:
# fix stay_length labels I messed up
val_hadmids = val_bc_512_x.loc[:,('hadm_id','subject_id')].reset_index()
val_hadmids = val_hadmids.merge(labels_final, left_on=['hadm_id','subject_id'],\
    right_on=['HADM_ID','SUBJECT_ID'])
print(val_hadmids.shape)
val_labels = val_hadmids.merge(val_bc_512_y.reset_index(), on=['index'])
print(val_labels.shape)
val_labels.head()

(11473, 8)
(11473, 10)


Unnamed: 0,index,hadm_id,subject_id,SUBJECT_ID,HADM_ID,READMIT,STAY_LENGTH_SEC,ICD9_CODE,readmit,stay_length_sec
0,23607,160871,11435,11435,160871,0,964020.0,V3000,0,13620
1,44689,157653,29566,29566,157653,0,184920.0,5070,0,12120
2,2245,151669,14755,14755,151669,0,1202160.0,51881,0,78960
3,42100,139922,14985,14985,139922,0,148200.0,25013,0,61800
4,27860,114716,16921,16921,114716,0,2321040.0,80116,0,74640


In [39]:
# fix stay_length labels I messed up
test_hadmids = test_bc_512_x.loc[:,('hadm_id','subject_id')].reset_index()
test_hadmids = test_hadmids.merge(labels_final, left_on=['hadm_id','subject_id'],\
    right_on=['HADM_ID','SUBJECT_ID'])
print(test_hadmids.shape)
test_labels = test_hadmids.merge(test_bc_512_y.reset_index(), on=['index'])
print(test_labels.shape)
test_labels.head()

(11473, 8)
(11473, 10)


Unnamed: 0,index,hadm_id,subject_id,SUBJECT_ID,HADM_ID,READMIT,STAY_LENGTH_SEC,ICD9_CODE,readmit,stay_length_sec
0,19692,108195,28114,28114,108195,0,80100.0,9352,0,80100
1,4046,169761,82512,82512,169761,0,965340.0,4241,0,14940
2,39366,199046,90414,90414,199046,0,1102200.0,99666,0,65400
3,7578,107242,30575,30575,107242,1,439500.0,51881,1,7500
4,41771,181637,14080,14080,181637,0,685920.0,85226,0,81120


In [40]:
def lower_col_names(df):
    '''
    lowercase the column names of a pd df
    Input: pd dataframe
    Output: lower - a pd dataframe with lowercase columns 
    '''
    df.columns= df.columns.str.strip().str.lower()
    
    return df

In [41]:
train_labels_fin = lower_col_names(train_labels.loc[:,('subject_id','hadm_id','READMIT','STAY_LENGTH_SEC')])
val_labels_fin = lower_col_names(val_labels.loc[:,('subject_id','hadm_id','READMIT','STAY_LENGTH_SEC')])
test_labels_fin = lower_col_names(test_labels.loc[:,('subject_id','hadm_id','READMIT','STAY_LENGTH_SEC')])

In [46]:
test_labels_fin.head()

Unnamed: 0,subject_id,hadm_id,readmit,stay_length_sec
0,28114,108195,0,80100.0
1,82512,169761,0,965340.0
2,90414,199046,0,1102200.0
3,30575,107242,1,439500.0
4,14080,181637,0,685920.0


In [44]:
train_stay_len_labels = create_npz_labels(train_labels_fin, 'stay_length_sec','/train_stay_len_labels')
dev_stay_len_labels = create_npz_labels(val_labels_fin, 'stay_length_sec','/dev_stay_len_labels')
test_stay_len_labels = create_npz_labels(test_labels_fin, 'stay_length_sec','/test_stay_len_labels')

In [48]:
train_labels_fin.to_pickle('./data/train_labels.pkl')
val_labels_fin.to_pickle('./data/val_labels.pkl')
test_labels_fin.to_pickle('./data/test_labels.pkl')

In [233]:
# confirm same labels in all y tables
train_bc_512_y.head()

Unnamed: 0,readmit,stay_length_sec
6249,0,10620
38563,0,39000
36134,0,3180
53051,0,50460
21180,0,80040


In [232]:
train_roberta_y.head()

Unnamed: 0,readmit,stay_length_sec
6249,0,10620
38563,0,39000
36134,0,3180
53051,0,50460
21180,0,80040


In [234]:
train_roberta_y['stay_length_sec'].mean()

45351.504532775456

Try to join full BioClinicalBERT records

In [5]:
sparse_bc = sparse.load_npz('./data/sparse_bc.npz').todense()
print('features shape:',sparse_bc.shape)
#labels = np.load(labels_filepath, allow_pickle=True)["arr_0"]
#labels = np.asarray(sparse.load_npz(labels_filepath).todense()).ravel()

features shape: (58361, 23597)


In [6]:
sparse_bc_df = pd.DataFrame(sparse_bc)
#sparse_bc_df['idx'] = sparse_bc_df.index
sparse_bc_df.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,23587,23588,23589,23590,23591,23592,23593,23594,23595,23596
0,101,164,112,10296,2236,131,164,115,115,18615,...,0,0,0,0,0,0,0,0,0,0
1,101,164,112,10296,2236,131,164,115,115,20915,...,0,0,0,0,0,0,0,0,0,0
2,101,164,112,10296,2236,131,164,115,115,20915,...,0,0,0,0,0,0,0,0,0,0
3,101,164,107,10296,2236,131,164,115,115,19538,...,0,0,0,0,0,0,0,0,0,0
4,101,164,112,10296,2236,131,164,115,115,22148,...,0,0,0,0,0,0,0,0,0,0


In [7]:
best_note_df['idx'] = best_note_df.index
notes_hadmid = best_note_df.loc[:,('HADM_ID', 'idx')]
notes_hadmid.head()

Unnamed: 0,HADM_ID,idx
0,167853,0
1,107527,1
2,167118,2
3,196489,3
4,135453,4


In [8]:
notes_hadmid_train = train_bc_512_x.merge(notes_hadmid, left_on=['hadm_id'], right_on=['HADM_ID'])
print(notes_hadmid_train.shape)
notes_hadmid_train.head(1)

(34416, 260)


Unnamed: 0,subject_id,hadm_id,is_discharge,is_nursing,is_other,bc_512,icd9_code,unnamed: 0,tsurg,med,...,ethnicity_south american,ethnicity_unable to obtain,ethnicity_unknown/not specified,ethnicity_white,ethnicity_white - brazilian,ethnicity_white - eastern european,ethnicity_white - other european,ethnicity_white - russian,HADM_ID,idx
0,8477,154902,1,0,0,"[101, 164, 107, 10296, 2236, 131, 164, 115, 11...",27801,10135,0,0,...,0,0,0,1,0,0,0,0,154902,6377


In [9]:
train_idx = notes_hadmid_train['idx']
train_idx

0         6377
1        39183
2        36737
3        53932
4        21602
         ...  
34411    34608
34412    47559
34413    18280
34414    42962
34415    21567
Name: idx, Length: 34416, dtype: int64

In [10]:
train_bc_full = sparse_bc_df[sparse_bc_df.index.isin(train_idx)]

In [11]:
train_bc_full.shape

(34416, 23597)

In [12]:
train_df_struc = notes_hadmid_train.set_index('idx')
train_df_struc = train_df_struc.drop(columns=['unnamed: 0','subject_id','hadm_id','diagnosis','icd9_code','bc_512','unnamed: 5','HADM_ID'])
train_df_struc.head()

Unnamed: 0_level_0,is_discharge,is_nursing,is_other,tsurg,med,cmed,traum,nmed,vsurg,csurg,...,ethnicity_patient declined to answer,ethnicity_portuguese,ethnicity_south american,ethnicity_unable to obtain,ethnicity_unknown/not specified,ethnicity_white,ethnicity_white - brazilian,ethnicity_white - eastern european,ethnicity_white - other european,ethnicity_white - russian
idx,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6377,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
39183,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
36737,1,0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
53932,0,1,0,0,1,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
21602,1,0,0,0,0,0,0,1,0,0,...,0,0,0,0,0,1,0,0,0,0


In [13]:
train_bc_full_half = train_bc_full.iloc[:18000,:]
train_bc_full_half.shape

(18000, 23597)

In [14]:
train_bc_features_half_df = train_bc_full_half.join(train_df_struc)

In [14]:
train_bc_features_df = train_bc_full.join(train_df_struc)

In [12]:
train_bc_full_np = train_bc_full.values

In [13]:
train_bc_full_np.shape

(34416, 23597)

In [15]:
train_df_np = train_bc_512_x.drop(columns=['unnamed: 0','subject_id','hadm_id','diagnosis','icd9_code','bc_512']).values
train_df_np.shape

(34416, 252)

In [16]:
train_bc_features = np.concatenate((train_bc_full_np,train_df_np),axis=1)

In [13]:
def create_bc_full_npz(df, bert_df, bert_type, file_name):
    df_array = df.drop(columns=['unnamed: 0','subject_id','hadm_id','diagnosis','icd9_code',bert_type]).values
    bert = bert_df.values
    print(bert.shape)
    full_features = np.concatenate((bert,df_array),axis=1)
    sparse_features = sparse.csr_matrix(full_features)
    features_file = DATA_DIR + file_name
    #print(features_file)
    sparse.save_npz(features_file, sparse_features)
    return full_features

In [14]:
train_bc_features = create_bc_full_npz(train_bc_512_x, train_bc_full, 'bc_512', '/train_bc_features')
train_bc_features.shape

(34416, 23597)


In [None]:
sparse_tojoin = sparse_bc_df.loc[:,('idx','')]

In [1]:
best_note_idx = best_note_df.reset_index()

NameError: name 'best_note_df' is not defined

In [None]:
bioclinic_full = best_note_df.join(sparse_bc_df)