In [161]:
import psycopg2
from datetime import timedelta
from sqlalchemy import create_engine
import psycopg2
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score,f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sktime.transformations.panel.rocket import Rocket
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

In [2]:
import psycopg2

conn = psycopg2.connect(
    host="localhost",
    database="mimic",
    user="postgres",
    password="postgres"
)

cur = conn.cursor()

cur.execute("SELECT version();")
print(cur.fetchone())

('PostgreSQL 15.2, compiled by Visual C++ build 1914, 64-bit',)


In [3]:
# Connect to db
conn = psycopg2.connect(host='localhost', dbname='mimic', user='postgres', password='postgres', options='-c search_path=mimiciii')
#conn = psycopg2.connect(dbname='mimic', user='postgres')
cur = conn.cursor() 

# Read in table with patients & admissions (inner join on subject_id) and icu_stays (inner joinon subject_id and hadm_id)
icustay_details = pd.read_sql_query("SELECT * FROM mimiciii.flicu_icustay_detail;", conn)

# Read in vital and lab signs
pivoted_vital = pd.read_sql_query("SELECT * FROM mimiciii.pivoted_vital;", conn)
pivoted_lab = pd.read_sql_query("SELECT * FROM mimiciii.ckd_pivoted_lab;", conn)

# Read in lab measurements
# Use flicu_pivoted_lab (as it only takes the lab tests during ICU stay)
#query = "SELECT * FROM mimiciii.flicu_pivoted_lab;"
# Alternative:  Use the lab values recorded previous to the ICU stay (although during same hospital admission!), 
# then sample them  (8h intervalls) and then forward fill plus cap at either icu admission time of first vital sign recorded
#query = "SELECT * FROM mimiciii.pivoted_lab;"
#pivoted_lab = pd.read_sql_query(query, conn)

# Close the cursor and connection to so the server can allocate bandwidth to other requests
cur.close()
conn.close()



In [4]:
pivoted_vital['pedaledema'].unique()

array([nan,  3.,  2.])

In [5]:
pivoted_vital.shape

(9207039, 15)

In [6]:
pivoted_vital.columns

Index(['icustay_id', 'charttime', 'heartrate', 'sysbp', 'diasbp', 'meanbp',
       'resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity',
       'pedaledema', 'appetite_median', 'ckd'],
      dtype='object')

In [7]:
pivoted_lab.columns

Index(['icustay_id', 'subject_id', 'charttime', 'aniongap', 'albumin', 'bands',
       'bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose',
       'hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt',
       'inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria', 'ckd'],
      dtype='object')

In [8]:
icustay_details.describe()

Unnamed: 0,subject_id,hadm_id,icustay_id,los_hospital,admission_age,hospital_expire_flag,hospstay_seq,los_icu,icustay_seq,label_death_icu,label_cor_art,diabetes_mellitus,ckd,anemia_flag
count,61051.0,61051.0,61051.0,61051.0,61051.0,61051.0,61051.0,61041.0,61051.0,61051.0,61051.0,61051.0,61051.0,61051.0
mean,33961.698989,149946.928945,249968.598696,11.320283,64.856674,0.107975,1.418568,4.931644,1.070908,0.073774,0.212838,0.170693,0.082849,0.125682
std,28153.637888,28899.070114,28891.923533,14.301661,56.970061,0.310352,1.510997,9.664428,0.301838,0.261406,0.409318,0.376244,0.275656,0.331493
min,2.0,100001.0,200001.0,-0.945139,7e-06,0.0,1.0,0.000139,1.0,0.0,0.0,0.0,0.0,0.0
25%,12085.5,124949.0,224951.0,3.910069,44.281191,0.0,1.0,1.109491,1.0,0.0,0.0,0.0,0.0,0.0
50%,24352.0,149883.0,249949.0,6.945833,62.054949,0.0,1.0,2.094815,1.0,0.0,0.0,0.0,0.0,0.0
75%,54366.0,174997.5,274974.5,13.059722,76.068514,0.0,1.0,4.502199,1.0,0.0,0.0,0.0,0.0,0.0
max,99999.0,199999.0,299999.0,294.660417,311.561027,1.0,41.0,173.072512,7.0,1.0,1.0,1.0,1.0,1.0


#### Setting window length 

In [9]:
WINDOW_LENGTH = 24*4

### Keeping records that are atleast window length

In [10]:
data= icustay_details.copy()
data = data[data.los_icu >= WINDOW_LENGTH/24.0]

In [11]:
filtered_icustay_ids = pd.DataFrame(data['icustay_id'].unique(), columns=['icustay_id'])

In [12]:
# Drop measurements with no belonging icustay_id
pivoted_vital = pivoted_vital.dropna(subset=['icustay_id'])
pivoted_lab = pivoted_lab.dropna(subset=['icustay_id'])

#check the shape 
print(pivoted_vital.shape)

# Cast icustay_id types to int
pivoted_vital['icustay_id'] = pivoted_vital['icustay_id'].astype(int)
pivoted_lab['icustay_id'] = pivoted_lab['icustay_id'].astype(int)

# Keep only values of patients in previously filtered icustay_ids in labs and vitals
pivoted_vital = pivoted_vital.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
pivoted_lab = pivoted_lab.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()

(9207039, 15)


In [13]:
print(pivoted_vital.shape)

(6724403, 15)


In [14]:
# Min of each lab and vitals
icustay_ids_charttime_min_lab = pivoted_lab[["icustay_id", "charttime"]][pivoted_lab.groupby("icustay_id")["charttime"].rank(ascending=1,method='dense') == 1]
icustay_ids_charttime_min_vital = pivoted_vital[["icustay_id", "charttime"]][pivoted_vital.groupby("icustay_id")["charttime"].rank(ascending=1,method='dense') == 1]
# Min of both combined
icustay_ids_charttime_min_vital_lab = pd.concat([icustay_ids_charttime_min_lab, icustay_ids_charttime_min_vital], ignore_index=True)
icustay_ids_charttime_min_vital_lab = icustay_ids_charttime_min_vital_lab[["icustay_id", "charttime"]][icustay_ids_charttime_min_vital_lab.groupby("icustay_id")["charttime"].rank(ascending=1,method='dense') == 1]

# Max of each lab and vitals
icustay_ids_charttime_max_lab = pivoted_lab[["icustay_id", "charttime"]][pivoted_lab.groupby("icustay_id")["charttime"].rank(ascending=0,method='dense') == 1]
icustay_ids_charttime_max_vital = pivoted_vital[["icustay_id", "charttime"]][pivoted_vital.groupby("icustay_id")["charttime"].rank(ascending=0,method='dense') == 1]
# Max of both combined
icustay_ids_charttime_max_vital_lab = pd.concat([icustay_ids_charttime_max_lab, icustay_ids_charttime_max_vital], ignore_index=True)
icustay_ids_charttime_max_vital_lab = icustay_ids_charttime_max_vital_lab[["icustay_id", "charttime"]][icustay_ids_charttime_max_vital_lab.groupby("icustay_id")["charttime"].rank(ascending=0,method='dense') == 1]


In [15]:
# Find for which icustay_ids there exist at least WINDOW_LENGTH of data
icustay_ids_vital_lab_charttime_min_max = pd.concat([icustay_ids_charttime_max_vital_lab, icustay_ids_charttime_min_vital_lab], ignore_index=True)
time_window = timedelta(days=4, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=WINDOW_LENGTH, weeks=0)
is_time_diff_bigger_window_lab = icustay_ids_vital_lab_charttime_min_max.groupby(['icustay_id'])['charttime'].transform(lambda x: (x.max()-x.min())) >= time_window

icustay_ids_vital_lab_charttime_min_max_filtered = icustay_ids_vital_lab_charttime_min_max[is_time_diff_bigger_window_lab]
print("Unique icu stays in icustay_ids_vital_lab_charttime_min_max_filtered after filtering", icustay_ids_vital_lab_charttime_min_max_filtered['icustay_id'].nunique())

# Keep only icustay ids for which at least WINDOW_LENGTH of data exists
icustay_ids_time_filtered = pd.DataFrame(icustay_ids_vital_lab_charttime_min_max_filtered['icustay_id'].unique(), columns=['icustay_id'])
print("Unique icu stays in icustay_ids_time_filtered: ", icustay_ids_time_filtered['icustay_id'].nunique())

Unique icu stays in icustay_ids_vital_lab_charttime_min_max_filtered after filtering 8409
Unique icu stays in icustay_ids_time_filtered:  8409


In [16]:
filtered_icustay_ids = filtered_icustay_ids.merge(icustay_ids_time_filtered, on='icustay_id', how='inner').drop_duplicates()

In [17]:
demographics_filtered = data.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print("Number of ICU stays demographics: ", demographics_filtered['icustay_id'].nunique())

vital_filtered = pivoted_vital.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print("Number of ICU stays vitals: ", vital_filtered['icustay_id'].nunique())

lab_filtered = pivoted_lab.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print("Number of ICU stays labs: ", lab_filtered['icustay_id'].nunique())

Number of ICU stays demographics:  8409
Number of ICU stays vitals:  8409
Number of ICU stays labs:  8409


In [18]:
demographics_filtered.columns

Index(['subject_id', 'hadm_id', 'icustay_id', 'gender', 'dod', 'admittime',
       'dischtime', 'los_hospital', 'admission_age', 'ethnicity',
       'ethnicity_grouped', 'hospital_expire_flag', 'hospstay_seq',
       'first_hosp_stay', 'intime', 'outtime', 'los_icu', 'icustay_seq',
       'first_icu_stay_current_hosp', 'first_icu_stay_patient',
       'first_careunit', 'deathtime_icu', 'label_death_icu', 'label_cor_art',
       'diabetes_mellitus', 'ckd', 'anemia_flag'],
      dtype='object')

In [19]:
lab_filtered.columns

Index(['icustay_id', 'subject_id', 'charttime', 'aniongap', 'albumin', 'bands',
       'bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose',
       'hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt',
       'inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria', 'ckd'],
      dtype='object')

In [20]:
lab_filtered['icustay_id'].unique()

array([218958, 241427, 272085, ..., 226042, 270667, 224889], dtype=int64)

In [21]:
vital_filtered = vital_filtered.merge(lab_filtered[['icustay_id', 'charttime']], on=['icustay_id', 'charttime'], how='outer').drop_duplicates()
print("Number of ICU stays in lab_filtered: ", vital_filtered['icustay_id'].nunique())
lab_filtered = lab_filtered.merge(vital_filtered[['icustay_id', 'charttime']], on=['icustay_id', 'charttime'], how='outer').drop_duplicates()
print("Number of ICU stays in lab_filtered: ", lab_filtered['icustay_id'].nunique())

Number of ICU stays in lab_filtered:  8409
Number of ICU stays in lab_filtered:  8409


In [22]:
vital_resampled = vital_filtered.copy()

# Resample from the end of the time series (how="last")
vital_resampled = vital_resampled.assign(charttime=vital_resampled.charttime.dt.round('H'))
#vital_resampled = vital_resampled.set_index('charttime').groupby('icustay_id').resample('1H', origin="end").median().drop(['icustay_id'], axis = 1).reset_index()
# Resample from the beginning of the time series
vital_resampled = vital_resampled.set_index('charttime').groupby('icustay_id').resample('1H', origin="start").median().drop(['icustay_id'], axis = 1).reset_index()

# Forward and backwards fill (use lambda function instead of directly applying it to groupby otherwise results from one group are carreid forward to another group...BAD)
# Fill NaNs (-1)
vital_col = vital_resampled.columns.drop(['icustay_id', 'charttime'])
vital_resampled = vital_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[vital_col].transform(lambda x: x.ffill().bfill()).fillna(value=vital_resampled[['icustay_id', 'charttime', 'heartrate', 'sysbp', 'diasbp', 'meanbp','resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity','pedaledema', 'appetite_median']].median()).reset_index()
#.fillna(value=vital_resampled[vital_columns].mean())#.fillna(0)#.fillna(-1)


  vital_resampled = vital_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[vital_col].transform(lambda x: x.ffill().bfill()).fillna(value=vital_resampled[['icustay_id', 'charttime', 'heartrate', 'sysbp', 'diasbp', 'meanbp','resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity','pedaledema', 'appetite_median']].median()).reset_index()


In [23]:
lab_resampled = lab_filtered.copy()
# Cut out minutes and hours, so that the resampling of the 8h takes the same time span as the 1h samples (for vitals)
lab_resampled = lab_resampled.assign(charttime=lab_resampled.charttime.dt.round('H'))
# Resample from the end of the time series 
#lab_resampled = lab_resampled.set_index('charttime').groupby('icustay_id').resample('8h', origin="end").median().drop(['icustay_id'], axis = 1).reset_index()
lab_resampled = lab_resampled.set_index('charttime').groupby('icustay_id').resample('8h', origin="start").median().drop(['icustay_id'], axis = 1).reset_index()

# Forward and backwards fill (use transform instead of direct groupby otherwise results from one group are carreid forward to another group...BAD)
# Fill NaNs (-1 or 0 or mean!?)
lab_col = lab_resampled.columns.drop(['icustay_id', 'charttime'])
lab_resampled = lab_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[lab_col].transform(lambda x: x.ffill().bfill()).fillna(value=lab_resampled[['icustay_id', 'subject_id', 'charttime', 'aniongap', 'albumin', 'bands','bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose','hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt','inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria']].median()).reset_index()

print(lab_resampled.isnull().sum().sum())

  lab_resampled = lab_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[lab_col].transform(lambda x: x.ffill().bfill()).fillna(value=lab_resampled[['icustay_id', 'subject_id', 'charttime', 'aniongap', 'albumin', 'bands','bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose','hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt','inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria']].median()).reset_index()


730


### keep only uptil 4 days data

In [24]:
delta_t_data = timedelta(days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=WINDOW_LENGTH, weeks=0)
demographics_windowed = demographics_filtered.copy()
demographics_windowed['predtime'] = demographics_windowed.intime + delta_t_data
demographics_windowed['delta_t_pred'] = demographics_windowed.outtime - demographics_windowed.predtime

demographics_windowed[['subject_id', 'icustay_id', 'intime', 'predtime', 'delta_t_pred']].head(5)

Unnamed: 0,subject_id,icustay_id,intime,predtime,delta_t_pred
0,14137,218958,2132-08-24 17:07:00,2132-08-28 17:07:00,10 days 00:34:00
1,28970,241427,2151-11-07 01:49:18,2151-11-11 01:49:18,27 days 17:57:20
2,2269,272085,2187-12-04 11:35:04,2187-12-08 11:35:04,12 days 02:08:02
3,334,214236,2136-01-16 10:56:48,2136-01-20 10:56:48,10 days 07:21:18
4,2005,285731,2163-06-23 11:28:06,2163-06-27 11:28:06,5 days 08:45:56


In [25]:
cut_icustay_ids = pd.DataFrame(demographics_windowed['icustay_id'].unique(), columns=['icustay_id'])
print("Number of ICU stays: ", cut_icustay_ids['icustay_id'].count())

vitals_cut = vital_resampled.merge(cut_icustay_ids, on='icustay_id', how='right')
print("Number of ICU stays in vitals_cut: ", vitals_cut['icustay_id'].nunique())

labs_cut = lab_resampled.merge(cut_icustay_ids, on='icustay_id', how='right')
print("Number of ICU stays in labs_cut: ", labs_cut['icustay_id'].nunique())


Number of ICU stays:  8409
Number of ICU stays in vitals_cut:  8409
Number of ICU stays in labs_cut:  8409


In [26]:
print(delta_t_data)

4 days, 0:00:00


In [27]:
vitals_windowed = vital_resampled.merge(demographics_windowed[['icustay_id', 'predtime', 'delta_t_pred']], on='icustay_id', how='right')
vitals_windowed = vitals_windowed[vitals_windowed.charttime < vitals_windowed.predtime]
print("Number of ICU stays in vitals_windowed: ", vitals_windowed['icustay_id'].nunique())

labs_windowed = lab_resampled.merge(demographics_windowed[['icustay_id', 'predtime', 'delta_t_pred']], on='icustay_id', how='right')
labs_windowed = labs_windowed[labs_windowed.charttime < labs_windowed.predtime]
print("Number of ICU stays in labs_windowed: ", labs_windowed['icustay_id'].nunique())

windowed_icustay_ids = pd.DataFrame(pd.concat([vitals_windowed['icustay_id'], labs_windowed['icustay_id']]).unique(), columns=['icustay_id'])
demographics_windowed = demographics_windowed.merge(windowed_icustay_ids, on='icustay_id', how='right')

Number of ICU stays in vitals_windowed:  8405
Number of ICU stays in labs_windowed:  8405


In [28]:
labs_windowed.isna().sum()

icustay_id        0
charttime         0
subject_id        0
aniongap          0
albumin           0
bands             0
bicarbonate       0
bilirubin         0
creatinine        0
chloride          0
glucose           0
hematocrit        0
hemoglobin        0
lactate           0
platelet          0
potassium         0
ptt               0
inr               0
pt                0
sodium            0
bun               0
wbc               0
bacteria          0
ckd             162
predtime          0
delta_t_pred      0
dtype: int64

In [29]:
vitals_windowed.isna().sum()

icustay_id              0
charttime               0
heartrate               0
sysbp                   0
diasbp                  0
meanbp                  0
resprate                0
tempc                   0
spo2                    0
glucose                 0
rbc                     0
specificgravity         0
pedaledema         812438
appetite_median         0
ckd                  1481
predtime                0
delta_t_pred            0
dtype: int64

#### using icustay_id from demographics to fill missing ckd in vitals and labs

In [30]:
vitals_windowed['ckd'] = vitals_windowed['icustay_id'].map(demographics_windowed.set_index('icustay_id')['ckd'])

In [31]:
vitals_windowed.isna().sum()

icustay_id              0
charttime               0
heartrate               0
sysbp                   0
diasbp                  0
meanbp                  0
resprate                0
tempc                   0
spo2                    0
glucose                 0
rbc                     0
specificgravity         0
pedaledema         812438
appetite_median         0
ckd                     0
predtime                0
delta_t_pred            0
dtype: int64

In [32]:
labs_windowed['ckd'] = labs_windowed['icustay_id'].map(demographics_windowed.set_index('icustay_id')['ckd'])

In [33]:
labs_windowed.isna().sum()

icustay_id      0
charttime       0
subject_id      0
aniongap        0
albumin         0
bands           0
bicarbonate     0
bilirubin       0
creatinine      0
chloride        0
glucose         0
hematocrit      0
hemoglobin      0
lactate         0
platelet        0
potassium       0
ptt             0
inr             0
pt              0
sodium          0
bun             0
wbc             0
bacteria        0
ckd             0
predtime        0
delta_t_pred    0
dtype: int64

#### Some patients might not have any value for pedaledema and hence we are filling those with -1

In [34]:
demographics_windowed.isna().sum()

subject_id                        0
hadm_id                           0
icustay_id                        0
gender                            0
dod                            4663
admittime                         0
dischtime                         0
los_hospital                      0
admission_age                     0
ethnicity                         0
ethnicity_grouped                 0
hospital_expire_flag              0
hospstay_seq                      0
first_hosp_stay                   0
intime                            0
outtime                           0
los_icu                           0
icustay_seq                       0
first_icu_stay_current_hosp       0
first_icu_stay_patient            0
first_careunit                    0
deathtime_icu                  7283
label_death_icu                   0
label_cor_art                     0
diabetes_mellitus                 0
ckd                               0
anemia_flag                       0
predtime                    

In [35]:
vitals_windowed =vitals_windowed.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[vital_col].transform(lambda x: x.ffill().bfill()).fillna(-1).reset_index()

In [36]:
vitals_windowed.isna().sum()

icustay_id         0
charttime          0
heartrate          0
sysbp              0
diasbp             0
meanbp             0
resprate           0
tempc              0
spo2               0
glucose            0
rbc                0
specificgravity    0
pedaledema         0
appetite_median    0
ckd                0
dtype: int64

In [37]:
print("Number of ICU stays demographics: ", demographics_windowed['icustay_id'].nunique())
print("Number of CKD demographics:\n", demographics_windowed['ckd'].value_counts())

print("Number of ICU stays vitals: ", vitals_windowed['icustay_id'].nunique())
print("Number of CKD vitals:\n", vitals_windowed['ckd'].value_counts())

print("Number of ICU stays labs: ", labs_windowed['icustay_id'].nunique())
print("Number of CKD labs:\n", labs_windowed['ckd'].value_counts())

Number of ICU stays demographics:  8405
Number of CKD demographics:
 0    7868
1     537
Name: ckd, dtype: int64
Number of ICU stays vitals:  8405
Number of CKD vitals:
 0    760776
1     51662
Name: ckd, dtype: int64
Number of ICU stays labs:  8405
Number of CKD labs:
 0    98884
1     6658
Name: ckd, dtype: int64


# static model- Random Forest

In [38]:
def aggregate_dataframe(df, groupby_key, columns_to_aggregate):
    df = df.replace(-1, np.nan)
    result = df.groupby(groupby_key)[columns_to_aggregate].mean().reset_index()    
    return result

In [39]:
columns_to_merge = ['icustay_id', 'ckd','ethnicity_grouped']

In [40]:
df_cols_vitals = ['heartrate', 'sysbp','diasbp','meanbp','resprate','tempc','spo2','specificgravity','pedaledema','appetite_median']
df_agg_vitals = aggregate_dataframe(vitals_windowed, 'icustay_id', df_cols_vitals)

df_agg_vitals = df_agg_vitals.merge(demographics_windowed[columns_to_merge], on='icustay_id', how='inner')
df_agg_vitals['ckd_ethnicity'] = df_agg_vitals['ckd'].astype(str).str.cat(df_agg_vitals['ethnicity_grouped'].astype(str))

In [41]:
df_cols_labs = ['albumin','bacteria','glucose','bun','creatinine','sodium','potassium','hemoglobin','wbc','hematocrit','platelet','ptt']
df_agg_labs = aggregate_dataframe(labs_windowed, 'icustay_id', df_cols_labs)

df_agg_labs = df_agg_labs.merge(demographics_windowed[columns_to_merge], on='icustay_id', how='inner')
df_agg_labs['ckd_ethnicity'] = df_agg_labs['ckd'].astype(str).str.cat(df_agg_labs['ethnicity_grouped'].astype(str))

In [42]:
df_agg_vitals.head()

Unnamed: 0,icustay_id,heartrate,sysbp,diasbp,meanbp,resprate,tempc,spo2,specificgravity,pedaledema,appetite_median,ckd,ethnicity_grouped,ckd_ethnicity
0,200017,155.052083,120.0,58.5,78.0,20.5,37.111111,98.0,1.02,,3.0,0,white,0white
1,200033,79.126316,122.147368,71.007895,87.844737,17.113158,36.819298,96.394737,1.02,,2.0,0,white,0white
2,200037,142.943878,120.0,58.5,78.0,20.5,37.111111,98.0,1.02,,3.0,0,white,0white
3,200045,87.118557,129.762887,47.337629,75.81186,21.729381,36.797824,98.525773,1.02,,3.0,0,white,0white
4,200046,147.221649,120.0,58.5,78.0,20.5,37.111111,98.0,1.02,,3.0,0,black,0black


In [43]:
df_agg_vitals['ckd_ethnicity'].value_counts()

0white               5376
0unknown             1251
0black                681
1white                405
0hispanic             279
0asian                245
1black                 64
1unknown               35
0alaska_native         17
1hispanic              16
1asian                 13
0portuguese            12
0middle_eastern         5
0pacific_islander       2
1middle_eastern         2
1portuguese             1
1alaska_native          1
Name: ckd_ethnicity, dtype: int64

In [44]:
df_agg_labs.head()

Unnamed: 0,icustay_id,albumin,bacteria,glucose,bun,creatinine,sodium,potassium,hemoglobin,wbc,hematocrit,platelet,ptt,ckd,ethnicity_grouped,ckd_ethnicity
0,200017,2.6,2.0,125.0,29.0,1.0,141.416667,5.066667,16.8,5.2,53.1,267.0,36.0,0,white,0white
1,200033,2.7,1.0,151.916667,15.833333,0.675,135.916667,3.670833,12.729167,8.266667,36.725,140.208333,25.1,0,white,0white
2,200037,2.6,2.0,125.0,29.0,1.0,145.230769,5.484615,9.8,11.5,29.1,212.0,36.0,0,white,0white
3,200045,2.5,2.0,74.846154,23.923077,0.861538,147.461538,4.623077,10.369231,14.730769,31.3,237.230769,26.5,0,white,0white
4,200046,2.6,2.0,125.0,29.0,1.0,138.384615,5.023077,16.392308,3.403846,50.569231,263.307692,36.0,0,black,0black


In [45]:
df_agg_vitals.shape

(8405, 14)

In [46]:
df_agg_labs.shape

(8405, 16)

In [47]:
demographics_windowed.shape

(8405, 29)

In [48]:
print("Vitals unique icustay id: ",len(df_agg_vitals['icustay_id'].unique()),"\nLabs unique icustay id: ",len(df_agg_labs['icustay_id'].unique()),"\nDemographics unique icustay id: ",len(demographics_windowed['icustay_id'].unique()))

Vitals unique icustay id:  8405 
Labs unique icustay id:  8405 
Demographics unique icustay id:  8405


In [49]:
df_agg_vitals.head()

Unnamed: 0,icustay_id,heartrate,sysbp,diasbp,meanbp,resprate,tempc,spo2,specificgravity,pedaledema,appetite_median,ckd,ethnicity_grouped,ckd_ethnicity
0,200017,155.052083,120.0,58.5,78.0,20.5,37.111111,98.0,1.02,,3.0,0,white,0white
1,200033,79.126316,122.147368,71.007895,87.844737,17.113158,36.819298,96.394737,1.02,,2.0,0,white,0white
2,200037,142.943878,120.0,58.5,78.0,20.5,37.111111,98.0,1.02,,3.0,0,white,0white
3,200045,87.118557,129.762887,47.337629,75.81186,21.729381,36.797824,98.525773,1.02,,3.0,0,white,0white
4,200046,147.221649,120.0,58.5,78.0,20.5,37.111111,98.0,1.02,,3.0,0,black,0black


In [50]:
df_agg_vitals_new=df_agg_vitals.drop(['ckd','ethnicity_grouped','ckd_ethnicity'],axis=1)

In [51]:
df_agg_labs_new=df_agg_labs.drop(['ckd','ethnicity_grouped'],axis=1)

In [52]:
df_agg_vitals_new.head()

Unnamed: 0,icustay_id,heartrate,sysbp,diasbp,meanbp,resprate,tempc,spo2,specificgravity,pedaledema,appetite_median
0,200017,155.052083,120.0,58.5,78.0,20.5,37.111111,98.0,1.02,,3.0
1,200033,79.126316,122.147368,71.007895,87.844737,17.113158,36.819298,96.394737,1.02,,2.0
2,200037,142.943878,120.0,58.5,78.0,20.5,37.111111,98.0,1.02,,3.0
3,200045,87.118557,129.762887,47.337629,75.81186,21.729381,36.797824,98.525773,1.02,,3.0
4,200046,147.221649,120.0,58.5,78.0,20.5,37.111111,98.0,1.02,,3.0


In [53]:
df_agg_labs_new.head()

Unnamed: 0,icustay_id,albumin,bacteria,glucose,bun,creatinine,sodium,potassium,hemoglobin,wbc,hematocrit,platelet,ptt,ckd_ethnicity
0,200017,2.6,2.0,125.0,29.0,1.0,141.416667,5.066667,16.8,5.2,53.1,267.0,36.0,0white
1,200033,2.7,1.0,151.916667,15.833333,0.675,135.916667,3.670833,12.729167,8.266667,36.725,140.208333,25.1,0white
2,200037,2.6,2.0,125.0,29.0,1.0,145.230769,5.484615,9.8,11.5,29.1,212.0,36.0,0white
3,200045,2.5,2.0,74.846154,23.923077,0.861538,147.461538,4.623077,10.369231,14.730769,31.3,237.230769,26.5,0white
4,200046,2.6,2.0,125.0,29.0,1.0,138.384615,5.023077,16.392308,3.403846,50.569231,263.307692,36.0,0black


#### Merging all 3 tables together

In [54]:
merged_table = df_agg_labs_new.merge(df_agg_vitals_new, on='icustay_id', how='inner').merge(demographics_windowed, on='icustay_id', how='inner')

In [55]:
merged_table.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'ckd_ethnicity', 'heartrate', 'sysbp', 'diasbp', 'meanbp',
       'resprate', 'tempc', 'spo2', 'specificgravity', 'pedaledema',
       'appetite_median', 'subject_id', 'hadm_id', 'gender', 'dod',
       'admittime', 'dischtime', 'los_hospital', 'admission_age', 'ethnicity',
       'ethnicity_grouped', 'hospital_expire_flag', 'hospstay_seq',
       'first_hosp_stay', 'intime', 'outtime', 'los_icu', 'icustay_seq',
       'first_icu_stay_current_hosp', 'first_icu_stay_patient',
       'first_careunit', 'deathtime_icu', 'label_death_icu', 'label_cor_art',
       'diabetes_mellitus', 'ckd', 'anemia_flag', 'predtime', 'delta_t_pred'],
      dtype='object')

In [56]:
merged_table.head()

Unnamed: 0,icustay_id,albumin,bacteria,glucose,bun,creatinine,sodium,potassium,hemoglobin,wbc,...,first_icu_stay_patient,first_careunit,deathtime_icu,label_death_icu,label_cor_art,diabetes_mellitus,ckd,anemia_flag,predtime,delta_t_pred
0,200017,2.6,2.0,125.0,29.0,1.0,141.416667,5.066667,16.8,5.2,...,True,NICU,NaT,0,0,0,0,0,2138-03-21 21:54:36,53 days 19:18:05
1,200033,2.7,1.0,151.916667,15.833333,0.675,135.916667,3.670833,12.729167,8.266667,...,True,SICU,2198-08-21 11:15:00,1,0,1,0,0,2198-08-11 17:56:17,9 days 21:03:01
2,200037,2.6,2.0,125.0,29.0,1.0,145.230769,5.484615,9.8,11.5,...,True,NICU,NaT,0,0,0,0,0,2141-08-11 09:29:48,14 days 07:47:08
3,200045,2.5,2.0,74.846154,23.923077,0.861538,147.461538,4.623077,10.369231,14.730769,...,False,SICU,NaT,0,0,1,0,0,2116-07-14 15:40:58,16 days 00:56:02
4,200046,2.6,2.0,125.0,29.0,1.0,138.384615,5.023077,16.392308,3.403846,...,True,NICU,NaT,0,0,0,0,0,2154-05-05 15:52:33,82 days 03:21:51


#### keeping records that have exactly 4 days data in both labs and vitals

In [57]:
# Calculate the difference between max and min charttime for labs_windowed
labs_diff = labs_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min())

# Calculate the difference between max and min charttime for vitals_windowed
vitals_diff = vitals_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min())

# Filter the icustay_id where the difference is equal to 96 hours (4 days) in both labs and vitals
filtered_icustay_ids = labs_diff[(labs_diff == pd.Timedelta(hours=96)) & (vitals_diff == pd.Timedelta(hours=96))].index.tolist()

# Print the length of icustay_id
print(len(filtered_icustay_ids))

3038


In [58]:
merged_table= merged_table[merged_table['icustay_id'].isin(filtered_icustay_ids)]

In [59]:
merged_table.shape

(3038, 52)

In [60]:
merged_table['ckd'].value_counts()

0    2882
1     156
Name: ckd, dtype: int64

#### Dropping other irrelevent columns

In [61]:
merged_table=merged_table.drop(['subject_id','hadm_id','dod','admittime', 'dischtime','los_hospital','ethnicity','hospital_expire_flag','hospstay_seq', 'first_hosp_stay', 'intime','outtime', 'los_icu', 'icustay_seq', 'first_icu_stay_current_hosp','first_icu_stay_patient', 'first_careunit', 'deathtime_icu','label_death_icu', 'predtime', 'delta_t_pred'],axis=1)

In [62]:
merged_table.head()

Unnamed: 0,icustay_id,albumin,bacteria,glucose,bun,creatinine,sodium,potassium,hemoglobin,wbc,...,specificgravity,pedaledema,appetite_median,gender,admission_age,ethnicity_grouped,label_cor_art,diabetes_mellitus,ckd,anemia_flag
3,200045,2.5,2.0,74.846154,23.923077,0.861538,147.461538,4.623077,10.369231,14.730769,...,1.02,,3.0,F,73.941807,white,0,1,0,0
4,200046,2.6,2.0,125.0,29.0,1.0,138.384615,5.023077,16.392308,3.403846,...,1.02,,3.0,F,0.001811,black,0,0,0,0
8,200065,2.461538,2.0,168.538462,41.730769,3.419231,138.769231,3.923077,9.230769,37.330769,...,1.02,,3.0,F,45.112386,white,0,0,0,1
10,200077,3.0,2.0,203.0,65.153846,3.961538,138.384615,4.653846,8.769231,11.246154,...,1.02,,3.0,M,75.27265,unknown,1,0,0,0
13,200109,2.6,1.0,113.461538,10.0,0.469231,138.307692,3.730769,10.515385,9.138462,...,1.02,,3.0,M,71.611182,unknown,1,0,0,1


In [63]:
merged_table.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'ckd_ethnicity', 'heartrate', 'sysbp', 'diasbp', 'meanbp',
       'resprate', 'tempc', 'spo2', 'specificgravity', 'pedaledema',
       'appetite_median', 'gender', 'admission_age', 'ethnicity_grouped',
       'label_cor_art', 'diabetes_mellitus', 'ckd', 'anemia_flag'],
      dtype='object')

#### Dropping rows which have ethnicity_grouped "middle_eastern", "portuguese", "alaska_native", "pacific_islander"

In [64]:
ethnicities_to_drop = ["middle_eastern", "portuguese", "alaska_native", "pacific_islander"]
merged_table = merged_table[~merged_table['ethnicity_grouped'].isin(ethnicities_to_drop)]

In [65]:
merged_table['ckd_ethnicity'].value_counts()

0white       1894
0unknown      524
0black        251
1white        127
0hispanic     104
0asian         95
1unknown       14
1black         11
1hispanic       2
1asian          2
Name: ckd_ethnicity, dtype: int64

Removing pedaledema because it has 8365 missing values out of total 8405 rows

In [66]:
merged_table=merged_table.drop('pedaledema',axis=1)

In [67]:
merged_table.isna().sum().sum()

0

#### Grouping Ages 

In [68]:
age_ranges = [0, 9, 19, 29, 39, 49, 59, 69, 79, 89, 400]

age_labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90+']

merged_table['age_group'] = pd.cut(merged_table['admission_age'], bins=age_ranges, labels=age_labels, right=False)

In [69]:
merged_table.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'ckd_ethnicity', 'heartrate', 'sysbp', 'diasbp', 'meanbp',
       'resprate', 'tempc', 'spo2', 'specificgravity', 'appetite_median',
       'gender', 'admission_age', 'ethnicity_grouped', 'label_cor_art',
       'diabetes_mellitus', 'ckd', 'anemia_flag', 'age_group'],
      dtype='object')

In [70]:
merged_table.head()

Unnamed: 0,icustay_id,albumin,bacteria,glucose,bun,creatinine,sodium,potassium,hemoglobin,wbc,...,specificgravity,appetite_median,gender,admission_age,ethnicity_grouped,label_cor_art,diabetes_mellitus,ckd,anemia_flag,age_group
3,200045,2.5,2.0,74.846154,23.923077,0.861538,147.461538,4.623077,10.369231,14.730769,...,1.02,3.0,F,73.941807,white,0,1,0,0,70-79
4,200046,2.6,2.0,125.0,29.0,1.0,138.384615,5.023077,16.392308,3.403846,...,1.02,3.0,F,0.001811,black,0,0,0,0,0-9
8,200065,2.461538,2.0,168.538462,41.730769,3.419231,138.769231,3.923077,9.230769,37.330769,...,1.02,3.0,F,45.112386,white,0,0,0,1,40-49
10,200077,3.0,2.0,203.0,65.153846,3.961538,138.384615,4.653846,8.769231,11.246154,...,1.02,3.0,M,75.27265,unknown,1,0,0,0,70-79
13,200109,2.6,1.0,113.461538,10.0,0.469231,138.307692,3.730769,10.515385,9.138462,...,1.02,3.0,M,71.611182,unknown,1,0,0,1,70-79


In [71]:
merged_table['age_group']

3       70-79
4         0-9
8       40-49
10      70-79
13      70-79
        ...  
8390    60-69
8394      0-9
8399    50-59
8400    50-59
8401      0-9
Name: age_group, Length: 3024, dtype: category
Categories (10, object): ['0-9' < '10-19' < '20-29' < '30-39' ... '60-69' < '70-79' < '80-89' < '90+']

In [72]:
merged_table=merged_table.drop('admission_age',axis=1)

#### Train Test Split

In [73]:
X= merged_table.drop(['ckd','ckd_ethnicity'],axis=1)
y=merged_table['ckd_ethnicity']

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [75]:
y_train.value_counts()

0white       1421
0unknown      393
0black        188
1white         95
0hispanic      78
0asian         71
1unknown       11
1black          8
1asian          2
1hispanic       1
Name: ckd_ethnicity, dtype: int64

In [76]:
y_test.value_counts()

0white       473
0unknown     131
0black        63
1white        32
0hispanic     26
0asian        24
1unknown       3
1black         3
1hispanic      1
Name: ckd_ethnicity, dtype: int64

#### Random Under Sampling because of white 

In [77]:
rus = RandomUnderSampler(sampling_strategy={'0white':95,'0unknown':11,'0asian':2,'0hispanic':2,'0black':8})
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)

In [78]:
 y_train_rus.value_counts()

0white       95
1white       95
0unknown     11
1unknown     11
0black        8
1black        8
0asian        2
0hispanic     2
1asian        2
1hispanic     1
Name: ckd_ethnicity, dtype: int64

In [79]:
X_train_rus.shape

(235, 28)

#### sepearating ckd_ethnicity again now that it has been stratified 

In [80]:
X_train_rus.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc',
       'spo2', 'specificgravity', 'appetite_median', 'gender',
       'ethnicity_grouped', 'label_cor_art', 'diabetes_mellitus',
       'anemia_flag', 'age_group'],
      dtype='object')

In [81]:
X_train_rus = X_train_rus.merge(merged_table[['icustay_id', 'ckd']], on='icustay_id', how='inner')

In [82]:
X_train_rus.shape

(235, 29)

In [83]:
X_train_rus.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc',
       'spo2', 'specificgravity', 'appetite_median', 'gender',
       'ethnicity_grouped', 'label_cor_art', 'diabetes_mellitus',
       'anemia_flag', 'age_group', 'ckd'],
      dtype='object')

In [84]:
y_train_rus= X_train_rus['ckd']
X_train_rus= X_train_rus.drop('ckd',axis=1)

In [85]:
X_train_rus=X_train_rus.drop('icustay_id',axis=1)

#### Encoding

In [86]:
X_onehot_train = pd.get_dummies(X_train_rus)

In [87]:
X_onehot_train.columns

Index(['albumin', 'bacteria', 'glucose', 'bun', 'creatinine', 'sodium',
       'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet', 'ptt',
       'heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc', 'spo2',
       'specificgravity', 'appetite_median', 'label_cor_art',
       'diabetes_mellitus', 'anemia_flag', 'gender_F', 'gender_M',
       'ethnicity_grouped_asian', 'ethnicity_grouped_black',
       'ethnicity_grouped_hispanic', 'ethnicity_grouped_unknown',
       'ethnicity_grouped_white', 'age_group_0-9', 'age_group_10-19',
       'age_group_20-29', 'age_group_30-39', 'age_group_40-49',
       'age_group_50-59', 'age_group_60-69', 'age_group_70-79',
       'age_group_80-89', 'age_group_90+'],
      dtype='object')

#### Random Forest Model

In [88]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

In [89]:
rf = RandomForestClassifier()

In [90]:
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1')
grid_search.fit(X_onehot_train, y_train_rus)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 5, 10],
                         'max_features': ['sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 200, 300]},
             scoring='f1')

In [91]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=5, n_estimators=300)

In [92]:
grid_search.best_score_

0.7914719915050837

#### Using best model with cross validation with ethnicity

In [94]:
rf_same = RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=5, n_estimators=300)
cv_scores = cross_val_score(rf_same, X_onehot_train, y_train_rus, cv=5, scoring='f1')
cv_scores.mean()

0.7655130988565639

#### Now same model, without ethnicity 

In [95]:
rf_same = RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=5, n_estimators=300)

In [96]:
X_train_no_eth= X_onehot_train.drop(['ethnicity_grouped_asian', 'ethnicity_grouped_black',
       'ethnicity_grouped_hispanic', 'ethnicity_grouped_unknown',
       'ethnicity_grouped_white'],axis=1)

In [97]:
X_train_no_eth.columns

Index(['albumin', 'bacteria', 'glucose', 'bun', 'creatinine', 'sodium',
       'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet', 'ptt',
       'heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc', 'spo2',
       'specificgravity', 'appetite_median', 'label_cor_art',
       'diabetes_mellitus', 'anemia_flag', 'gender_F', 'gender_M',
       'age_group_0-9', 'age_group_10-19', 'age_group_20-29',
       'age_group_30-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70-79', 'age_group_80-89',
       'age_group_90+'],
      dtype='object')

In [98]:
cv_scores = cross_val_score(rf_same, X_train_no_eth, y_train_rus, cv=5, scoring='f1')

In [99]:
cv_scores

array([0.7826087 , 0.85106383, 0.68      , 0.7755102 , 0.77966102])

In [100]:
cv_scores.mean()

0.7737687492940386

#### Test Score with ethnicity

In [101]:
rf_same.fit(X_onehot_train,y_train_rus)

RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=5, n_estimators=300)

In [102]:
X_test_onehot=pd.get_dummies(X_test)

In [103]:
X_test_onehot.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc',
       'spo2', 'specificgravity', 'appetite_median', 'label_cor_art',
       'diabetes_mellitus', 'anemia_flag', 'gender_F', 'gender_M',
       'ethnicity_grouped_asian', 'ethnicity_grouped_black',
       'ethnicity_grouped_hispanic', 'ethnicity_grouped_unknown',
       'ethnicity_grouped_white', 'age_group_0-9', 'age_group_10-19',
       'age_group_20-29', 'age_group_30-39', 'age_group_40-49',
       'age_group_50-59', 'age_group_60-69', 'age_group_70-79',
       'age_group_80-89', 'age_group_90+'],
      dtype='object')

In [104]:
X_test_onehot = X_test_onehot.merge(merged_table[['icustay_id', 'ckd']], on='icustay_id', how='inner')

In [105]:
y_test= X_test_onehot['ckd']
X_test_onehot= X_test_onehot.drop('ckd',axis=1)

In [106]:
X_test_onehot=X_test_onehot.drop('icustay_id',axis=1)

In [107]:
y_pred=rf_same.predict(X_test_onehot)

In [108]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [109]:
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Precision: 0.14163090128755365
Recall: 0.8461538461538461
F1 Score: 0.2426470588235294
Accuracy: 0.7275132275132276


In [503]:
#Observation 188 means high false positive but very low false negative which is good, true positive are very high too

#The diagonal elements of the matrix represent the number of correctly classified samples (true positives and true negatives), while the off-diagonal elements represent the number of misclassified samples (false positives and false negatives).

cm = confusion_matrix(y_test, y_pred)
print(cm)

[[530 188]
 [  6  32]]


#### Test score without ethnicity

In [506]:
X_test_onehot_noeth=X_test_onehot.drop(['ethnicity_grouped_asian', 'ethnicity_grouped_black',
       'ethnicity_grouped_hispanic', 'ethnicity_grouped_unknown',
       'ethnicity_grouped_white'],axis=1)

In [507]:
rf_same.fit(X_train_no_eth,y_train_rus)

RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_split=10,
                       n_estimators=200)

In [508]:
y_pred_noeth=rf_same.predict(X_test_onehot_noeth)

In [509]:
precision = precision_score(y_test, y_pred_noeth)
recall = recall_score(y_test, y_pred_noeth)
f1 = f1_score(y_test, y_pred_noeth)
accuracy = accuracy_score(y_test, y_pred_noeth)

In [510]:
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Precision: 0.13777777777777778
Recall: 0.8157894736842105
F1 Score: 0.23574144486692014
Accuracy: 0.7341269841269841


In [511]:
cm = confusion_matrix(y_test,y_pred_noeth)
print(cm)

[[524 194]
 [  7  31]]


# Time Series

In [110]:
labs_windowed

Unnamed: 0,icustay_id,charttime,subject_id,aniongap,albumin,bands,bicarbonate,bilirubin,creatinine,chloride,...,ptt,inr,pt,sodium,bun,wbc,bacteria,ckd,predtime,delta_t_pred
0,218958,2132-08-24 16:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
1,218958,2132-08-25 00:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
2,218958,2132-08-25 08:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
3,218958,2132-08-25 16:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
4,218958,2132-08-26 00:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
534941,224889,2114-03-14 17:00:00,26649.0,12.0,2.5,2.0,28.0,1.0,1.0,107.0,...,37.7,1.2,13.6,144.0,20.0,7.5,2.0,0,2114-03-16 01:26:12,19 days 16:15:36
534942,224889,2114-03-15 01:00:00,26649.0,11.0,2.5,2.0,27.0,1.0,0.9,107.0,...,36.9,1.2,13.3,142.0,20.0,6.4,2.0,0,2114-03-16 01:26:12,19 days 16:15:36
534943,224889,2114-03-15 09:00:00,26649.0,11.0,2.5,2.0,27.0,1.0,0.9,107.0,...,36.9,1.2,13.3,142.0,20.0,6.4,2.0,0,2114-03-16 01:26:12,19 days 16:15:36
534944,224889,2114-03-15 17:00:00,26649.0,9.0,2.5,2.0,30.0,1.0,0.9,108.0,...,36.9,1.2,13.3,144.0,20.0,5.6,2.0,0,2114-03-16 01:26:12,19 days 16:15:36


In [111]:
labs_windowed[labs_windowed['icustay_id']==218958]

Unnamed: 0,icustay_id,charttime,subject_id,aniongap,albumin,bands,bicarbonate,bilirubin,creatinine,chloride,...,ptt,inr,pt,sodium,bun,wbc,bacteria,ckd,predtime,delta_t_pred
0,218958,2132-08-24 16:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
1,218958,2132-08-25 00:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
2,218958,2132-08-25 08:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
3,218958,2132-08-25 16:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
4,218958,2132-08-26 00:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
5,218958,2132-08-26 08:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
6,218958,2132-08-26 16:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
7,218958,2132-08-27 00:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
8,218958,2132-08-27 08:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00
9,218958,2132-08-27 16:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,36.0,1.3,14.5,139.0,29.0,11.5,2.0,0,2132-08-28 17:07:00,10 days 00:34:00


In [112]:
labs_windowed[labs_windowed['icustay_id']==224889]

Unnamed: 0,icustay_id,charttime,subject_id,aniongap,albumin,bands,bicarbonate,bilirubin,creatinine,chloride,...,ptt,inr,pt,sodium,bun,wbc,bacteria,ckd,predtime,delta_t_pred
534933,224889,2114-03-12 01:00:00,26649.0,13.0,1.8,2.0,18.0,0.5,0.8,112.0,...,36.5,1.2,13.3,139.0,23.0,2.7,2.0,0,2114-03-16 01:26:12,19 days 16:15:36
534934,224889,2114-03-12 09:00:00,26649.0,16.0,1.8,2.0,17.0,0.5,0.85,109.5,...,36.5,1.2,13.3,137.5,20.5,4.45,2.0,0,2114-03-16 01:26:12,19 days 16:15:36
534935,224889,2114-03-12 17:00:00,26649.0,17.0,2.2,2.0,18.0,0.5,0.95,108.0,...,36.5,1.2,13.3,138.0,19.5,6.4,2.0,0,2114-03-16 01:26:12,19 days 16:15:36
534936,224889,2114-03-13 01:00:00,26649.0,15.0,2.2,2.0,20.0,0.7,1.0,105.0,...,68.7,1.9,17.1,135.0,20.0,8.8,2.0,0,2114-03-16 01:26:12,19 days 16:15:36
534937,224889,2114-03-13 09:00:00,26649.0,15.5,2.5,2.0,22.0,0.8,1.05,106.0,...,104.55,2.0,17.2,139.5,19.0,7.5,2.0,0,2114-03-16 01:26:12,19 days 16:15:36
534938,224889,2114-03-13 17:00:00,26649.0,15.5,2.5,2.0,22.0,0.8,1.05,106.0,...,51.8,1.4,14.5,139.5,19.0,7.1,2.0,0,2114-03-16 01:26:12,19 days 16:15:36
534939,224889,2114-03-14 01:00:00,26649.0,12.0,2.5,2.0,26.0,1.0,1.0,107.0,...,44.3,1.3,13.9,142.0,18.0,7.6,2.0,0,2114-03-16 01:26:12,19 days 16:15:36
534940,224889,2114-03-14 09:00:00,26649.0,12.0,2.5,2.0,26.0,1.0,1.0,107.0,...,44.3,1.3,13.9,142.0,18.0,7.6,2.0,0,2114-03-16 01:26:12,19 days 16:15:36
534941,224889,2114-03-14 17:00:00,26649.0,12.0,2.5,2.0,28.0,1.0,1.0,107.0,...,37.7,1.2,13.6,144.0,20.0,7.5,2.0,0,2114-03-16 01:26:12,19 days 16:15:36
534942,224889,2114-03-15 01:00:00,26649.0,11.0,2.5,2.0,27.0,1.0,0.9,107.0,...,36.9,1.2,13.3,142.0,20.0,6.4,2.0,0,2114-03-16 01:26:12,19 days 16:15:36


In [113]:
#Observation: 1 patient has approximately half of the data that we actually need
for icu_id in labs_windowed['icustay_id'].unique():
    icu_data =  labs_windowed[labs_windowed['icustay_id'] == icu_id]
    diff = icu_data['charttime'].max() - icu_data['charttime'].min()
    if diff < pd.Timedelta(days=3):
        print(f"ICU stay ID {icu_id}: {diff}")

ICU stay ID 286937: 2 days 16:00:00


In [114]:
#Observation: 3038 instances have a total of 4 days (exact) data in both labs and vitals
ct=0
for icu_id in vitals_windowed['icustay_id'].unique():
    icu_data_v =  vitals_windowed[vitals_windowed['icustay_id'] == icu_id]
    diff_v = icu_data_v['charttime'].max() - icu_data_v['charttime'].min()
    icu_data_l =  labs_windowed[labs_windowed['icustay_id'] == icu_id]
    diff_l = icu_data_l['charttime'].max() - icu_data_l['charttime'].min()
    if diff_v == pd.Timedelta(hours=96) and diff_l == pd.Timedelta(hours=96):
        ct+=1
print(ct)

3038


#### Filtering records that have exactly 4 days (96 hours) of data in both labs and vitals

In [115]:
# Calculate the difference between max and min charttime for labs_windowed
labs_diff = labs_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min())

# Calculate the difference between max and min charttime for vitals_windowed
vitals_diff = vitals_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min())

# Filter the icustay_id where the difference is equal to 96 hours (4 days) in both labs and vitals
filtered_icustay_ids = labs_diff[(labs_diff == pd.Timedelta(hours=96)) & (vitals_diff == pd.Timedelta(hours=96))].index.tolist()

# Print the length of icustay_id
print(len(filtered_icustay_ids))

3038


In [116]:
filtered_labs_windowed = labs_windowed[labs_windowed['icustay_id'].isin(filtered_icustay_ids)]

In [117]:
filtered_labs_windowed.head()

Unnamed: 0,icustay_id,charttime,subject_id,aniongap,albumin,bands,bicarbonate,bilirubin,creatinine,chloride,...,ptt,inr,pt,sodium,bun,wbc,bacteria,ckd,predtime,delta_t_pred
41,241427,2151-11-07 01:00:00,28970.0,11.0,2.6,3.0,23.0,4.3,1.0,108.0,...,36.0,1.3,14.5,138.0,29.0,11.5,2.0,0,2151-11-11 01:49:18,27 days 17:57:20
42,241427,2151-11-07 09:00:00,28970.0,11.0,2.6,3.0,23.0,4.3,1.0,108.0,...,36.0,1.3,14.5,138.0,29.0,11.5,2.0,0,2151-11-11 01:49:18,27 days 17:57:20
43,241427,2151-11-07 17:00:00,28970.0,11.0,2.6,3.0,23.0,4.3,1.0,108.0,...,36.0,1.3,14.5,138.0,29.0,11.5,2.0,0,2151-11-11 01:49:18,27 days 17:57:20
44,241427,2151-11-08 01:00:00,28970.0,11.0,2.6,3.0,23.0,4.3,1.0,108.0,...,36.0,1.3,14.5,138.0,29.0,11.5,2.0,0,2151-11-11 01:49:18,27 days 17:57:20
45,241427,2151-11-08 09:00:00,28970.0,11.0,2.6,3.0,23.0,4.3,1.0,108.0,...,36.0,1.3,14.5,138.0,29.0,11.5,2.0,0,2151-11-11 01:49:18,27 days 17:57:20


In [118]:
filtered_labs_windowed.columns

Index(['icustay_id', 'charttime', 'subject_id', 'aniongap', 'albumin', 'bands',
       'bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose',
       'hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt',
       'inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria', 'ckd', 'predtime',
       'delta_t_pred'],
      dtype='object')

#### Grouping data

In [119]:
feature_labs= ['aniongap', 'albumin', 'bands',
       'bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose',
       'hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt',
       'inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria']

In [120]:
grouped_data_labs = filtered_labs_windowed.sort_values(['icustay_id', 'charttime']).groupby('icustay_id')[feature_labs + ['ckd']]

In [121]:
grouped_data_labs.head()

Unnamed: 0,aniongap,albumin,bands,bicarbonate,bilirubin,creatinine,chloride,glucose,hematocrit,hemoglobin,...,platelet,potassium,ptt,inr,pt,sodium,bun,wbc,bacteria,ckd
305206,18.0,2.5,3.0,17.0,3.2,0.7,113.0,103.0,34.6,11.5,...,234.0,3.7,26.5,1.0,11.8,144.0,14.0,15.4,2.0,0
305207,18.0,2.5,3.0,17.0,3.2,0.7,113.0,103.0,34.6,11.5,...,234.0,3.7,26.5,1.0,11.8,144.0,14.0,15.4,2.0,0
305208,18.0,2.5,3.0,17.0,3.2,0.7,113.0,103.0,34.6,11.5,...,234.0,3.7,26.5,1.0,11.8,144.0,14.0,15.4,2.0,0
305209,18.0,2.5,3.0,17.0,3.2,0.7,113.0,103.0,34.6,11.5,...,234.0,3.7,26.5,1.0,11.8,144.0,14.0,15.4,2.0,0
305210,11.0,2.5,3.0,24.0,3.2,0.8,118.0,18.0,30.1,10.2,...,239.0,4.8,26.5,1.0,11.8,148.0,20.0,20.8,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164145,16.0,2.6,3.0,22.0,4.6,1.0,100.0,125.0,48.5,14.9,...,356.0,5.4,36.0,1.3,14.5,133.0,29.0,15.7,2.0,0
164146,16.0,2.6,3.0,22.0,4.6,1.0,100.0,125.0,48.5,14.9,...,356.0,5.4,36.0,1.3,14.5,133.0,29.0,15.7,2.0,0
164147,16.0,2.6,3.0,22.0,4.6,1.0,100.0,125.0,48.5,14.9,...,356.0,5.4,36.0,1.3,14.5,133.0,29.0,15.7,2.0,0
164148,16.0,2.6,3.0,22.0,4.6,1.0,100.0,125.0,48.5,14.9,...,356.0,5.4,36.0,1.3,14.5,133.0,29.0,15.7,2.0,0


In [122]:
# Concatenate groups into a NumPy array
data = np.concatenate([group.values for _, group in grouped_data_labs])

# Extract features (X) and target (y)
X = data[:, :-1]  # Exclude the last column 'ckd'
y = data[:, -1]   # Select only the last column 'ckd'

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [123]:
X

array([[18. ,  2.5,  3. , ..., 14. , 15.4,  2. ],
       [18. ,  2.5,  3. , ..., 14. , 15.4,  2. ],
       [18. ,  2.5,  3. , ..., 14. , 15.4,  2. ],
       ...,
       [15. ,  2.6,  3. , ..., 29. , 15.7,  2. ],
       [15. ,  2.6,  3. , ..., 29. , 15.7,  2. ],
       [15. ,  2.6,  3. , ..., 29. , 15.7,  2. ]])

In [124]:
X.shape

(39494, 20)

In [125]:
y.shape

(39494,)

In [126]:
np.unique(y)

array([0., 1.])

In [127]:
X_train.shape

(31595, 20)

In [128]:
y_train.shape

(31595,)

#### Rocket for lab (working)

In [132]:
X = []
y = []
for _, group in grouped_data_labs:
    group_values = group[feature_labs].values.T
        
    X.append(group_values)
    y.append(group['ckd'].iloc[0])
    
X = np.array(X)
y = np.array(y)
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [133]:
X.shape

(3038, 20, 13)

In [134]:
y.shape

(3038,)

#### Random undersampling for labs

In [136]:
n_samples, n_features, n_channels = X.shape
X_2d = X.reshape((n_samples, n_features*n_channels))
rusLab = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rusLab.fit_resample(X_2d, y)
X_resampled = X_resampled.reshape((X_resampled.shape[0], n_features, n_channels))    
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2)

In [137]:
rocket = Rocket(num_kernels=100)
rocket.fit(X_train)
X_train_transformed = rocket.transform(X_train)
X_test_transformed = rocket.transform(X_test)

In [138]:
clf = LogisticRegression()
cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))
    
clf.fit(X_train_transformed, y_train)
y_pred = clf.predict(X_test_transformed)

Cross-validation scores: [0.56       0.66       0.6        0.64       0.55102041]
Mean cross-validation score: 0.6022040816326532


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [139]:
def printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test):
    clf = LogisticRegression(random_state=42, max_iter=1000)
    cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    
    clf.fit(X_train_transformed, y_train)
    y_pred = clf.predict(X_test_transformed)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))
printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Cross-validation scores: [0.56       0.62       0.62       0.66       0.55102041]
Mean cross-validation score: 0.6022040816326532
Accuracy: 0.6825396825396826

Classification report:
              precision    recall  f1-score   support

           0       0.72      0.64      0.68        33
           1       0.65      0.73      0.69        30

    accuracy                           0.68        63
   macro avg       0.69      0.68      0.68        63
weighted avg       0.69      0.68      0.68        63



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### with only albumin and creatinine

In [140]:
feature_labs= ['albumin','creatinine']
grouped_data_labs = filtered_labs_windowed.sort_values(['icustay_id', 'charttime']).groupby('icustay_id')[feature_labs + ['ckd']]
grouped_data_labs.head()

Unnamed: 0,albumin,creatinine,ckd
305206,2.5,0.7,0
305207,2.5,0.7,0
305208,2.5,0.7,0
305209,2.5,0.7,0
305210,2.5,0.8,0
...,...,...,...
164145,2.6,1.0,0
164146,2.6,1.0,0
164147,2.6,1.0,0
164148,2.6,1.0,0


In [145]:
X = []
y = []
for _, group in grouped_data_labs:
    group_values = group[feature_labs].values.T
        
    X.append(group_values)
    y.append(group['ckd'].iloc[0])
    
X = np.array(X)
y = np.array(y)
    
n_samples, n_features, n_channels = X.shape
X_2d = X.reshape((n_samples, n_features*n_channels))

rusLabCrAl = RandomUnderSampler(random_state=42)
X_resampledCrAl, y_resampledCrAl = rusLabCrAl.fit_resample(X_2d, y)

X_resampledCrAl = X_resampledCrAl.reshape((X_resampledCrAl.shape[0], n_features, n_channels))    
X_train, X_test, y_train, y_test = train_test_split(X_resampledCrAl, y_resampledCrAl, test_size=0.2)

In [146]:
rocket = Rocket(num_kernels=100)
rocket.fit(X_train)
X_train_transformed = rocket.transform(X_train)
X_test_transformed = rocket.transform(X_test)

In [147]:
clf = LogisticRegression()
cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))
    
clf.fit(X_train_transformed, y_train)
y_pred = clf.predict(X_test_transformed)

Cross-validation scores: [0.62      0.58      0.62      0.7       0.6122449]
Mean cross-validation score: 0.6264489795918367


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [148]:
def printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test):
    clf = LogisticRegression(random_state=42, max_iter=1000)
    cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    
    clf.fit(X_train_transformed, y_train)
    y_pred = clf.predict(X_test_transformed)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))
printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test)

Cross-validation scores: [0.6       0.58      0.62      0.7       0.6122449]
Mean cross-validation score: 0.6224489795918368
Accuracy: 0.6984126984126984

Classification report:
              precision    recall  f1-score   support

           0       0.82      0.68      0.75        41
           1       0.55      0.73      0.63        22

    accuracy                           0.70        63
   macro avg       0.69      0.71      0.69        63
weighted avg       0.73      0.70      0.71        63



#### Just albumin 

In [149]:
feature_labs= ['albumin']
grouped_data_labs = filtered_labs_windowed.sort_values(['icustay_id', 'charttime']).groupby('icustay_id')[feature_labs + ['ckd']]


X = []
y = []
for _, group in grouped_data_labs:
    group_values = group[feature_labs].values.T
        
    X.append(group_values)
    y.append(group['ckd'].iloc[0])
    
X = np.array(X)
y = np.array(y)
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


rocket = Rocket(num_kernels=100)
rocket.fit(X_train)
X_train_transformed = rocket.transform(X_train)
X_test_transformed = rocket.transform(X_test)


clf = LogisticRegression()
cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))
    
clf.fit(X_train_transformed, y_train)
y_pred = clf.predict(X_test_transformed)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores: [0.94444444 0.94855967 0.94855967 0.94238683 0.94444444]
Mean cross-validation score: 0.945679012345679


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [150]:
def printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test):
    clf = LogisticRegression(random_state=42, max_iter=1000)
    cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    
    clf.fit(X_train_transformed, y_train)
    y_pred = clf.predict(X_test_transformed)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))
printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test)

Cross-validation scores: [0.94444444 0.94855967 0.95061728 0.94650206 0.94238683]
Mean cross-validation score: 0.9465020576131685
Accuracy: 0.9391447368421053

Classification report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       571
           1       0.00      0.00      0.00        37

    accuracy                           0.94       608
   macro avg       0.47      0.50      0.48       608
weighted avg       0.88      0.94      0.91       608



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Time series on Vitals

In [151]:
vitals_windowed.head()

Unnamed: 0,icustay_id,charttime,heartrate,sysbp,diasbp,meanbp,resprate,tempc,spo2,glucose,rbc,specificgravity,pedaledema,appetite_median,ckd
0,218958,2132-08-24 16:00:00,140.0,120.0,58.5,78.0,20.5,37.111111,98.0,130.0,4.75,1.02,-1.0,3.0,0
1,218958,2132-08-24 17:00:00,144.0,120.0,58.5,78.0,20.5,37.111111,98.0,130.0,4.75,1.02,-1.0,3.0,0
2,218958,2132-08-24 18:00:00,150.0,120.0,58.5,78.0,20.5,37.111111,98.0,130.0,4.75,1.02,-1.0,3.0,0
3,218958,2132-08-24 19:00:00,144.0,120.0,58.5,78.0,20.5,37.111111,98.0,130.0,4.75,1.02,-1.0,3.0,0
4,218958,2132-08-24 20:00:00,126.0,120.0,58.5,78.0,20.5,37.111111,98.0,130.0,4.75,1.02,-1.0,3.0,0


In [152]:
vv = vitals_windowed
vv['charttime'] = pd.to_datetime(vv['charttime'])
vv['time_diff'] = vv.groupby('icustay_id')['charttime'].diff().dt.total_seconds() / 3600
vv['time_diff'].fillna(0, inplace=True)

column_sum = vv.groupby('icustay_id')['time_diff'].sum()
column_sum_df = column_sum.to_frame().reset_index()
column_sum_df.columns = ['icustay_id', 'time_diff']
#filtered_df = vitals_windowed[vitals_windowed['icustay_id'] == 286937]
column_sum_df.describe()

Unnamed: 0,icustay_id,time_diff
count,8405.0,8405.0
mean,249984.620345,95.661273
std,28585.253432,1.806917
min,200017.0,71.0
25%,225578.0,95.0
50%,250040.0,96.0
75%,274430.0,96.0
max,299992.0,107.0


In [153]:
filtered_df = vitals_windowed[vitals_windowed['icustay_id'] == 286937]
filtered_df

Unnamed: 0,icustay_id,charttime,heartrate,sysbp,diasbp,meanbp,resprate,tempc,spo2,glucose,rbc,specificgravity,pedaledema,appetite_median,ckd,time_diff
397810,286937,2142-08-06 03:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,0.0
397811,286937,2142-08-06 04:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
397812,286937,2142-08-06 05:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
397813,286937,2142-08-06 06:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
397814,286937,2142-08-06 07:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397877,286937,2142-08-08 22:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
397878,286937,2142-08-08 23:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
397879,286937,2142-08-09 00:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
397880,286937,2142-08-09 01:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0


In [154]:
def RocketForVitals(vitals_windowed):
    vitals_windowed['charttime'] = pd.to_datetime(vitals_windowed['charttime'])
    feature_columns = ['heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity', 'appetite_median']
    
    grouped_data = vitals_windowed.sort_values(['icustay_id', 'charttime']).groupby('icustay_id')[feature_columns + ['ckd']]
    
    X = []
    y = []
    for _, group in grouped_data:
        group_values = group[feature_columns].values.T
        num_timestamps = group_values.shape[1]
        
        if num_timestamps < WINDOW_LENGTH:
            padded_values = np.pad(group_values, ((0, 0), (0, WINDOW_LENGTH - num_timestamps)), 'constant', constant_values=0)
        elif num_timestamps > WINDOW_LENGTH:        
            padded_values = group_values[:, :WINDOW_LENGTH]
        else:        
            padded_values = group_values
        
        X.append(padded_values)
        y.append(group['ckd'].iloc[0])
    
    X = np.array(X)
    y = np.array(y)
    
    n_samples, n_features, n_channels = X.shape
    X_2d = X.reshape((n_samples, n_features*n_channels))

    rus = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = rus.fit_resample(X_2d, y)

    X_resampled = X_resampled.reshape((X_resampled.shape[0], n_features, n_channels))    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2)
    
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("X_train shape: ",X_train.shape,"\ny_train shape: ",y_train.shape)
    
    rocket = Rocket(num_kernels=100, random_state=42)
    rocket.fit(X_train)
    X_train_transformed = rocket.transform(X_train)
    X_test_transformed = rocket.transform(X_test)
    return X_train_transformed, X_test_transformed,y_train,y_test

In [155]:
X_train_transformed, X_test_transformed, y_train, y_test = RocketForVitals(vitals_windowed)

X_train shape:  (859, 11, 96) 
y_train shape:  (859,)


In [156]:
X_train_transformed.shape, y_train.shape

((859, 200), (859,))

In [157]:
def printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test):
    clf = LogisticRegression(random_state=42, max_iter=1000)
    cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    
    clf.fit(X_train_transformed, y_train)
    y_pred = clf.predict(X_test_transformed)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))

In [158]:
printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores: [0.61046512 0.56976744 0.5755814  0.68023256 0.59064327]
Mean cross-validation score: 0.6053379572963415
Accuracy: 0.6232558139534884

Classification report:
              precision    recall  f1-score   support

           0       0.60      0.57      0.59       101
           1       0.64      0.67      0.65       114

    accuracy                           0.62       215
   macro avg       0.62      0.62      0.62       215
weighted avg       0.62      0.62      0.62       215



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Multimodal Approach

####  labs and vitals

In [365]:
# LABS
feature_labs = [ 'albumin', 'bicarbonate', 'creatinine', 'chloride', 'hematocrit', 'hemoglobin', 'potassium', 'sodium', 'bun', 'wbc', 'bacteria']
grouped_data_labs = filtered_labs_windowed.sort_values(['icustay_id', 'charttime']).groupby('icustay_id')[feature_labs + ['ckd']]

# VITALS
feature_columns = [ 'sysbp', 'diasbp', 'meanbp', 'spo2', 'glucose', 'rbc', 'specificgravity', 'appetite_median']
grouped_data_vitals = vitals_windowed.sort_values(['icustay_id', 'charttime']).groupby('icustay_id')[feature_columns + ['ckd']]

# Get unique icustay_id values
icustay_ids = grouped_data_labs.groups.keys()

# Split icustay_id into training and test sets
train_size = int(0.8 * len(icustay_ids))
train_ids = list(icustay_ids)[:train_size]
test_ids = list(icustay_ids)[train_size:]
test_ids.remove(278494)

# Prepare the data for training (Labs)
X_train_labs = []
X_test_labs = []
y_train_labs = []
y_test_labs = []

for icustay_id, group in grouped_data_labs:
    if icustay_id in train_ids:
        lab_values = group[feature_labs].values.T
        X_train_labs.append(lab_values)
        y_train_labs.append(group['ckd'].iloc[0])
    elif icustay_id in test_ids:
        lab_values = group[feature_labs].values.T
        X_test_labs.append(lab_values)
        y_test_labs.append(group['ckd'].iloc[0])

X_train_labs = np.array(X_train_labs)
X_test_labs = np.array(X_test_labs)
y_train_labs = np.array(y_train_labs)
y_test_labs = np.array(y_test_labs)

n_samples, n_features, n_channels = X_train_labs.shape
X_2d = X_train_labs.reshape((n_samples, n_features * n_channels))
rus = RandomUnderSampler(random_state=42)
X_resampled_labs, y_resampled_labs = rus.fit_resample(X_2d, y_train_labs)
X_resampled_labs = X_resampled_labs.reshape((X_resampled_labs.shape[0], n_features, n_channels))
X_train_labs = X_resampled_labs
y_train_labs = y_resampled_labs

# Prepare the data for training (Vitals)
X_train_vitals = []
X_test_vitals = []
y_train_vitals = []
y_test_vitals = []

for icustay_id, group in grouped_data_vitals:
    if icustay_id in train_ids:
        vital_values = group[feature_columns].values.T
        X_train_vitals.append(vital_values)
        y_train_vitals.append(group['ckd'].iloc[0])
    elif icustay_id in test_ids:
        vital_values = group[feature_columns].values.T
        X_test_vitals.append(vital_values)
        y_test_vitals.append(group['ckd'].iloc[0])

X_train_vitals = np.array(X_train_vitals)
X_test_vitals = np.array(X_test_vitals)
y_train_vitals = np.array(y_train_vitals)
y_test_vitals = np.array(y_test_vitals)

n_samples, n_features, n_channels = X_train_vitals.shape
X_2d = X_train_vitals.reshape((n_samples, n_features * n_channels))
rus = RandomUnderSampler(random_state=42)
X_resampled_vitals, y_resampled_vitals = rus.fit_resample(X_2d, y_train_vitals)
X_resampled_vitals = X_resampled_vitals.reshape((X_resampled_vitals.shape[0], n_features, n_channels))
X_train_vitals = X_resampled_vitals
y_train_vitals = y_resampled_vitals

In [366]:
#Rocket and LR


#Labs
rocket_lab = Rocket(num_kernels=100, random_state=42)
rocket_lab.fit(X_train_labs)
X_train_transformed_labs = rocket_lab.transform(X_train_labs)
X_test_transformed_labs = rocket_lab.transform(X_test_labs) 
clf_lab = LogisticRegression(random_state=42, max_iter=1000)
clf_lab.fit(X_train_transformed_labs,y_train_labs)

#Vitals
rocket_vital = Rocket(num_kernels=100, random_state=42)
rocket_vital.fit(X_train_vitals)
X_train_transformed_vital = rocket_vital.transform(X_train_vitals)
X_test_transformed_vital = rocket_vital.transform(X_test_vitals)

clf_vital = LogisticRegression(random_state=42, max_iter=1000)
clf_vital.fit(X_train_transformed_vital,y_train_vitals)

LogisticRegression(max_iter=1000, random_state=42)

#### Static model

In [388]:
# Split the merged_table into train and test data
train_data = merged_table.loc[merged_table['icustay_id'].isin(train_ids)]
test_data = merged_table.loc[merged_table['icustay_id'].isin(test_ids)]

In [389]:
test_ids_not_in_table = set(test_ids) - set(merged_table['icustay_id'])
print(test_ids_not_in_table)

set()


In [390]:
test_data.shape,len(test_ids)

((607, 30), 607)

In [391]:
train_data.shape

(2417, 30)

In [392]:
train_data.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'ckd_ethnicity', 'heartrate', 'sysbp', 'diasbp', 'meanbp',
       'resprate', 'tempc', 'spo2', 'specificgravity', 'appetite_median',
       'gender', 'ethnicity_grouped', 'label_cor_art', 'diabetes_mellitus',
       'ckd', 'anemia_flag', 'age_group'],
      dtype='object')

In [393]:
X_train_static=train_data.drop(['icustay_id','ckd','platelet',
       'ptt', 'ckd_ethnicity', 'heartrate',
       'resprate', 'tempc'],axis=1)
y_train_static=train_data['ckd']
X_train_static=pd.get_dummies(X_train_static)

In [394]:
X_test_static=test_data.drop(['icustay_id','ckd','platelet',
       'ptt', 'ckd_ethnicity', 'heartrate',
       'resprate', 'tempc'],axis=1)
y_test_static=test_data['ckd']
X_test_static=pd.get_dummies(X_test_static)

In [395]:
rus = RandomUnderSampler(random_state=42)
X_resampled_static, y_resampled_static = rus.fit_resample(X_train_static, y_train_static)
#X_resampled_vitals = X_resampled_vitals.reshape((X_resampled_vitals.shape[0], n_features, n_channels))
X_train_static = X_resampled_static
y_train_static = y_resampled_static
rf_same.fit(X_train_static,y_train_static)

RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=5, n_estimators=300)

In [396]:
y_train_static.shape,y_train_labs.shape

((262,), (262,))

In [417]:
def soft_voting(clf_lab, clf_vital,clf3, weights, X_test_transformed_lab, X_test_transformed_vital, y_test_labs,y_test_vitals,Xstatic,ystatic):
    # Obtain the probability estimates for each class
    prob_lab = clf_lab.predict_proba(X_test_transformed_lab)
    prob_vital = clf_vital.predict_proba(X_test_transformed_vital)
    prob_static = rf_same.predict_proba(Xstatic)

    # Combine the probability estimates using weighted averaging
    weighted_prob = (weights[0] * prob_lab + weights[1] * prob_vital  +  weights[2] * prob_static) / np.sum(weights)
    #print(weighted_prob)

    # Make the final prediction based on the highest probability
    y_pred = np.argmax(weighted_prob, axis=1)

    # Calculate accuracy
    acc = np.sum(y_pred == y_test_labs)/len(y_pred)
    print(acc)

    return y_pred

#### Obtaining Weights

In [399]:
def printEvaluationScores(X_train_static, y_train_static):
    cv_scores = cross_val_score(rf_same, X_train_static, y_train_static, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
printEvaluationScores(X_train_static, y_train_static)

Cross-validation scores: [0.83018868 0.66037736 0.84615385 0.86538462 0.76923077]
Mean cross-validation score: 0.7942670537010159


In [377]:
def printEvaluationScores(X_train_transformed_labs, y_train_labs):
    cv_scores = cross_val_score(clf_lab, X_train_transformed_labs, y_train_labs, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
printEvaluationScores(X_train_transformed_labs, y_train_labs)

Cross-validation scores: [0.58490566 0.49056604 0.67307692 0.53846154 0.65384615]
Mean cross-validation score: 0.5881712626995645


In [378]:
def printEvaluationScores(X_train_transformed_vital, y_train_vitals):
    cv_scores = cross_val_score(clf_vital, X_train_transformed_vital, y_train_vitals, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
printEvaluationScores(X_train_transformed_vital, y_train_vitals)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Cross-validation scores: [0.67924528 0.69811321 0.40384615 0.57692308 0.67307692]
Mean cross-validation score: 0.6062409288824384


In [400]:
weight_l=0.58/(1-0.58)
weight_v=0.60/(1-0.60)
weight_s=0.79/(1-0.79)

In [419]:
soft_voting(clf_lab=clf_lab, clf_vital=clf_vital,clf3=rf_same, weights=[6,6,8],X_test_transformed_lab=X_test_transformed_labs, X_test_transformed_vital=X_test_transformed_vital, y_test_labs=y_test_labs, y_test_vitals=y_test_vitals,Xstatic=X_test_static,ystatic=y_test_static)

0.642504118616145


array([1, 0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0,
       0, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 0,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0,