In [1]:
import psycopg2
from datetime import timedelta
from sqlalchemy import create_engine
import psycopg2
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score,f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sktime.transformations.panel.rocket import Rocket
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

In [2]:
import psycopg2

conn = psycopg2.connect(
    host="localhost",
    database="mimic",
    user="postgres",
    password="postgres"
)

cur = conn.cursor()

cur.execute("SELECT version();")
print(cur.fetchone())

('PostgreSQL 15.2, compiled by Visual C++ build 1914, 64-bit',)


In [3]:
# Connect to db
conn = psycopg2.connect(host='localhost', dbname='mimic', user='postgres', password='postgres', options='-c search_path=mimiciii')
#conn = psycopg2.connect(dbname='mimic', user='postgres')
cur = conn.cursor() 

# Read in table with patients & admissions (inner join on subject_id) and icu_stays (inner joinon subject_id and hadm_id)
icustay_details = pd.read_sql_query("SELECT * FROM mimiciii.flicu_icustay_detail;", conn)

# Read in vital and lab signs
pivoted_vital = pd.read_sql_query("SELECT * FROM mimiciii.pivoted_vital;", conn)
pivoted_lab = pd.read_sql_query("SELECT * FROM mimiciii.ckd_pivoted_lab;", conn)

# Read in lab measurements
# Use flicu_pivoted_lab (as it only takes the lab tests during ICU stay)
#query = "SELECT * FROM mimiciii.flicu_pivoted_lab;"
# Alternative:  Use the lab values recorded previous to the ICU stay (although during same hospital admission!), 
# then sample them  (8h intervalls) and then forward fill plus cap at either icu admission time of first vital sign recorded
#query = "SELECT * FROM mimiciii.pivoted_lab;"
#pivoted_lab = pd.read_sql_query(query, conn)

# Close the cursor and connection to so the server can allocate bandwidth to other requests
cur.close()
conn.close()



In [4]:
pivoted_vital['pedaledema'].unique()

array([nan,  3.,  2.])

In [5]:
pivoted_vital.shape

(9207039, 15)

In [6]:
pivoted_vital.columns

Index(['icustay_id', 'charttime', 'heartrate', 'sysbp', 'diasbp', 'meanbp',
       'resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity',
       'pedaledema', 'appetite_median', 'ckd'],
      dtype='object')

In [7]:
pivoted_lab.columns

Index(['icustay_id', 'subject_id', 'charttime', 'aniongap', 'albumin', 'bands',
       'bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose',
       'hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt',
       'inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria', 'ckd'],
      dtype='object')

In [8]:
icustay_details.describe()

Unnamed: 0,subject_id,hadm_id,icustay_id,los_hospital,admission_age,hospital_expire_flag,hospstay_seq,los_icu,icustay_seq,label_death_icu,label_cor_art,diabetes_mellitus,ckd,anemia_flag
count,61051.0,61051.0,61051.0,61051.0,61051.0,61051.0,61051.0,61041.0,61051.0,61051.0,61051.0,61051.0,61051.0,61051.0
mean,33961.698989,149946.928945,249968.598696,11.320283,64.856674,0.107975,1.418568,4.931644,1.070908,0.073774,0.212838,0.170693,0.082849,0.125682
std,28153.637888,28899.070114,28891.923533,14.301661,56.970061,0.310352,1.510997,9.664428,0.301838,0.261406,0.409318,0.376244,0.275656,0.331493
min,2.0,100001.0,200001.0,-0.945139,7e-06,0.0,1.0,0.000139,1.0,0.0,0.0,0.0,0.0,0.0
25%,12085.5,124949.0,224951.0,3.910069,44.281191,0.0,1.0,1.109491,1.0,0.0,0.0,0.0,0.0,0.0
50%,24352.0,149883.0,249949.0,6.945833,62.054949,0.0,1.0,2.094815,1.0,0.0,0.0,0.0,0.0,0.0
75%,54366.0,174997.5,274974.5,13.059722,76.068514,0.0,1.0,4.502199,1.0,0.0,0.0,0.0,0.0,0.0
max,99999.0,199999.0,299999.0,294.660417,311.561027,1.0,41.0,173.072512,7.0,1.0,1.0,1.0,1.0,1.0


#### Setting window length 

In [9]:
WINDOW_LENGTH = 24*2

### Keeping records that are atleast window length

In [10]:
data= icustay_details.copy()
data = data[data.los_icu >= WINDOW_LENGTH/24.0]

In [11]:
filtered_icustay_ids = pd.DataFrame(data['icustay_id'].unique(), columns=['icustay_id'])

In [12]:
# Drop measurements with no belonging icustay_id
pivoted_vital = pivoted_vital.dropna(subset=['icustay_id'])
pivoted_lab = pivoted_lab.dropna(subset=['icustay_id'])

#check the shape 
print(pivoted_vital.shape)

# Cast icustay_id types to int
pivoted_vital['icustay_id'] = pivoted_vital['icustay_id'].astype(int)
pivoted_lab['icustay_id'] = pivoted_lab['icustay_id'].astype(int)

print(len(pivoted_vital['icustay_id'].unique()))

# Keep only values of patients in previously filtered icustay_ids in labs and vitals
pivoted_vital = pivoted_vital.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
pivoted_lab = pivoted_lab.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print(len(pivoted_vital['icustay_id'].unique()))

(9207039, 15)
60491
32100


In [13]:
pivoted_vital.columns

Index(['icustay_id', 'charttime', 'heartrate', 'sysbp', 'diasbp', 'meanbp',
       'resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity',
       'pedaledema', 'appetite_median', 'ckd'],
      dtype='object')

In [14]:
print(pivoted_vital.shape)

(8131206, 15)


In [15]:
len(pivoted_vital['icustay_id'].unique())

32100

In [16]:
len(pivoted_vital['icustay_id'])

8131206

In [17]:
# Min of each lab and vitals
icustay_ids_charttime_min_lab = pivoted_lab[["icustay_id", "charttime"]][pivoted_lab.groupby("icustay_id")["charttime"].rank(ascending=1,method='dense') == 1]
icustay_ids_charttime_min_vital = pivoted_vital[["icustay_id", "charttime"]][pivoted_vital.groupby("icustay_id")["charttime"].rank(ascending=1,method='dense') == 1]
# Min of both combined
icustay_ids_charttime_min_vital_lab = pd.concat([icustay_ids_charttime_min_lab, icustay_ids_charttime_min_vital], ignore_index=True)
icustay_ids_charttime_min_vital_lab = icustay_ids_charttime_min_vital_lab[["icustay_id", "charttime"]][icustay_ids_charttime_min_vital_lab.groupby("icustay_id")["charttime"].rank(ascending=1,method='dense') == 1]

# Max of each lab and vitals
icustay_ids_charttime_max_lab = pivoted_lab[["icustay_id", "charttime"]][pivoted_lab.groupby("icustay_id")["charttime"].rank(ascending=0,method='dense') == 1]
icustay_ids_charttime_max_vital = pivoted_vital[["icustay_id", "charttime"]][pivoted_vital.groupby("icustay_id")["charttime"].rank(ascending=0,method='dense') == 1]
# Max of both combined
icustay_ids_charttime_max_vital_lab = pd.concat([icustay_ids_charttime_max_lab, icustay_ids_charttime_max_vital], ignore_index=True)
icustay_ids_charttime_max_vital_lab = icustay_ids_charttime_max_vital_lab[["icustay_id", "charttime"]][icustay_ids_charttime_max_vital_lab.groupby("icustay_id")["charttime"].rank(ascending=0,method='dense') == 1]


In [18]:
# Find for which icustay_ids there exist at least WINDOW_LENGTH of data
icustay_ids_vital_lab_charttime_min_max = pd.concat([icustay_ids_charttime_max_vital_lab, icustay_ids_charttime_min_vital_lab], ignore_index=True)
time_window = timedelta(days=4, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=WINDOW_LENGTH, weeks=0)
is_time_diff_bigger_window_lab = icustay_ids_vital_lab_charttime_min_max.groupby(['icustay_id'])['charttime'].transform(lambda x: (x.max()-x.min())) >= time_window

icustay_ids_vital_lab_charttime_min_max_filtered = icustay_ids_vital_lab_charttime_min_max[is_time_diff_bigger_window_lab]
print("Unique icu stays in icustay_ids_vital_lab_charttime_min_max_filtered after filtering", icustay_ids_vital_lab_charttime_min_max_filtered['icustay_id'].nunique())

# Keep only icustay ids for which at least WINDOW_LENGTH of data exists
icustay_ids_time_filtered = pd.DataFrame(icustay_ids_vital_lab_charttime_min_max_filtered['icustay_id'].unique(), columns=['icustay_id'])
print("Unique icu stays in icustay_ids_time_filtered: ", icustay_ids_time_filtered['icustay_id'].nunique())

Unique icu stays in icustay_ids_vital_lab_charttime_min_max_filtered after filtering 11276
Unique icu stays in icustay_ids_time_filtered:  11276


In [19]:
filtered_icustay_ids = filtered_icustay_ids.merge(icustay_ids_time_filtered, on='icustay_id', how='inner').drop_duplicates()

In [20]:
demographics_filtered = data.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print("Number of ICU stays demographics: ", demographics_filtered['icustay_id'].nunique())

vital_filtered = pivoted_vital.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print("Number of ICU stays vitals: ", vital_filtered['icustay_id'].nunique())

lab_filtered = pivoted_lab.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print("Number of ICU stays labs: ", lab_filtered['icustay_id'].nunique())

Number of ICU stays demographics:  11276
Number of ICU stays vitals:  11276
Number of ICU stays labs:  11276


In [21]:
demographics_filtered.columns

Index(['subject_id', 'hadm_id', 'icustay_id', 'gender', 'dod', 'admittime',
       'dischtime', 'los_hospital', 'admission_age', 'ethnicity',
       'ethnicity_grouped', 'hospital_expire_flag', 'hospstay_seq',
       'first_hosp_stay', 'intime', 'outtime', 'los_icu', 'icustay_seq',
       'first_icu_stay_current_hosp', 'first_icu_stay_patient',
       'first_careunit', 'deathtime_icu', 'label_death_icu', 'label_cor_art',
       'diabetes_mellitus', 'ckd', 'anemia_flag'],
      dtype='object')

In [22]:
lab_filtered.columns

Index(['icustay_id', 'subject_id', 'charttime', 'aniongap', 'albumin', 'bands',
       'bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose',
       'hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt',
       'inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria', 'ckd'],
      dtype='object')

In [23]:
lab_filtered['icustay_id'].unique()

array([228400, 218958, 241427, ..., 241017, 270667, 224889], dtype=int64)

In [24]:
vital_filtered = vital_filtered.merge(lab_filtered[['icustay_id', 'charttime']], on=['icustay_id', 'charttime'], how='outer').drop_duplicates()
print("Number of ICU stays in lab_filtered: ", vital_filtered['icustay_id'].nunique())
lab_filtered = lab_filtered.merge(vital_filtered[['icustay_id', 'charttime']], on=['icustay_id', 'charttime'], how='outer').drop_duplicates()
print("Number of ICU stays in lab_filtered: ", lab_filtered['icustay_id'].nunique())

Number of ICU stays in lab_filtered:  11276
Number of ICU stays in lab_filtered:  11276


In [25]:
vital_resampled = vital_filtered.copy()

# Resample from the end of the time series (how="last")
vital_resampled = vital_resampled.assign(charttime=vital_resampled.charttime.dt.round('H'))
#vital_resampled = vital_resampled.set_index('charttime').groupby('icustay_id').resample('1H', origin="end").median().drop(['icustay_id'], axis = 1).reset_index()
# Resample from the beginning of the time series
vital_resampled = vital_resampled.set_index('charttime').groupby('icustay_id').resample('1H', origin="start").median().drop(['icustay_id'], axis = 1).reset_index()

# Forward and backwards fill (use lambda function instead of directly applying it to groupby otherwise results from one group are carreid forward to another group...BAD)
# Fill NaNs (-1)
vital_col = vital_resampled.columns.drop(['icustay_id', 'charttime'])
vital_resampled = vital_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[vital_col].transform(lambda x: x.ffill().bfill()).fillna(value=vital_resampled[['icustay_id', 'charttime', 'heartrate', 'sysbp', 'diasbp', 'meanbp','resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity','pedaledema', 'appetite_median']].median()).reset_index()
#.fillna(value=vital_resampled[vital_columns].mean())#.fillna(0)#.fillna(-1)


  vital_resampled = vital_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[vital_col].transform(lambda x: x.ffill().bfill()).fillna(value=vital_resampled[['icustay_id', 'charttime', 'heartrate', 'sysbp', 'diasbp', 'meanbp','resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity','pedaledema', 'appetite_median']].median()).reset_index()


In [26]:
lab_resampled = lab_filtered.copy()
# Cut out minutes and hours, so that the resampling of the 8h takes the same time span as the 1h samples (for vitals)
lab_resampled = lab_resampled.assign(charttime=lab_resampled.charttime.dt.round('H'))
# Resample from the end of the time series 
#lab_resampled = lab_resampled.set_index('charttime').groupby('icustay_id').resample('8h', origin="end").median().drop(['icustay_id'], axis = 1).reset_index()
lab_resampled = lab_resampled.set_index('charttime').groupby('icustay_id').resample('8h', origin="start").median().drop(['icustay_id'], axis = 1).reset_index()

# Forward and backwards fill (use transform instead of direct groupby otherwise results from one group are carreid forward to another group...BAD)
# Fill NaNs (-1 or 0 or mean!?)
lab_col = lab_resampled.columns.drop(['icustay_id', 'charttime'])
lab_resampled = lab_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[lab_col].transform(lambda x: x.ffill().bfill()).fillna(value=lab_resampled[['icustay_id', 'subject_id', 'charttime', 'aniongap', 'albumin', 'bands','bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose','hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt','inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria']].median()).reset_index()

print(lab_resampled.isnull().sum().sum())

  lab_resampled = lab_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[lab_col].transform(lambda x: x.ffill().bfill()).fillna(value=lab_resampled[['icustay_id', 'subject_id', 'charttime', 'aniongap', 'albumin', 'bands','bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose','hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt','inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria']].median()).reset_index()


811


### keep only uptil 4 days data

In [93]:
delta_t_data = timedelta(days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=WINDOW_LENGTH, weeks=0)
demographics_windowed = demographics_filtered.copy()
demographics_windowed['predtime'] = demographics_windowed.intime + delta_t_data
demographics_windowed['delta_t_pred'] = demographics_windowed.outtime - demographics_windowed.predtime

demographics_windowed[['subject_id', 'icustay_id', 'intime', 'predtime', 'delta_t_pred']].head(5)

Unnamed: 0,subject_id,icustay_id,intime,predtime,delta_t_pred
0,58817,228400,2118-05-02 06:24:29,2118-05-04 06:24:29,4 days 06:51:19
1,14137,218958,2132-08-24 17:07:00,2132-08-26 17:07:00,12 days 00:34:00
2,28970,241427,2151-11-07 01:49:18,2151-11-09 01:49:18,29 days 17:57:20
3,2269,272085,2187-12-04 11:35:04,2187-12-06 11:35:04,14 days 02:08:02
4,334,214236,2136-01-16 10:56:48,2136-01-18 10:56:48,12 days 07:21:18


In [94]:
cut_icustay_ids = pd.DataFrame(demographics_windowed['icustay_id'].unique(), columns=['icustay_id'])
print("Number of ICU stays: ", cut_icustay_ids['icustay_id'].count())

vitals_cut = vital_resampled.merge(cut_icustay_ids, on='icustay_id', how='right')
print("Number of ICU stays in vitals_cut: ", vitals_cut['icustay_id'].nunique())

labs_cut = lab_resampled.merge(cut_icustay_ids, on='icustay_id', how='right')
print("Number of ICU stays in labs_cut: ", labs_cut['icustay_id'].nunique())


Number of ICU stays:  11276
Number of ICU stays in vitals_cut:  11276
Number of ICU stays in labs_cut:  11276


In [95]:
print(delta_t_data)

2 days, 0:00:00


In [96]:
vitals_windowed = vital_resampled.merge(demographics_windowed[['icustay_id', 'predtime', 'delta_t_pred']], on='icustay_id', how='right')
vitals_windowed = vitals_windowed[vitals_windowed.charttime < vitals_windowed.predtime]
print("Number of ICU stays in vitals_windowed: ", vitals_windowed['icustay_id'].nunique())

labs_windowed = lab_resampled.merge(demographics_windowed[['icustay_id', 'predtime', 'delta_t_pred']], on='icustay_id', how='right')
labs_windowed = labs_windowed[labs_windowed.charttime < labs_windowed.predtime]
print("Number of ICU stays in labs_windowed: ", labs_windowed['icustay_id'].nunique())

windowed_icustay_ids = pd.DataFrame(pd.concat([vitals_windowed['icustay_id'], labs_windowed['icustay_id']]).unique(), columns=['icustay_id'])
demographics_windowed = demographics_windowed.merge(windowed_icustay_ids, on='icustay_id', how='right')

Number of ICU stays in vitals_windowed:  11271
Number of ICU stays in labs_windowed:  11271


In [97]:
labs_windowed.isna().sum()

icustay_id        0
charttime         0
subject_id        0
aniongap          0
albumin           0
bands             0
bicarbonate       0
bilirubin         0
creatinine        0
chloride          0
glucose           0
hematocrit        0
hemoglobin        0
lactate           0
platelet          0
potassium         0
ptt               0
inr               0
pt                0
sodium            0
bun               0
wbc               0
bacteria          0
ckd             112
predtime          0
delta_t_pred      0
dtype: int64

In [98]:
vitals_windowed.isna().sum()

icustay_id              0
charttime               0
heartrate               0
sysbp                   0
diasbp                  0
meanbp                  0
resprate                0
tempc                   0
spo2                    0
glucose                 0
rbc                     0
specificgravity         0
pedaledema         548170
appetite_median         0
ckd                   895
predtime                0
delta_t_pred            0
dtype: int64

#### using icustay_id from demographics to fill missing ckd in vitals and labs

In [99]:
vitals_windowed['ckd'] = vitals_windowed['icustay_id'].map(demographics_windowed.set_index('icustay_id')['ckd'])

In [100]:
vitals_windowed.isna().sum()

icustay_id              0
charttime               0
heartrate               0
sysbp                   0
diasbp                  0
meanbp                  0
resprate                0
tempc                   0
spo2                    0
glucose                 0
rbc                     0
specificgravity         0
pedaledema         548170
appetite_median         0
ckd                     0
predtime                0
delta_t_pred            0
dtype: int64

In [101]:
labs_windowed['ckd'] = labs_windowed['icustay_id'].map(demographics_windowed.set_index('icustay_id')['ckd'])

In [102]:
labs_windowed.isna().sum()

icustay_id      0
charttime       0
subject_id      0
aniongap        0
albumin         0
bands           0
bicarbonate     0
bilirubin       0
creatinine      0
chloride        0
glucose         0
hematocrit      0
hemoglobin      0
lactate         0
platelet        0
potassium       0
ptt             0
inr             0
pt              0
sodium          0
bun             0
wbc             0
bacteria        0
ckd             0
predtime        0
delta_t_pred    0
dtype: int64

#### Some patients might not have any value for pedaledema and hence we are filling those with -1

In [103]:
demographics_windowed.isna().sum()

subject_id                        0
hadm_id                           0
icustay_id                        0
gender                            0
dod                            6102
admittime                         0
dischtime                         0
los_hospital                      0
admission_age                     0
ethnicity                         0
ethnicity_grouped                 0
hospital_expire_flag              0
hospstay_seq                      0
first_hosp_stay                   0
intime                            0
outtime                           0
los_icu                           0
icustay_seq                       0
first_icu_stay_current_hosp       0
first_icu_stay_patient            0
first_careunit                    0
deathtime_icu                  9800
label_death_icu                   0
label_cor_art                     0
diabetes_mellitus                 0
ckd                               0
anemia_flag                       0
predtime                    

In [104]:
vitals_windowed =vitals_windowed.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[vital_col].transform(lambda x: x.ffill().bfill()).fillna(-1).reset_index()

In [105]:
vitals_windowed.isna().sum()

icustay_id         0
charttime          0
heartrate          0
sysbp              0
diasbp             0
meanbp             0
resprate           0
tempc              0
spo2               0
glucose            0
rbc                0
specificgravity    0
pedaledema         0
appetite_median    0
ckd                0
dtype: int64

In [106]:
print("Number of ICU stays demographics: ", demographics_windowed['icustay_id'].nunique())
print("Number of CKD demographics:")
dd = demographics_windowed[['icustay_id','ckd']].drop_duplicates(subset=['icustay_id'])
print(dd['ckd'].value_counts())

print("Number of ICU stays vitals: ", vitals_windowed['icustay_id'].nunique())
print("Number of CKD vitals:")
dd = vitals_windowed[['icustay_id','ckd']].drop_duplicates(subset=['icustay_id'])
print(dd['ckd'].value_counts())

print("Number of ICU stays labs: ", labs_windowed['icustay_id'].nunique())
print("Number of CKD labs:")
dd = labs_windowed[['icustay_id','ckd']].drop_duplicates(subset=['icustay_id'])
print(dd['ckd'].value_counts())

Number of ICU stays demographics:  11271
Number of CKD demographics:
0    10457
1      814
Name: ckd, dtype: int64
Number of ICU stays vitals:  11271
Number of CKD vitals:
0    10457
1      814
Name: ckd, dtype: int64
Number of ICU stays labs:  11271
Number of CKD labs:
0    10457
1      814
Name: ckd, dtype: int64


# static model- Random Forest

In [107]:
def aggregate_dataframe(df, groupby_key, columns_to_aggregate):
    df = df.replace(-1, np.nan)
    result = df.groupby(groupby_key)[columns_to_aggregate].mean().reset_index()    
    return result

In [108]:
columns_to_merge = ['icustay_id', 'ckd','ethnicity_grouped']

In [109]:
df_cols_vitals = ['heartrate', 'sysbp','diasbp','meanbp','resprate','tempc','spo2','specificgravity','pedaledema','appetite_median']
df_agg_vitals = aggregate_dataframe(vitals_windowed, 'icustay_id', df_cols_vitals)

df_agg_vitals = df_agg_vitals.merge(demographics_windowed[columns_to_merge], on='icustay_id', how='inner')
df_agg_vitals['ckd_ethnicity'] = df_agg_vitals['ckd'].astype(str).str.cat(df_agg_vitals['ethnicity_grouped'].astype(str))

In [110]:
df_cols_labs = ['albumin','bacteria','glucose','bun','creatinine','sodium','potassium','hemoglobin','wbc','hematocrit','platelet','ptt']
df_agg_labs = aggregate_dataframe(labs_windowed, 'icustay_id', df_cols_labs)

df_agg_labs = df_agg_labs.merge(demographics_windowed[columns_to_merge], on='icustay_id', how='inner')
df_agg_labs['ckd_ethnicity'] = df_agg_labs['ckd'].astype(str).str.cat(df_agg_labs['ethnicity_grouped'].astype(str))

In [111]:
df_agg_vitals.head()

Unnamed: 0,icustay_id,heartrate,sysbp,diasbp,meanbp,resprate,tempc,spo2,specificgravity,pedaledema,appetite_median,ckd,ethnicity_grouped,ckd_ethnicity
0,200017,151.041667,120.0,59.0,78.0,20.0,37.099998,98.0,1.02,,3.0,0,white,0white
1,200030,108.78,126.41,67.2,85.685001,22.98,36.876667,95.32,1.02,,3.0,0,black,0black
2,200033,72.489362,120.12766,69.382979,84.425532,15.446809,36.810875,95.382979,1.02,,2.0,0,white,0white
3,200037,139.91,120.0,59.0,78.0,20.0,37.099998,98.0,1.02,,3.0,0,white,0white
4,200039,100.617021,128.755319,49.973404,72.61702,20.053191,37.446808,98.574468,1.02,,3.0,0,white,0white


In [112]:
df_agg_vitals['ckd_ethnicity'].value_counts()

0white               7178
0unknown             1656
0black                902
1white                614
0hispanic             360
0asian                313
1black                 92
1unknown               62
1asian                 21
0alaska_native         21
1hispanic              20
0portuguese            18
0middle_eastern         7
1middle_eastern         3
0pacific_islander       2
1portuguese             1
1alaska_native          1
Name: ckd_ethnicity, dtype: int64

In [113]:
df_agg_labs.head()

Unnamed: 0,icustay_id,albumin,bacteria,glucose,bun,creatinine,sodium,potassium,hemoglobin,wbc,hematocrit,platelet,ptt,ckd,ethnicity_grouped,ckd_ethnicity
0,200017,2.6,2.0,125.0,29.0,1.0,143.333333,4.833333,16.8,5.2,53.1,267.0,35.8,0,white,0white
1,200030,2.6,2.0,155.428571,25.571429,0.942857,139.571429,3.257143,9.842857,13.5,29.15,172.857143,28.8,0,black,0black
2,200033,2.7,1.0,139.833333,17.333333,0.7,135.5,3.866667,13.033333,10.5,37.433333,160.166667,24.3,0,white,0white
3,200037,2.6,2.0,125.0,29.0,1.0,146.0,5.7,9.8,11.4,29.2,208.0,35.8,0,white,0white
4,200039,3.0,2.0,233.5,6.5,0.5,131.833333,3.333333,9.083333,13.591667,26.091667,331.333333,24.733333,0,white,0white


In [114]:
df_agg_vitals.shape

(11271, 14)

In [115]:
df_agg_labs.shape

(11271, 16)

In [116]:
demographics_windowed.shape

(11271, 29)

In [117]:
print("Vitals unique icustay id: ",len(df_agg_vitals['icustay_id'].unique()),"\nLabs unique icustay id: ",len(df_agg_labs['icustay_id'].unique()),"\nDemographics unique icustay id: ",len(demographics_windowed['icustay_id'].unique()))

Vitals unique icustay id:  11271 
Labs unique icustay id:  11271 
Demographics unique icustay id:  11271


In [118]:
df_agg_vitals.head()

Unnamed: 0,icustay_id,heartrate,sysbp,diasbp,meanbp,resprate,tempc,spo2,specificgravity,pedaledema,appetite_median,ckd,ethnicity_grouped,ckd_ethnicity
0,200017,151.041667,120.0,59.0,78.0,20.0,37.099998,98.0,1.02,,3.0,0,white,0white
1,200030,108.78,126.41,67.2,85.685001,22.98,36.876667,95.32,1.02,,3.0,0,black,0black
2,200033,72.489362,120.12766,69.382979,84.425532,15.446809,36.810875,95.382979,1.02,,2.0,0,white,0white
3,200037,139.91,120.0,59.0,78.0,20.0,37.099998,98.0,1.02,,3.0,0,white,0white
4,200039,100.617021,128.755319,49.973404,72.61702,20.053191,37.446808,98.574468,1.02,,3.0,0,white,0white


In [119]:
df_agg_vitals_new=df_agg_vitals.drop(['ckd','ethnicity_grouped','ckd_ethnicity'],axis=1)

In [120]:
df_agg_labs_new=df_agg_labs.drop(['ckd','ethnicity_grouped'],axis=1)

In [121]:
df_agg_vitals_new.head()

Unnamed: 0,icustay_id,heartrate,sysbp,diasbp,meanbp,resprate,tempc,spo2,specificgravity,pedaledema,appetite_median
0,200017,151.041667,120.0,59.0,78.0,20.0,37.099998,98.0,1.02,,3.0
1,200030,108.78,126.41,67.2,85.685001,22.98,36.876667,95.32,1.02,,3.0
2,200033,72.489362,120.12766,69.382979,84.425532,15.446809,36.810875,95.382979,1.02,,2.0
3,200037,139.91,120.0,59.0,78.0,20.0,37.099998,98.0,1.02,,3.0
4,200039,100.617021,128.755319,49.973404,72.61702,20.053191,37.446808,98.574468,1.02,,3.0


In [122]:
df_agg_labs_new.head()

Unnamed: 0,icustay_id,albumin,bacteria,glucose,bun,creatinine,sodium,potassium,hemoglobin,wbc,hematocrit,platelet,ptt,ckd_ethnicity
0,200017,2.6,2.0,125.0,29.0,1.0,143.333333,4.833333,16.8,5.2,53.1,267.0,35.8,0white
1,200030,2.6,2.0,155.428571,25.571429,0.942857,139.571429,3.257143,9.842857,13.5,29.15,172.857143,28.8,0black
2,200033,2.7,1.0,139.833333,17.333333,0.7,135.5,3.866667,13.033333,10.5,37.433333,160.166667,24.3,0white
3,200037,2.6,2.0,125.0,29.0,1.0,146.0,5.7,9.8,11.4,29.2,208.0,35.8,0white
4,200039,3.0,2.0,233.5,6.5,0.5,131.833333,3.333333,9.083333,13.591667,26.091667,331.333333,24.733333,0white


#### Merging all 3 tables together

In [182]:
merged_table = df_agg_labs_new.merge(df_agg_vitals_new, on='icustay_id', how='inner').merge(demographics_windowed, on='icustay_id', how='inner')

In [183]:
merged_table.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'ckd_ethnicity', 'heartrate', 'sysbp', 'diasbp', 'meanbp',
       'resprate', 'tempc', 'spo2', 'specificgravity', 'pedaledema',
       'appetite_median', 'subject_id', 'hadm_id', 'gender', 'dod',
       'admittime', 'dischtime', 'los_hospital', 'admission_age', 'ethnicity',
       'ethnicity_grouped', 'hospital_expire_flag', 'hospstay_seq',
       'first_hosp_stay', 'intime', 'outtime', 'los_icu', 'icustay_seq',
       'first_icu_stay_current_hosp', 'first_icu_stay_patient',
       'first_careunit', 'deathtime_icu', 'label_death_icu', 'label_cor_art',
       'diabetes_mellitus', 'ckd', 'anemia_flag', 'predtime', 'delta_t_pred'],
      dtype='object')

In [184]:
merged_table.head()

Unnamed: 0,icustay_id,albumin,bacteria,glucose,bun,creatinine,sodium,potassium,hemoglobin,wbc,...,first_icu_stay_patient,first_careunit,deathtime_icu,label_death_icu,label_cor_art,diabetes_mellitus,ckd,anemia_flag,predtime,delta_t_pred
0,200017,2.6,2.0,125.0,29.0,1.0,143.333333,4.833333,16.8,5.2,...,True,NICU,NaT,0,0,0,0,0,2138-03-19 21:54:36,55 days 19:18:05
1,200030,2.6,2.0,155.428571,25.571429,0.942857,139.571429,3.257143,9.842857,13.5,...,True,MICU,NaT,0,0,0,0,1,2150-11-15 14:08:02,4 days 03:43:57
2,200033,2.7,1.0,139.833333,17.333333,0.7,135.5,3.866667,13.033333,10.5,...,True,SICU,2198-08-21 11:15:00,1,0,1,0,0,2198-08-09 17:56:17,11 days 21:03:01
3,200037,2.6,2.0,125.0,29.0,1.0,146.0,5.7,9.8,11.4,...,True,NICU,NaT,0,0,0,0,0,2141-08-09 09:29:48,16 days 07:47:08
4,200039,3.0,2.0,233.5,6.5,0.5,131.833333,3.333333,9.083333,13.591667,...,True,CCU,NaT,0,0,1,0,1,2121-12-28 03:25:00,4 days 11:58:00


In [185]:
print("Number of unique icustay_id: ",merged_table['icustay_id'].nunique())
print("Number of CKD vitals:")
dd = merged_table[['icustay_id','ckd']].drop_duplicates(subset=['icustay_id'])
print(dd['ckd'].value_counts())

Number of unique icustay_id:  11271
Number of CKD vitals:
0    10457
1      814
Name: ckd, dtype: int64


#### keeping records that have exactly WINDOW_LENGTH data in both labs and vitals

In [186]:
# Calculate the difference between max and min charttime for labs_windowed
labs_diff = labs_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min())

# Calculate the difference between max and min charttime for vitals_windowed
vitals_diff = vitals_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min())

# Filter the icustay_id where the difference is grater than or equal to Window_length in both labs and vitals
filtered_icustay_ids = labs_diff[(labs_diff == pd.Timedelta(hours=WINDOW_LENGTH)) & (vitals_diff == pd.Timedelta(hours=WINDOW_LENGTH))].index.tolist()

# Print the length of icustay_id
print(len(filtered_icustay_ids))

4011


In [187]:
merged_table= merged_table[merged_table['icustay_id'].isin(filtered_icustay_ids)]

In [188]:
merged_table.shape

(4011, 52)

In [189]:
merged_table['ckd'].value_counts()

0    3778
1     233
Name: ckd, dtype: int64

In [190]:
merged_table.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'ckd_ethnicity', 'heartrate', 'sysbp', 'diasbp', 'meanbp',
       'resprate', 'tempc', 'spo2', 'specificgravity', 'pedaledema',
       'appetite_median', 'subject_id', 'hadm_id', 'gender', 'dod',
       'admittime', 'dischtime', 'los_hospital', 'admission_age', 'ethnicity',
       'ethnicity_grouped', 'hospital_expire_flag', 'hospstay_seq',
       'first_hosp_stay', 'intime', 'outtime', 'los_icu', 'icustay_seq',
       'first_icu_stay_current_hosp', 'first_icu_stay_patient',
       'first_careunit', 'deathtime_icu', 'label_death_icu', 'label_cor_art',
       'diabetes_mellitus', 'ckd', 'anemia_flag', 'predtime', 'delta_t_pred'],
      dtype='object')

#### Dropping other irrelevent columns

In [191]:
merged_table=merged_table.drop(['subject_id','hadm_id','dod','admittime', 'dischtime','los_hospital','ethnicity','hospital_expire_flag','hospstay_seq', 'first_hosp_stay', 'intime','outtime', 'los_icu', 'icustay_seq', 'first_icu_stay_current_hosp','first_icu_stay_patient', 'first_careunit', 'deathtime_icu','label_death_icu', 'predtime', 'delta_t_pred'],axis=1)

In [192]:
merged_table.head()

Unnamed: 0,icustay_id,albumin,bacteria,glucose,bun,creatinine,sodium,potassium,hemoglobin,wbc,...,specificgravity,pedaledema,appetite_median,gender,admission_age,ethnicity_grouped,label_cor_art,diabetes_mellitus,ckd,anemia_flag
5,200045,2.5,2.0,66.571429,16.571429,0.742857,145.714286,4.171429,10.942857,17.714286,...,1.02,,3.0,F,73.941807,white,0,1,0,0
6,200046,2.6,2.0,125.0,29.0,1.0,141.857143,4.757143,16.757143,2.907143,...,1.02,,3.0,F,0.001811,black,0,0,0,0
7,200053,3.2,1.0,140.642857,31.0,3.45,141.5,4.857143,10.185714,9.528571,...,1.02,,3.0,M,40.243953,unknown,0,0,1,1
11,200065,2.171429,2.0,185.571429,31.142857,3.192857,140.0,4.278571,8.35,39.3,...,1.02,,3.0,F,45.112386,white,0,0,0,1
13,200077,3.0,2.0,233.428571,60.571429,3.585714,137.142857,4.771429,8.228571,11.414286,...,1.02,,3.0,M,75.27265,unknown,1,0,0,0


In [193]:
merged_table.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'ckd_ethnicity', 'heartrate', 'sysbp', 'diasbp', 'meanbp',
       'resprate', 'tempc', 'spo2', 'specificgravity', 'pedaledema',
       'appetite_median', 'gender', 'admission_age', 'ethnicity_grouped',
       'label_cor_art', 'diabetes_mellitus', 'ckd', 'anemia_flag'],
      dtype='object')

#### Dropping rows which have ethnicity_grouped "middle_eastern", "portuguese", "alaska_native", "pacific_islander"

In [194]:
ethnicities_to_drop = ["middle_eastern", "portuguese", "alaska_native", "pacific_islander"]
merged_table = merged_table[~merged_table['ethnicity_grouped'].isin(ethnicities_to_drop)]

In [195]:
merged_table['ckd_ethnicity'].value_counts()

0white       2511
0unknown      684
0black        321
1white        182
0hispanic     128
0asian        118
1unknown       23
1black         19
1hispanic       5
1asian          4
Name: ckd_ethnicity, dtype: int64

Removing pedaledema because it has 8365 missing values out of total 8405 rows

In [196]:
merged_table=merged_table.drop('pedaledema',axis=1)

In [197]:
merged_table.isna().sum().sum()

0

#### Grouping Ages 

In [198]:
age_ranges = [0, 9, 19, 29, 39, 49, 59, 69, 79, 89, 400]

age_labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90+']

merged_table['age_group'] = pd.cut(merged_table['admission_age'], bins=age_ranges, labels=age_labels, right=False)

In [199]:
merged_table.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'ckd_ethnicity', 'heartrate', 'sysbp', 'diasbp', 'meanbp',
       'resprate', 'tempc', 'spo2', 'specificgravity', 'appetite_median',
       'gender', 'admission_age', 'ethnicity_grouped', 'label_cor_art',
       'diabetes_mellitus', 'ckd', 'anemia_flag', 'age_group'],
      dtype='object')

In [200]:
merged_table.head()

Unnamed: 0,icustay_id,albumin,bacteria,glucose,bun,creatinine,sodium,potassium,hemoglobin,wbc,...,specificgravity,appetite_median,gender,admission_age,ethnicity_grouped,label_cor_art,diabetes_mellitus,ckd,anemia_flag,age_group
5,200045,2.5,2.0,66.571429,16.571429,0.742857,145.714286,4.171429,10.942857,17.714286,...,1.02,3.0,F,73.941807,white,0,1,0,0,70-79
6,200046,2.6,2.0,125.0,29.0,1.0,141.857143,4.757143,16.757143,2.907143,...,1.02,3.0,F,0.001811,black,0,0,0,0,0-9
7,200053,3.2,1.0,140.642857,31.0,3.45,141.5,4.857143,10.185714,9.528571,...,1.02,3.0,M,40.243953,unknown,0,0,1,1,40-49
11,200065,2.171429,2.0,185.571429,31.142857,3.192857,140.0,4.278571,8.35,39.3,...,1.02,3.0,F,45.112386,white,0,0,0,1,40-49
13,200077,3.0,2.0,233.428571,60.571429,3.585714,137.142857,4.771429,8.228571,11.414286,...,1.02,3.0,M,75.27265,unknown,1,0,0,0,70-79


In [201]:
merged_table['age_group']

5        70-79
6          0-9
7        40-49
11       40-49
13       70-79
         ...  
11256      0-9
11260    60-69
11264    50-59
11265    50-59
11266      0-9
Name: age_group, Length: 3995, dtype: category
Categories (10, object): ['0-9' < '10-19' < '20-29' < '30-39' ... '60-69' < '70-79' < '80-89' < '90+']

In [202]:
merged_table=merged_table.drop('admission_age',axis=1)

#### Train Test Split based solely on ckd column

In [203]:
X= merged_table.drop(['ckd','ckd_ethnicity'],axis=1)
y=merged_table['ckd']

In [204]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, stratify=y)

In [205]:
y_train.value_counts()

0    2821
1     175
Name: ckd, dtype: int64

In [206]:
y_test.value_counts()

0    941
1     58
Name: ckd, dtype: int64

#### Random Under/Over Sampling

In [318]:
'''
rus = RandomUnderSampler(random_state=42)
X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
'''
from imblearn.over_sampling import RandomOverSampler
from imblearn.over_sampling import SMOTE
rus = SMOTE(random_state=42)
X_train_rus= X_train
y_train_rus=y_train
#X_train_rus, y_train_rus = rus.fit_resample(X_train, y_train)
'''
TEST SCORES(UNDER SAMPLING): 
With eth - 
Precision: 0.14551083591331268
Recall: 0.8103448275862069
F1 Score: 0.24671916010498685
Accuracy: 0.7127127127127127
CV F1- 

w/o ethnicity-
Precision: 0.1402439024390244
Recall: 0.7931034482758621
F1 Score: 0.2383419689119171
Accuracy: 0.7057057057057057
cm = confusion_matrix(y_test,y_pre


TEST SCORES (OVER SAMPLING)
with eth- 
Precision: 0.15492957746478872
Recall: 0.7586206896551724
F1 Score: 0.2573099415204678
Accuracy: 0.7457457457457457

without eth-
Precision: 0.14685314685314685
Recall: 0.7241379310344828
F1 Score: 0.2441860465116279
Accuracy: 0.7397397397397397

TEST SCORES (SMOTE)
cv is highest-- 0.90 with eth, 0.89 without eth
with eth-
Precision: 0.17777777777777778
Recall: 0.6896551724137931
F1 Score: 0.2826855123674912
Accuracy: 0.7967967967967968

without eth- 
Precision: 0.17777777777777778
Recall: 0.6896551724137931
F1 Score: 0.2826855123674912
Accuracy: 0.7967967967967968
'''

'\nTEST SCORES(UNDER SAMPLING): \nWith eth - \nPrecision: 0.14551083591331268\nRecall: 0.8103448275862069\nF1 Score: 0.24671916010498685\nAccuracy: 0.7127127127127127\nCV F1- \n\nw/o ethnicity-\nPrecision: 0.1402439024390244\nRecall: 0.7931034482758621\nF1 Score: 0.2383419689119171\nAccuracy: 0.7057057057057057\ncm = confusion_matrix(y_test,y_pre\n\n\nTEST SCORES (OVER SAMPLING)\nwith eth- \nPrecision: 0.15492957746478872\nRecall: 0.7586206896551724\nF1 Score: 0.2573099415204678\nAccuracy: 0.7457457457457457\n\nwithout eth-\nPrecision: 0.14685314685314685\nRecall: 0.7241379310344828\nF1 Score: 0.2441860465116279\nAccuracy: 0.7397397397397397\n'

In [319]:
 y_train_rus.value_counts()

0    2821
1     175
Name: ckd, dtype: int64

In [320]:
X_train_rus.shape

(2996, 28)

#### sepearating ckd_ethnicity again now that it has been stratified 

In [321]:
X_train_rus.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc',
       'spo2', 'specificgravity', 'appetite_median', 'gender',
       'ethnicity_grouped', 'label_cor_art', 'diabetes_mellitus',
       'anemia_flag', 'age_group'],
      dtype='object')

In [322]:
X_train_rus = X_train_rus.merge(merged_table[['icustay_id', 'ckd']], on='icustay_id', how='inner')

In [323]:
X_train_rus.shape

(2996, 29)

In [324]:
X_train_rus.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc',
       'spo2', 'specificgravity', 'appetite_median', 'gender',
       'ethnicity_grouped', 'label_cor_art', 'diabetes_mellitus',
       'anemia_flag', 'age_group', 'ckd'],
      dtype='object')

In [325]:
y_train_rus= X_train_rus['ckd']
X_train_rus= X_train_rus.drop('ckd',axis=1)

In [326]:
y_train_rus.value_counts()

0    2821
1     175
Name: ckd, dtype: int64

In [327]:
stratified_icustay_id_train=X_train_rus['icustay_id'].unique()
stratified_icustay_id_test=X_test['icustay_id'].unique()

In [328]:
X_train_rus=X_train_rus.drop('icustay_id',axis=1)

#### Encoding

In [329]:
X_onehot_train = pd.get_dummies(X_train_rus)

In [330]:
X_onehot_train.columns

Index(['albumin', 'bacteria', 'glucose', 'bun', 'creatinine', 'sodium',
       'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet', 'ptt',
       'heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc', 'spo2',
       'specificgravity', 'appetite_median', 'label_cor_art',
       'diabetes_mellitus', 'anemia_flag', 'gender_F', 'gender_M',
       'ethnicity_grouped_asian', 'ethnicity_grouped_black',
       'ethnicity_grouped_hispanic', 'ethnicity_grouped_unknown',
       'ethnicity_grouped_white', 'age_group_0-9', 'age_group_10-19',
       'age_group_20-29', 'age_group_30-39', 'age_group_40-49',
       'age_group_50-59', 'age_group_60-69', 'age_group_70-79',
       'age_group_80-89', 'age_group_90+'],
      dtype='object')

In [331]:
X_onehot_train, y_train_rus = rus.fit_resample(X_onehot_train, y_train_rus)

#### Random Forest Model

In [332]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

In [333]:
rf = RandomForestClassifier()

In [335]:
grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='f1')
grid_search.fit(X_onehot_train, y_train_rus)

GridSearchCV(cv=5, estimator=RandomForestClassifier(),
             param_grid={'max_depth': [None, 5, 10],
                         'max_features': ['sqrt', 'log2'],
                         'min_samples_leaf': [1, 2, 4],
                         'min_samples_split': [2, 5, 10],
                         'n_estimators': [100, 200, 300]},
             scoring='f1')

In [291]:
grid_search.best_estimator_

RandomForestClassifier(max_depth=5, max_features='sqrt')

In [336]:
grid_search.best_score_

0.975842533898622

#### Using best model with cross validation with ethnicity

In [337]:
rf_same = RandomForestClassifier(max_depth=5, max_features='sqrt')
cv_scores = cross_val_score(rf_same, X_onehot_train, y_train_rus, cv=5, scoring='f1')
cv_scores.mean()

0.900386470081747

#### Now same model, without ethnicity 

In [338]:
rf_same = RandomForestClassifier(max_depth=5, max_features='sqrt')

In [339]:
X_train_no_eth= X_onehot_train.drop(['ethnicity_grouped_asian', 'ethnicity_grouped_black',
       'ethnicity_grouped_hispanic', 'ethnicity_grouped_unknown',
       'ethnicity_grouped_white'],axis=1)

In [340]:
X_train_no_eth.columns

Index(['albumin', 'bacteria', 'glucose', 'bun', 'creatinine', 'sodium',
       'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet', 'ptt',
       'heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc', 'spo2',
       'specificgravity', 'appetite_median', 'label_cor_art',
       'diabetes_mellitus', 'anemia_flag', 'gender_F', 'gender_M',
       'age_group_0-9', 'age_group_10-19', 'age_group_20-29',
       'age_group_30-39', 'age_group_40-49', 'age_group_50-59',
       'age_group_60-69', 'age_group_70-79', 'age_group_80-89',
       'age_group_90+'],
      dtype='object')

In [341]:
cv_scores = cross_val_score(rf_same, X_train_no_eth, y_train_rus, cv=5, scoring='f1')

In [342]:
cv_scores

array([0.89795918, 0.90285714, 0.88439774, 0.89473684, 0.90757702])

In [343]:
cv_scores.mean()

0.8975055848491517

#### Test Score with ethnicity

In [344]:
rf_same.fit(X_onehot_train,y_train_rus)

RandomForestClassifier(max_depth=5, max_features='sqrt')

In [345]:
X_test_onehot=pd.get_dummies(X_test)

In [346]:
X_test_onehot.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc',
       'spo2', 'specificgravity', 'appetite_median', 'label_cor_art',
       'diabetes_mellitus', 'anemia_flag', 'gender_F', 'gender_M',
       'ethnicity_grouped_asian', 'ethnicity_grouped_black',
       'ethnicity_grouped_hispanic', 'ethnicity_grouped_unknown',
       'ethnicity_grouped_white', 'age_group_0-9', 'age_group_10-19',
       'age_group_20-29', 'age_group_30-39', 'age_group_40-49',
       'age_group_50-59', 'age_group_60-69', 'age_group_70-79',
       'age_group_80-89', 'age_group_90+'],
      dtype='object')

In [347]:
X_test_onehot = X_test_onehot.merge(merged_table[['icustay_id', 'ckd']], on='icustay_id', how='inner')

In [348]:
y_test= X_test_onehot['ckd']
X_test_onehot= X_test_onehot.drop('ckd',axis=1)

In [349]:
X_test_onehot=X_test_onehot.drop('icustay_id',axis=1)

In [350]:
y_pred=rf_same.predict(X_test_onehot)

In [351]:
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

In [352]:
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Precision: 0.17777777777777778
Recall: 0.6896551724137931
F1 Score: 0.2826855123674912
Accuracy: 0.7967967967967968


In [353]:
#Observation 188 means high false positive but very low false negative which is good, true positive are very high too

#The diagonal elements of the matrix represent the number of correctly classified samples (true positives and true negatives), while the off-diagonal elements represent the number of misclassified samples (false positives and false negatives).

cm = confusion_matrix(y_test, y_pred)
print(cm)

[[756 185]
 [ 18  40]]


#### Test score without ethnicity

In [354]:
X_test_onehot_noeth=X_test_onehot.drop(['ethnicity_grouped_asian', 'ethnicity_grouped_black',
       'ethnicity_grouped_hispanic', 'ethnicity_grouped_unknown',
       'ethnicity_grouped_white'],axis=1)

In [355]:
rf_same.fit(X_train_no_eth,y_train_rus)

RandomForestClassifier(max_depth=5, max_features='sqrt')

In [356]:
y_pred_noeth=rf_same.predict(X_test_onehot_noeth)

In [357]:
precision = precision_score(y_test, y_pred_noeth)
recall = recall_score(y_test, y_pred_noeth)
f1 = f1_score(y_test, y_pred_noeth)
accuracy = accuracy_score(y_test, y_pred_noeth)

In [358]:
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("Accuracy:", accuracy)

Precision: 0.17721518987341772
Recall: 0.7241379310344828
F1 Score: 0.2847457627118644
Accuracy: 0.7887887887887888


In [267]:
cm = confusion_matrix(y_test,y_pred_noeth)
print(cm)

[[659 282]
 [ 12  46]]


# Time Series

In [682]:
labs_windowed

Unnamed: 0,icustay_id,charttime,subject_id,aniongap,albumin,bands,bicarbonate,bilirubin,creatinine,chloride,...,ptt,inr,pt,sodium,bun,wbc,bacteria,ckd,predtime,delta_t_pred
0,228400,2118-05-02 10:00:00,58817.0,13.0,4.1,3.0,23.0,0.7,0.90,108.0,...,19.10,1.0,12.1,140.0,11.0,11.6,2.0,0,2118-05-04 06:24:29,4 days 06:51:19
1,228400,2118-05-02 18:00:00,58817.0,13.0,4.1,3.0,23.0,0.7,0.90,108.0,...,19.10,1.0,12.1,140.0,11.0,11.6,2.0,0,2118-05-04 06:24:29,4 days 06:51:19
2,228400,2118-05-03 02:00:00,58817.0,13.0,4.1,3.0,22.0,0.7,0.90,106.0,...,19.10,1.0,12.1,138.0,14.0,11.6,2.0,0,2118-05-04 06:24:29,4 days 06:51:19
3,228400,2118-05-03 10:00:00,58817.0,13.0,4.1,3.0,22.0,0.7,0.90,106.0,...,19.10,1.0,12.1,138.0,14.0,11.6,2.0,0,2118-05-04 06:24:29,4 days 06:51:19
4,228400,2118-05-03 18:00:00,58817.0,12.0,3.6,3.0,20.0,0.7,0.80,110.0,...,19.80,1.1,12.7,139.0,18.0,9.3,2.0,0,2118-05-04 06:24:29,4 days 06:51:19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
596081,224889,2114-03-12 17:00:00,26649.0,17.0,2.2,2.0,18.0,0.5,0.95,108.0,...,36.50,1.2,13.3,138.0,19.5,6.4,2.0,0,2114-03-14 01:26:12,21 days 16:15:36
596082,224889,2114-03-13 01:00:00,26649.0,15.0,2.2,2.0,20.0,0.7,1.00,105.0,...,68.70,1.9,17.1,135.0,20.0,8.8,2.0,0,2114-03-14 01:26:12,21 days 16:15:36
596083,224889,2114-03-13 09:00:00,26649.0,15.5,2.5,2.0,22.0,0.8,1.05,106.0,...,104.55,2.0,17.2,139.5,19.0,7.5,2.0,0,2114-03-14 01:26:12,21 days 16:15:36
596084,224889,2114-03-13 17:00:00,26649.0,15.5,2.5,2.0,22.0,0.8,1.05,106.0,...,51.80,1.4,14.5,139.5,19.0,7.1,2.0,0,2114-03-14 01:26:12,21 days 16:15:36


In [683]:
labs_windowed[labs_windowed['icustay_id']==218958]

Unnamed: 0,icustay_id,charttime,subject_id,aniongap,albumin,bands,bicarbonate,bilirubin,creatinine,chloride,...,ptt,inr,pt,sodium,bun,wbc,bacteria,ckd,predtime,delta_t_pred
19,218958,2132-08-24 16:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,35.8,1.3,14.5,139.0,29.0,11.4,2.0,0,2132-08-26 17:07:00,12 days 00:34:00
20,218958,2132-08-25 00:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,35.8,1.3,14.5,139.0,29.0,11.4,2.0,0,2132-08-26 17:07:00,12 days 00:34:00
21,218958,2132-08-25 08:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,35.8,1.3,14.5,139.0,29.0,11.4,2.0,0,2132-08-26 17:07:00,12 days 00:34:00
22,218958,2132-08-25 16:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,35.8,1.3,14.5,139.0,29.0,11.4,2.0,0,2132-08-26 17:07:00,12 days 00:34:00
23,218958,2132-08-26 00:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,35.8,1.3,14.5,139.0,29.0,11.4,2.0,0,2132-08-26 17:07:00,12 days 00:34:00
24,218958,2132-08-26 08:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,35.8,1.3,14.5,139.0,29.0,11.4,2.0,0,2132-08-26 17:07:00,12 days 00:34:00
25,218958,2132-08-26 16:00:00,14137.0,13.0,2.6,3.0,25.0,8.5,1.0,105.0,...,35.8,1.3,14.5,139.0,29.0,11.4,2.0,0,2132-08-26 17:07:00,12 days 00:34:00


In [684]:
labs_windowed[labs_windowed['icustay_id']==224889]

Unnamed: 0,icustay_id,charttime,subject_id,aniongap,albumin,bands,bicarbonate,bilirubin,creatinine,chloride,...,ptt,inr,pt,sodium,bun,wbc,bacteria,ckd,predtime,delta_t_pred
596079,224889,2114-03-12 01:00:00,26649.0,13.0,1.8,2.0,18.0,0.5,0.8,112.0,...,36.5,1.2,13.3,139.0,23.0,2.7,2.0,0,2114-03-14 01:26:12,21 days 16:15:36
596080,224889,2114-03-12 09:00:00,26649.0,16.0,1.8,2.0,17.0,0.5,0.85,109.5,...,36.5,1.2,13.3,137.5,20.5,4.45,2.0,0,2114-03-14 01:26:12,21 days 16:15:36
596081,224889,2114-03-12 17:00:00,26649.0,17.0,2.2,2.0,18.0,0.5,0.95,108.0,...,36.5,1.2,13.3,138.0,19.5,6.4,2.0,0,2114-03-14 01:26:12,21 days 16:15:36
596082,224889,2114-03-13 01:00:00,26649.0,15.0,2.2,2.0,20.0,0.7,1.0,105.0,...,68.7,1.9,17.1,135.0,20.0,8.8,2.0,0,2114-03-14 01:26:12,21 days 16:15:36
596083,224889,2114-03-13 09:00:00,26649.0,15.5,2.5,2.0,22.0,0.8,1.05,106.0,...,104.55,2.0,17.2,139.5,19.0,7.5,2.0,0,2114-03-14 01:26:12,21 days 16:15:36
596084,224889,2114-03-13 17:00:00,26649.0,15.5,2.5,2.0,22.0,0.8,1.05,106.0,...,51.8,1.4,14.5,139.5,19.0,7.1,2.0,0,2114-03-14 01:26:12,21 days 16:15:36
596085,224889,2114-03-14 01:00:00,26649.0,12.0,2.5,2.0,26.0,1.0,1.0,107.0,...,44.3,1.3,13.9,142.0,18.0,7.6,2.0,0,2114-03-14 01:26:12,21 days 16:15:36


In [685]:
#Observation: 1 patient has approximately half of the data that we actually need
for icu_id in labs_windowed['icustay_id'].unique():
    icu_data =  labs_windowed[labs_windowed['icustay_id'] == icu_id]
    diff = icu_data['charttime'].max() - icu_data['charttime'].min()
    if diff < pd.Timedelta(days=3):
        print(f"ICU stay ID {icu_id}: {diff}")

ICU stay ID 228400: 1 days 16:00:00
ICU stay ID 218958: 2 days 00:00:00
ICU stay ID 241427: 2 days 00:00:00
ICU stay ID 272085: 1 days 16:00:00
ICU stay ID 214236: 1 days 16:00:00
ICU stay ID 285731: 2 days 00:00:00
ICU stay ID 284866: 1 days 16:00:00
ICU stay ID 205010: 2 days 00:00:00
ICU stay ID 282113: 2 days 00:00:00
ICU stay ID 286333: 1 days 16:00:00
ICU stay ID 241223: 2 days 00:00:00
ICU stay ID 282673: 1 days 16:00:00
ICU stay ID 268471: 2 days 00:00:00
ICU stay ID 280534: 1 days 16:00:00
ICU stay ID 237632: 2 days 00:00:00
ICU stay ID 216011: 1 days 16:00:00
ICU stay ID 202419: 1 days 16:00:00
ICU stay ID 277259: 1 days 16:00:00
ICU stay ID 269229: 1 days 16:00:00
ICU stay ID 214619: 1 days 16:00:00
ICU stay ID 217590: 1 days 16:00:00
ICU stay ID 222038: 2 days 00:00:00
ICU stay ID 210325: 1 days 16:00:00
ICU stay ID 245719: 1 days 16:00:00
ICU stay ID 200580: 2 days 00:00:00
ICU stay ID 289655: 2 days 00:00:00
ICU stay ID 260971: 1 days 16:00:00
ICU stay ID 297937: 2 days 0

ICU stay ID 235810: 2 days 00:00:00
ICU stay ID 273970: 2 days 00:00:00
ICU stay ID 252285: 2 days 00:00:00
ICU stay ID 284950: 2 days 00:00:00
ICU stay ID 283889: 1 days 16:00:00
ICU stay ID 253641: 1 days 16:00:00
ICU stay ID 277242: 2 days 00:00:00
ICU stay ID 219136: 1 days 16:00:00
ICU stay ID 206384: 1 days 16:00:00
ICU stay ID 253822: 2 days 00:00:00
ICU stay ID 200704: 2 days 00:00:00
ICU stay ID 247316: 1 days 16:00:00
ICU stay ID 233796: 1 days 16:00:00
ICU stay ID 229665: 2 days 00:00:00
ICU stay ID 222447: 1 days 16:00:00
ICU stay ID 295819: 1 days 16:00:00
ICU stay ID 286239: 2 days 00:00:00
ICU stay ID 262081: 1 days 16:00:00
ICU stay ID 287735: 2 days 00:00:00
ICU stay ID 247609: 2 days 00:00:00
ICU stay ID 214338: 2 days 00:00:00
ICU stay ID 258958: 2 days 00:00:00
ICU stay ID 262980: 2 days 00:00:00
ICU stay ID 237290: 2 days 00:00:00
ICU stay ID 263034: 1 days 16:00:00
ICU stay ID 266070: 1 days 16:00:00
ICU stay ID 299264: 1 days 16:00:00
ICU stay ID 284649: 1 days 1

ICU stay ID 261029: 1 days 16:00:00
ICU stay ID 249137: 1 days 16:00:00
ICU stay ID 223684: 1 days 16:00:00
ICU stay ID 252273: 2 days 00:00:00
ICU stay ID 244463: 2 days 00:00:00
ICU stay ID 267728: 1 days 16:00:00
ICU stay ID 232004: 2 days 00:00:00
ICU stay ID 203732: 2 days 00:00:00
ICU stay ID 276263: 1 days 16:00:00
ICU stay ID 261571: 2 days 00:00:00
ICU stay ID 289752: 2 days 00:00:00
ICU stay ID 224018: 1 days 16:00:00
ICU stay ID 291309: 1 days 16:00:00
ICU stay ID 207413: 2 days 00:00:00
ICU stay ID 201936: 1 days 16:00:00
ICU stay ID 277747: 2 days 00:00:00
ICU stay ID 296702: 1 days 16:00:00
ICU stay ID 264683: 2 days 00:00:00
ICU stay ID 277402: 2 days 00:00:00
ICU stay ID 255814: 2 days 00:00:00
ICU stay ID 240529: 1 days 16:00:00
ICU stay ID 219283: 1 days 16:00:00
ICU stay ID 272049: 2 days 00:00:00
ICU stay ID 275475: 1 days 16:00:00
ICU stay ID 239682: 1 days 16:00:00
ICU stay ID 274493: 2 days 00:00:00
ICU stay ID 276879: 2 days 00:00:00
ICU stay ID 233148: 2 days 0

ICU stay ID 248689: 2 days 00:00:00
ICU stay ID 257922: 1 days 16:00:00
ICU stay ID 211988: 2 days 00:00:00
ICU stay ID 275893: 1 days 16:00:00
ICU stay ID 220094: 1 days 16:00:00
ICU stay ID 248124: 2 days 00:00:00
ICU stay ID 255197: 1 days 16:00:00
ICU stay ID 223263: 1 days 16:00:00
ICU stay ID 296868: 1 days 16:00:00
ICU stay ID 238053: 1 days 16:00:00
ICU stay ID 286453: 1 days 16:00:00
ICU stay ID 247952: 2 days 00:00:00
ICU stay ID 242897: 1 days 16:00:00
ICU stay ID 292491: 1 days 16:00:00
ICU stay ID 212619: 1 days 16:00:00
ICU stay ID 222825: 2 days 00:00:00
ICU stay ID 240977: 2 days 00:00:00
ICU stay ID 223890: 2 days 00:00:00
ICU stay ID 270684: 2 days 00:00:00
ICU stay ID 248177: 2 days 00:00:00
ICU stay ID 223200: 2 days 00:00:00
ICU stay ID 270520: 2 days 00:00:00
ICU stay ID 273348: 1 days 16:00:00
ICU stay ID 207353: 2 days 00:00:00
ICU stay ID 230563: 1 days 16:00:00
ICU stay ID 264207: 1 days 16:00:00
ICU stay ID 269909: 2 days 00:00:00
ICU stay ID 201234: 1 days 1

ICU stay ID 215189: 1 days 16:00:00
ICU stay ID 232317: 2 days 00:00:00
ICU stay ID 240770: 1 days 16:00:00
ICU stay ID 269623: 1 days 16:00:00
ICU stay ID 275835: 1 days 16:00:00
ICU stay ID 234413: 2 days 00:00:00
ICU stay ID 234617: 2 days 00:00:00
ICU stay ID 264466: 1 days 16:00:00
ICU stay ID 279540: 1 days 16:00:00
ICU stay ID 258214: 2 days 00:00:00
ICU stay ID 282050: 1 days 16:00:00
ICU stay ID 247586: 2 days 00:00:00
ICU stay ID 262557: 2 days 00:00:00
ICU stay ID 202294: 1 days 16:00:00
ICU stay ID 244388: 1 days 16:00:00
ICU stay ID 221300: 1 days 16:00:00
ICU stay ID 220778: 2 days 00:00:00
ICU stay ID 297773: 2 days 00:00:00
ICU stay ID 295511: 1 days 16:00:00
ICU stay ID 220051: 1 days 16:00:00
ICU stay ID 241318: 2 days 00:00:00
ICU stay ID 299956: 2 days 00:00:00
ICU stay ID 269109: 2 days 00:00:00
ICU stay ID 210032: 1 days 16:00:00
ICU stay ID 210420: 2 days 00:00:00
ICU stay ID 263690: 1 days 16:00:00
ICU stay ID 291185: 1 days 16:00:00
ICU stay ID 223243: 1 days 1

ICU stay ID 219619: 1 days 16:00:00
ICU stay ID 286927: 1 days 16:00:00
ICU stay ID 211319: 2 days 00:00:00
ICU stay ID 262843: 2 days 00:00:00
ICU stay ID 241660: 2 days 00:00:00
ICU stay ID 258571: 2 days 00:00:00
ICU stay ID 208381: 2 days 00:00:00
ICU stay ID 216651: 2 days 00:00:00
ICU stay ID 231538: 2 days 00:00:00
ICU stay ID 241174: 1 days 16:00:00
ICU stay ID 288360: 1 days 16:00:00
ICU stay ID 247194: 2 days 00:00:00
ICU stay ID 201457: 1 days 16:00:00
ICU stay ID 299387: 1 days 16:00:00
ICU stay ID 210731: 2 days 00:00:00
ICU stay ID 296476: 2 days 00:00:00
ICU stay ID 230858: 1 days 16:00:00
ICU stay ID 250015: 1 days 16:00:00
ICU stay ID 204958: 2 days 00:00:00
ICU stay ID 262941: 2 days 00:00:00
ICU stay ID 206448: 1 days 16:00:00
ICU stay ID 253716: 2 days 00:00:00
ICU stay ID 258667: 1 days 16:00:00
ICU stay ID 268368: 1 days 16:00:00
ICU stay ID 231384: 1 days 16:00:00
ICU stay ID 206981: 1 days 16:00:00
ICU stay ID 237121: 2 days 00:00:00
ICU stay ID 214090: 2 days 0

ICU stay ID 203577: 1 days 16:00:00
ICU stay ID 238805: 1 days 16:00:00
ICU stay ID 237881: 2 days 00:00:00
ICU stay ID 224515: 2 days 00:00:00
ICU stay ID 223421: 1 days 16:00:00
ICU stay ID 299540: 2 days 00:00:00
ICU stay ID 232778: 1 days 16:00:00
ICU stay ID 225415: 1 days 16:00:00
ICU stay ID 229497: 1 days 16:00:00
ICU stay ID 268047: 2 days 00:00:00
ICU stay ID 228782: 2 days 00:00:00
ICU stay ID 291544: 2 days 00:00:00
ICU stay ID 227392: 1 days 16:00:00
ICU stay ID 228462: 2 days 00:00:00
ICU stay ID 274342: 2 days 00:00:00
ICU stay ID 270901: 1 days 16:00:00
ICU stay ID 201678: 2 days 00:00:00
ICU stay ID 249237: 1 days 16:00:00
ICU stay ID 271724: 2 days 00:00:00
ICU stay ID 246244: 2 days 00:00:00
ICU stay ID 225723: 2 days 00:00:00
ICU stay ID 201820: 1 days 16:00:00
ICU stay ID 240698: 1 days 16:00:00
ICU stay ID 281298: 1 days 16:00:00
ICU stay ID 296584: 2 days 00:00:00
ICU stay ID 251517: 1 days 16:00:00
ICU stay ID 207949: 2 days 00:00:00
ICU stay ID 206856: 2 days 0

ICU stay ID 203320: 1 days 16:00:00
ICU stay ID 260021: 1 days 16:00:00
ICU stay ID 205189: 1 days 16:00:00
ICU stay ID 267680: 2 days 00:00:00
ICU stay ID 288671: 2 days 00:00:00
ICU stay ID 228838: 1 days 16:00:00
ICU stay ID 275318: 1 days 16:00:00
ICU stay ID 249910: 1 days 16:00:00
ICU stay ID 297645: 2 days 00:00:00
ICU stay ID 214899: 1 days 16:00:00
ICU stay ID 248745: 1 days 16:00:00
ICU stay ID 295480: 2 days 00:00:00
ICU stay ID 246272: 1 days 16:00:00
ICU stay ID 273134: 1 days 16:00:00
ICU stay ID 210861: 2 days 00:00:00
ICU stay ID 201391: 2 days 00:00:00
ICU stay ID 209249: 1 days 16:00:00
ICU stay ID 212472: 1 days 16:00:00
ICU stay ID 213818: 2 days 00:00:00
ICU stay ID 236073: 2 days 00:00:00
ICU stay ID 257258: 2 days 00:00:00
ICU stay ID 274885: 2 days 00:00:00
ICU stay ID 292096: 1 days 16:00:00
ICU stay ID 250832: 2 days 00:00:00
ICU stay ID 221782: 1 days 16:00:00
ICU stay ID 248569: 1 days 16:00:00
ICU stay ID 241271: 2 days 00:00:00
ICU stay ID 204954: 1 days 1

ICU stay ID 294607: 1 days 16:00:00
ICU stay ID 248135: 2 days 08:00:00
ICU stay ID 213035: 2 days 00:00:00
ICU stay ID 276522: 2 days 00:00:00
ICU stay ID 267405: 1 days 16:00:00
ICU stay ID 289928: 1 days 16:00:00
ICU stay ID 240253: 1 days 16:00:00
ICU stay ID 254255: 2 days 00:00:00
ICU stay ID 201504: 2 days 00:00:00
ICU stay ID 276952: 2 days 00:00:00
ICU stay ID 257906: 1 days 16:00:00
ICU stay ID 279238: 1 days 16:00:00
ICU stay ID 200220: 1 days 16:00:00
ICU stay ID 256763: 2 days 00:00:00
ICU stay ID 254261: 1 days 16:00:00
ICU stay ID 257135: 1 days 16:00:00
ICU stay ID 220698: 1 days 16:00:00
ICU stay ID 276693: 2 days 00:00:00
ICU stay ID 249694: 2 days 00:00:00
ICU stay ID 214579: 1 days 16:00:00
ICU stay ID 205715: 2 days 00:00:00
ICU stay ID 231177: 1 days 16:00:00
ICU stay ID 241454: 2 days 00:00:00
ICU stay ID 243277: 2 days 00:00:00
ICU stay ID 295560: 1 days 16:00:00
ICU stay ID 278096: 2 days 00:00:00
ICU stay ID 211414: 2 days 00:00:00
ICU stay ID 205857: 2 days 0

ICU stay ID 254674: 2 days 00:00:00
ICU stay ID 266089: 1 days 16:00:00
ICU stay ID 220522: 2 days 00:00:00
ICU stay ID 210804: 2 days 00:00:00
ICU stay ID 264020: 1 days 16:00:00
ICU stay ID 252746: 2 days 00:00:00
ICU stay ID 272710: 2 days 00:00:00
ICU stay ID 209026: 1 days 16:00:00
ICU stay ID 216825: 2 days 00:00:00
ICU stay ID 249244: 2 days 00:00:00
ICU stay ID 218891: 2 days 00:00:00
ICU stay ID 241301: 1 days 16:00:00
ICU stay ID 208938: 2 days 00:00:00
ICU stay ID 271934: 2 days 00:00:00
ICU stay ID 249542: 2 days 00:00:00
ICU stay ID 293145: 1 days 16:00:00
ICU stay ID 298063: 1 days 16:00:00
ICU stay ID 291924: 2 days 08:00:00
ICU stay ID 264053: 2 days 00:00:00
ICU stay ID 279535: 1 days 16:00:00
ICU stay ID 210947: 1 days 16:00:00
ICU stay ID 249260: 2 days 00:00:00
ICU stay ID 265287: 1 days 16:00:00
ICU stay ID 224559: 2 days 00:00:00
ICU stay ID 225730: 1 days 16:00:00
ICU stay ID 215308: 2 days 00:00:00
ICU stay ID 260338: 2 days 00:00:00
ICU stay ID 284229: 1 days 1

ICU stay ID 215138: 2 days 00:00:00
ICU stay ID 281032: 2 days 00:00:00
ICU stay ID 221129: 2 days 00:00:00
ICU stay ID 240402: 1 days 16:00:00
ICU stay ID 211200: 2 days 00:00:00
ICU stay ID 273720: 2 days 00:00:00
ICU stay ID 209904: 2 days 00:00:00
ICU stay ID 255288: 1 days 16:00:00
ICU stay ID 290982: 1 days 16:00:00
ICU stay ID 241894: 2 days 00:00:00
ICU stay ID 268458: 2 days 00:00:00
ICU stay ID 223348: 1 days 16:00:00
ICU stay ID 274205: 1 days 16:00:00
ICU stay ID 299736: 1 days 16:00:00
ICU stay ID 294227: 2 days 00:00:00
ICU stay ID 229485: 2 days 00:00:00
ICU stay ID 203433: 1 days 16:00:00
ICU stay ID 246784: 2 days 00:00:00
ICU stay ID 204757: 1 days 16:00:00
ICU stay ID 231030: 2 days 00:00:00
ICU stay ID 259666: 2 days 00:00:00
ICU stay ID 252799: 1 days 16:00:00
ICU stay ID 205192: 2 days 00:00:00
ICU stay ID 276965: 1 days 16:00:00
ICU stay ID 213259: 1 days 16:00:00
ICU stay ID 225740: 2 days 00:00:00
ICU stay ID 227897: 1 days 16:00:00
ICU stay ID 249768: 1 days 1

ICU stay ID 214622: 2 days 00:00:00
ICU stay ID 269369: 1 days 16:00:00
ICU stay ID 212120: 2 days 00:00:00
ICU stay ID 240534: 1 days 16:00:00
ICU stay ID 205768: 2 days 00:00:00
ICU stay ID 246026: 2 days 00:00:00
ICU stay ID 222479: 1 days 16:00:00
ICU stay ID 231734: 2 days 00:00:00
ICU stay ID 225411: 2 days 00:00:00
ICU stay ID 248919: 2 days 00:00:00
ICU stay ID 202807: 2 days 00:00:00
ICU stay ID 291053: 1 days 16:00:00
ICU stay ID 209106: 2 days 00:00:00
ICU stay ID 211925: 1 days 16:00:00
ICU stay ID 293189: 2 days 00:00:00
ICU stay ID 240344: 2 days 00:00:00
ICU stay ID 244903: 1 days 16:00:00
ICU stay ID 259537: 1 days 16:00:00
ICU stay ID 217041: 1 days 16:00:00
ICU stay ID 210579: 2 days 00:00:00
ICU stay ID 295492: 1 days 16:00:00
ICU stay ID 266410: 2 days 00:00:00
ICU stay ID 294209: 1 days 16:00:00
ICU stay ID 274943: 2 days 00:00:00
ICU stay ID 265095: 1 days 16:00:00
ICU stay ID 240882: 2 days 00:00:00
ICU stay ID 272391: 1 days 16:00:00
ICU stay ID 294573: 1 days 1

ICU stay ID 219621: 2 days 00:00:00
ICU stay ID 248476: 1 days 16:00:00
ICU stay ID 220604: 2 days 00:00:00
ICU stay ID 265115: 1 days 16:00:00
ICU stay ID 203766: 2 days 00:00:00
ICU stay ID 290911: 2 days 00:00:00
ICU stay ID 259634: 1 days 16:00:00
ICU stay ID 280347: 2 days 00:00:00
ICU stay ID 233821: 2 days 00:00:00
ICU stay ID 297480: 2 days 00:00:00
ICU stay ID 232018: 1 days 16:00:00
ICU stay ID 243917: 2 days 00:00:00
ICU stay ID 252250: 1 days 16:00:00
ICU stay ID 289514: 2 days 00:00:00
ICU stay ID 224293: 2 days 00:00:00
ICU stay ID 215213: 2 days 00:00:00
ICU stay ID 258962: 1 days 16:00:00
ICU stay ID 266967: 1 days 16:00:00
ICU stay ID 279680: 1 days 16:00:00
ICU stay ID 234115: 1 days 16:00:00
ICU stay ID 228304: 1 days 16:00:00
ICU stay ID 291693: 2 days 00:00:00
ICU stay ID 230937: 1 days 16:00:00
ICU stay ID 263318: 2 days 00:00:00
ICU stay ID 285766: 2 days 00:00:00
ICU stay ID 291492: 1 days 16:00:00
ICU stay ID 213572: 2 days 00:00:00
ICU stay ID 228114: 1 days 1

ICU stay ID 268189: 2 days 00:00:00
ICU stay ID 264054: 2 days 00:00:00
ICU stay ID 297379: 2 days 00:00:00
ICU stay ID 284765: 2 days 00:00:00
ICU stay ID 272931: 2 days 00:00:00
ICU stay ID 211119: 1 days 16:00:00
ICU stay ID 258542: 1 days 16:00:00
ICU stay ID 257846: 2 days 00:00:00
ICU stay ID 233643: 1 days 16:00:00
ICU stay ID 246314: 2 days 00:00:00
ICU stay ID 256682: 2 days 00:00:00
ICU stay ID 201050: 1 days 16:00:00
ICU stay ID 271967: 2 days 00:00:00
ICU stay ID 285674: 2 days 00:00:00
ICU stay ID 227857: 2 days 00:00:00
ICU stay ID 299867: 1 days 16:00:00
ICU stay ID 294978: 2 days 00:00:00
ICU stay ID 232341: 1 days 16:00:00
ICU stay ID 200229: 2 days 00:00:00
ICU stay ID 298736: 1 days 16:00:00
ICU stay ID 251035: 1 days 16:00:00
ICU stay ID 255168: 2 days 00:00:00
ICU stay ID 284823: 1 days 16:00:00
ICU stay ID 233725: 1 days 16:00:00
ICU stay ID 219548: 1 days 16:00:00
ICU stay ID 277861: 2 days 00:00:00
ICU stay ID 272784: 1 days 16:00:00
ICU stay ID 261754: 1 days 1

ICU stay ID 299963: 2 days 00:00:00
ICU stay ID 245787: 1 days 16:00:00
ICU stay ID 221510: 2 days 00:00:00
ICU stay ID 253260: 2 days 00:00:00
ICU stay ID 290574: 2 days 00:00:00
ICU stay ID 207606: 2 days 00:00:00
ICU stay ID 293299: 1 days 16:00:00
ICU stay ID 284994: 1 days 16:00:00
ICU stay ID 225832: 1 days 16:00:00
ICU stay ID 202370: 2 days 00:00:00
ICU stay ID 299071: 1 days 16:00:00
ICU stay ID 262899: 2 days 00:00:00
ICU stay ID 295414: 2 days 00:00:00
ICU stay ID 295187: 1 days 16:00:00
ICU stay ID 261533: 1 days 16:00:00
ICU stay ID 248152: 2 days 00:00:00
ICU stay ID 287332: 2 days 00:00:00
ICU stay ID 299647: 2 days 00:00:00
ICU stay ID 251887: 1 days 16:00:00
ICU stay ID 240041: 1 days 16:00:00
ICU stay ID 239620: 2 days 00:00:00
ICU stay ID 210780: 2 days 00:00:00
ICU stay ID 286292: 1 days 16:00:00
ICU stay ID 292775: 2 days 00:00:00
ICU stay ID 220338: 2 days 00:00:00
ICU stay ID 227445: 2 days 00:00:00
ICU stay ID 208582: 2 days 00:00:00
ICU stay ID 248613: 2 days 0

ICU stay ID 274180: 2 days 00:00:00
ICU stay ID 206699: 2 days 00:00:00
ICU stay ID 215288: 1 days 16:00:00
ICU stay ID 271685: 2 days 00:00:00
ICU stay ID 247886: 1 days 16:00:00
ICU stay ID 268271: 2 days 00:00:00
ICU stay ID 274098: 1 days 16:00:00
ICU stay ID 229627: 1 days 16:00:00
ICU stay ID 204938: 1 days 16:00:00
ICU stay ID 223546: 2 days 00:00:00
ICU stay ID 283234: 1 days 16:00:00
ICU stay ID 270295: 1 days 16:00:00
ICU stay ID 277393: 1 days 16:00:00
ICU stay ID 227224: 2 days 00:00:00
ICU stay ID 202136: 2 days 00:00:00
ICU stay ID 274172: 2 days 00:00:00
ICU stay ID 273571: 2 days 00:00:00
ICU stay ID 299909: 2 days 00:00:00
ICU stay ID 284603: 1 days 16:00:00
ICU stay ID 200249: 2 days 00:00:00
ICU stay ID 262679: 2 days 00:00:00
ICU stay ID 283300: 1 days 00:00:00
ICU stay ID 217070: 2 days 00:00:00
ICU stay ID 238126: 1 days 16:00:00
ICU stay ID 228980: 2 days 00:00:00
ICU stay ID 227703: 1 days 16:00:00
ICU stay ID 269543: 1 days 16:00:00
ICU stay ID 270765: 2 days 0

KeyboardInterrupt: 

In [None]:
#Observation: 3038 instances have a total of 4 days (exact) data in both labs and vitals
ct=0
for icu_id in vitals_windowed['icustay_id'].unique():
    icu_data_v =  vitals_windowed[vitals_windowed['icustay_id'] == icu_id]
    diff_v = icu_data_v['charttime'].max() - icu_data_v['charttime'].min()
    icu_data_l =  labs_windowed[labs_windowed['icustay_id'] == icu_id]
    diff_l = icu_data_l['charttime'].max() - icu_data_l['charttime'].min()
    if diff_v == pd.Timedelta(hours=96) and diff_l == pd.Timedelta(hours=WINDOW_LENGTH):
        ct+=1
print(ct)

#### Filtering records that have exactly 4 days (96 hours) of data in both labs and vitals

In [273]:
# Calculate the difference between max and min charttime for labs_windowed
labs_diff = labs_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min())

# Calculate the difference between max and min charttime for vitals_windowed
vitals_diff = vitals_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min())

# Filter the icustay_id where the difference is equal to 96 hours (4 days) in both labs and vitals
filtered_icustay_ids = labs_diff[(labs_diff == pd.Timedelta(hours=WINDOW_LENGTH)) & (vitals_diff == pd.Timedelta(hours=WINDOW_LENGTH))].index.tolist()

# Print the length of icustay_id
print(len(filtered_icustay_ids))

0


In [270]:
filtered_labs_windowed = labs_windowed[labs_windowed['icustay_id'].isin(filtered_icustay_ids)]

In [272]:
filtered_labs_windowed[filtered_labs_windowed['icustay_id']==253641] 

Unnamed: 0,icustay_id,charttime,subject_id,aniongap,albumin,bands,bicarbonate,bilirubin,creatinine,chloride,...,ptt,inr,pt,sodium,bun,wbc,bacteria,ckd,predtime,delta_t_pred


In [688]:
filtered_labs_windowed.head()

Unnamed: 0,icustay_id,charttime,subject_id,aniongap,albumin,bands,bicarbonate,bilirubin,creatinine,chloride,...,ptt,inr,pt,sodium,bun,wbc,bacteria,ckd,predtime,delta_t_pred
60,241427,2151-11-07 01:00:00,28970.0,11.0,2.6,3.0,23.0,4.3,1.0,108.0,...,35.8,1.3,14.5,138.0,29.0,11.4,2.0,0,2151-11-09 01:49:18,29 days 17:57:20
61,241427,2151-11-07 09:00:00,28970.0,11.0,2.6,3.0,23.0,4.3,1.0,108.0,...,35.8,1.3,14.5,138.0,29.0,11.4,2.0,0,2151-11-09 01:49:18,29 days 17:57:20
62,241427,2151-11-07 17:00:00,28970.0,11.0,2.6,3.0,23.0,4.3,1.0,108.0,...,35.8,1.3,14.5,138.0,29.0,11.4,2.0,0,2151-11-09 01:49:18,29 days 17:57:20
63,241427,2151-11-08 01:00:00,28970.0,11.0,2.6,3.0,23.0,4.3,1.0,108.0,...,35.8,1.3,14.5,138.0,29.0,11.4,2.0,0,2151-11-09 01:49:18,29 days 17:57:20
64,241427,2151-11-08 09:00:00,28970.0,11.0,2.6,3.0,23.0,4.3,1.0,108.0,...,35.8,1.3,14.5,138.0,29.0,11.4,2.0,0,2151-11-09 01:49:18,29 days 17:57:20


In [689]:
filtered_labs_windowed.columns

Index(['icustay_id', 'charttime', 'subject_id', 'aniongap', 'albumin', 'bands',
       'bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose',
       'hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt',
       'inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria', 'ckd', 'predtime',
       'delta_t_pred'],
      dtype='object')

#### Grouping data

In [425]:
feature_labs= ['aniongap', 'albumin', 'bands',
       'bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose',
       'hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt',
       'inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria']

In [426]:
grouped_data_labs = filtered_labs_windowed.sort_values(['icustay_id', 'charttime']).groupby('icustay_id')[feature_labs + ['ckd']]

In [427]:
grouped_data_labs.head()

Unnamed: 0,aniongap,albumin,bands,bicarbonate,bilirubin,creatinine,chloride,glucose,hematocrit,hemoglobin,...,platelet,potassium,ptt,inr,pt,sodium,bun,wbc,bacteria,ckd
305206,18.0,2.5,3.0,17.0,3.2,0.7,113.0,103.0,34.6,11.5,...,234.0,3.7,26.5,1.0,11.8,144.0,14.0,15.4,2.0,0
305207,18.0,2.5,3.0,17.0,3.2,0.7,113.0,103.0,34.6,11.5,...,234.0,3.7,26.5,1.0,11.8,144.0,14.0,15.4,2.0,0
305208,18.0,2.5,3.0,17.0,3.2,0.7,113.0,103.0,34.6,11.5,...,234.0,3.7,26.5,1.0,11.8,144.0,14.0,15.4,2.0,0
305209,18.0,2.5,3.0,17.0,3.2,0.7,113.0,103.0,34.6,11.5,...,234.0,3.7,26.5,1.0,11.8,144.0,14.0,15.4,2.0,0
305210,11.0,2.5,3.0,24.0,3.2,0.8,118.0,18.0,30.1,10.2,...,239.0,4.8,26.5,1.0,11.8,148.0,20.0,20.8,2.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
164145,16.0,2.6,3.0,22.0,4.6,1.0,100.0,125.0,48.5,14.9,...,356.0,5.4,36.0,1.3,14.5,133.0,29.0,15.7,2.0,0
164146,16.0,2.6,3.0,22.0,4.6,1.0,100.0,125.0,48.5,14.9,...,356.0,5.4,36.0,1.3,14.5,133.0,29.0,15.7,2.0,0
164147,16.0,2.6,3.0,22.0,4.6,1.0,100.0,125.0,48.5,14.9,...,356.0,5.4,36.0,1.3,14.5,133.0,29.0,15.7,2.0,0
164148,16.0,2.6,3.0,22.0,4.6,1.0,100.0,125.0,48.5,14.9,...,356.0,5.4,36.0,1.3,14.5,133.0,29.0,15.7,2.0,0


In [428]:
# Concatenate groups into a NumPy array
data = np.concatenate([group.values for _, group in grouped_data_labs])

# Extract features (X) and target (y)
X = data[:, :-1]  # Exclude the last column 'ckd'
y = data[:, -1]   # Select only the last column 'ckd'

# Perform train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [429]:
X

array([[18. ,  2.5,  3. , ..., 14. , 15.4,  2. ],
       [18. ,  2.5,  3. , ..., 14. , 15.4,  2. ],
       [18. ,  2.5,  3. , ..., 14. , 15.4,  2. ],
       ...,
       [15. ,  2.6,  3. , ..., 29. , 15.7,  2. ],
       [15. ,  2.6,  3. , ..., 29. , 15.7,  2. ],
       [15. ,  2.6,  3. , ..., 29. , 15.7,  2. ]])

In [430]:
X.shape

(39494, 20)

In [431]:
y.shape

(39494,)

In [432]:
np.unique(y)

array([0., 1.])

In [433]:
X_train.shape

(31595, 20)

In [434]:
y_train.shape

(31595,)

#### Rocket for lab (working)

In [435]:
X = []
y = []
for _, group in grouped_data_labs:
    group_values = group[feature_labs].values.T
        
    X.append(group_values)
    y.append(group['ckd'].iloc[0])
    
X = np.array(X)
y = np.array(y)
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [436]:
X.shape

(3038, 20, 13)

In [437]:
y.shape

(3038,)

#### Random undersampling for labs

In [438]:
n_samples, n_features, n_channels = X.shape
X_2d = X.reshape((n_samples, n_features*n_channels))
rusLab = RandomUnderSampler(random_state=42)
X_resampled, y_resampled = rusLab.fit_resample(X_2d, y)
X_resampled = X_resampled.reshape((X_resampled.shape[0], n_features, n_channels))    
X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2)

In [439]:
rocket = Rocket(num_kernels=100)
rocket.fit(X_train)
X_train_transformed = rocket.transform(X_train)
X_test_transformed = rocket.transform(X_test)

In [440]:
clf = LogisticRegression()
cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))
    
clf.fit(X_train_transformed, y_train)
y_pred = clf.predict(X_test_transformed)

Cross-validation scores: [0.64       0.62       0.52       0.78       0.71428571]
Mean cross-validation score: 0.6548571428571429


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [441]:
def printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test):
    clf = LogisticRegression(random_state=42, max_iter=1000)
    cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    
    clf.fit(X_train_transformed, y_train)
    y_pred = clf.predict(X_test_transformed)
    #accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Accuracy:", accuracy)
    #print("Accuracy:", accuracy)
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))
printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test)

Cross-validation scores: [0.66       0.62       0.5        0.78       0.71428571]
Mean cross-validation score: 0.6548571428571429
Precision: 0.6129032258064516
Recall: 0.6129032258064516
F1 Score: 0.6129032258064516
Accuracy: 0.6190476190476191

Classification report:
              precision    recall  f1-score   support

           0       0.62      0.62      0.62        32
           1       0.61      0.61      0.61        31

    accuracy                           0.62        63
   macro avg       0.62      0.62      0.62        63
weighted avg       0.62      0.62      0.62        63



#### with only albumin and creatinine

In [140]:
feature_labs= ['albumin','creatinine']
grouped_data_labs = filtered_labs_windowed.sort_values(['icustay_id', 'charttime']).groupby('icustay_id')[feature_labs + ['ckd']]
grouped_data_labs.head()

Unnamed: 0,albumin,creatinine,ckd
305206,2.5,0.7,0
305207,2.5,0.7,0
305208,2.5,0.7,0
305209,2.5,0.7,0
305210,2.5,0.8,0
...,...,...,...
164145,2.6,1.0,0
164146,2.6,1.0,0
164147,2.6,1.0,0
164148,2.6,1.0,0


In [145]:
X = []
y = []
for _, group in grouped_data_labs:
    group_values = group[feature_labs].values.T
        
    X.append(group_values)
    y.append(group['ckd'].iloc[0])
    
X = np.array(X)
y = np.array(y)
    
n_samples, n_features, n_channels = X.shape
X_2d = X.reshape((n_samples, n_features*n_channels))

rusLabCrAl = RandomUnderSampler(random_state=42)
X_resampledCrAl, y_resampledCrAl = rusLabCrAl.fit_resample(X_2d, y)

X_resampledCrAl = X_resampledCrAl.reshape((X_resampledCrAl.shape[0], n_features, n_channels))    
X_train, X_test, y_train, y_test = train_test_split(X_resampledCrAl, y_resampledCrAl, test_size=0.2)

In [146]:
rocket = Rocket(num_kernels=100)
rocket.fit(X_train)
X_train_transformed = rocket.transform(X_train)
X_test_transformed = rocket.transform(X_test)

In [147]:
clf = LogisticRegression()
cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))
    
clf.fit(X_train_transformed, y_train)
y_pred = clf.predict(X_test_transformed)

Cross-validation scores: [0.62      0.58      0.62      0.7       0.6122449]
Mean cross-validation score: 0.6264489795918367


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [148]:
def printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test):
    clf = LogisticRegression(random_state=42, max_iter=1000)
    cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    
    clf.fit(X_train_transformed, y_train)
    y_pred = clf.predict(X_test_transformed)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))
printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test)

Cross-validation scores: [0.6       0.58      0.62      0.7       0.6122449]
Mean cross-validation score: 0.6224489795918368
Accuracy: 0.6984126984126984

Classification report:
              precision    recall  f1-score   support

           0       0.82      0.68      0.75        41
           1       0.55      0.73      0.63        22

    accuracy                           0.70        63
   macro avg       0.69      0.71      0.69        63
weighted avg       0.73      0.70      0.71        63



#### Just albumin 

In [149]:
feature_labs= ['albumin']
grouped_data_labs = filtered_labs_windowed.sort_values(['icustay_id', 'charttime']).groupby('icustay_id')[feature_labs + ['ckd']]


X = []
y = []
for _, group in grouped_data_labs:
    group_values = group[feature_labs].values.T
        
    X.append(group_values)
    y.append(group['ckd'].iloc[0])
    
X = np.array(X)
y = np.array(y)
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)


rocket = Rocket(num_kernels=100)
rocket.fit(X_train)
X_train_transformed = rocket.transform(X_train)
X_test_transformed = rocket.transform(X_test)


clf = LogisticRegression()
cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", np.mean(cv_scores))
    
clf.fit(X_train_transformed, y_train)
y_pred = clf.predict(X_test_transformed)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores: [0.94444444 0.94855967 0.94855967 0.94238683 0.94444444]
Mean cross-validation score: 0.945679012345679


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [150]:
def printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test):
    clf = LogisticRegression(random_state=42, max_iter=1000)
    cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    
    clf.fit(X_train_transformed, y_train)
    y_pred = clf.predict(X_test_transformed)
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))
printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test)

Cross-validation scores: [0.94444444 0.94855967 0.95061728 0.94650206 0.94238683]
Mean cross-validation score: 0.9465020576131685
Accuracy: 0.9391447368421053

Classification report:
              precision    recall  f1-score   support

           0       0.94      1.00      0.97       571
           1       0.00      0.00      0.00        37

    accuracy                           0.94       608
   macro avg       0.47      0.50      0.48       608
weighted avg       0.88      0.94      0.91       608



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


#### Time series on Vitals

In [442]:
vitals_windowed.head()

Unnamed: 0,icustay_id,charttime,heartrate,sysbp,diasbp,meanbp,resprate,tempc,spo2,glucose,rbc,specificgravity,pedaledema,appetite_median,ckd,time_diff
0,218958,2132-08-24 16:00:00,140.0,120.0,58.5,78.0,20.5,37.111111,98.0,130.0,4.75,1.02,-1.0,3.0,0,0.0
1,218958,2132-08-24 17:00:00,144.0,120.0,58.5,78.0,20.5,37.111111,98.0,130.0,4.75,1.02,-1.0,3.0,0,1.0
2,218958,2132-08-24 18:00:00,150.0,120.0,58.5,78.0,20.5,37.111111,98.0,130.0,4.75,1.02,-1.0,3.0,0,1.0
3,218958,2132-08-24 19:00:00,144.0,120.0,58.5,78.0,20.5,37.111111,98.0,130.0,4.75,1.02,-1.0,3.0,0,1.0
4,218958,2132-08-24 20:00:00,126.0,120.0,58.5,78.0,20.5,37.111111,98.0,130.0,4.75,1.02,-1.0,3.0,0,1.0


In [443]:
vv = vitals_windowed
vv['charttime'] = pd.to_datetime(vv['charttime'])
vv['time_diff'] = vv.groupby('icustay_id')['charttime'].diff().dt.total_seconds() / 3600
vv['time_diff'].fillna(0, inplace=True)

column_sum = vv.groupby('icustay_id')['time_diff'].sum()
column_sum_df = column_sum.to_frame().reset_index()
column_sum_df.columns = ['icustay_id', 'time_diff']
#filtered_df = vitals_windowed[vitals_windowed['icustay_id'] == 286937]
column_sum_df.describe()

Unnamed: 0,icustay_id,time_diff
count,8405.0,8405.0
mean,249984.620345,95.661273
std,28585.253432,1.806917
min,200017.0,71.0
25%,225578.0,95.0
50%,250040.0,96.0
75%,274430.0,96.0
max,299992.0,107.0


In [444]:
filtered_df = vitals_windowed[vitals_windowed['icustay_id'] == 286937]
filtered_df

Unnamed: 0,icustay_id,charttime,heartrate,sysbp,diasbp,meanbp,resprate,tempc,spo2,glucose,rbc,specificgravity,pedaledema,appetite_median,ckd,time_diff
397810,286937,2142-08-06 03:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,0.0
397811,286937,2142-08-06 04:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
397812,286937,2142-08-06 05:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
397813,286937,2142-08-06 06:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
397814,286937,2142-08-06 07:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
397877,286937,2142-08-08 22:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
397878,286937,2142-08-08 23:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
397879,286937,2142-08-09 00:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0
397880,286937,2142-08-09 01:00:00,67.0,112.0,39.0,62.0,21.0,37.900002,99.0,173.0,3.99,1.02,-1.0,3.0,0,1.0


In [702]:
def RocketForVitals(vitals_windowed):
    vitals_windowed = vitals_windowed[vitals_windowed['icustay_id'].isin(filtered_icustay_ids)]
    vitals_windowed['charttime'] = pd.to_datetime(vitals_windowed['charttime'])
    feature_columns = ['heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity', 'appetite_median']
    
    grouped_data = vitals_windowed.sort_values(['icustay_id', 'charttime']).groupby('icustay_id')[feature_columns + ['ckd']]
    
    X = []
    y = []
    for _, group in grouped_data:
        group_values = group[feature_columns].values.T
        num_timestamps = group_values.shape[1]
        
        if num_timestamps < WINDOW_LENGTH:
            padded_values = np.pad(group_values, ((0, 0), (0, WINDOW_LENGTH - num_timestamps)), 'constant', constant_values=0)
        elif num_timestamps > WINDOW_LENGTH:        
            padded_values = group_values[:, :WINDOW_LENGTH]
        else:        
            padded_values = group_values
        
        X.append(padded_values)
        y.append(group['ckd'].iloc[0])
    
    X = np.array(X)
    y = np.array(y)
    
    n_samples, n_features, n_channels = X.shape
    X_2d = X.reshape((n_samples, n_features*n_channels))

    rus = RandomUnderSampler(random_state=42)
    X_resampled, y_resampled = rus.fit_resample(X_2d, y)

    X_resampled = X_resampled.reshape((X_resampled.shape[0], n_features, n_channels))    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2)
    
    #X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    print("X_train shape: ",X_train.shape,"\ny_train shape: ",y_train.shape)
    
    rocket = Rocket(num_kernels=100, random_state=42)
    rocket.fit(X_train)
    X_train_transformed = rocket.transform(X_train)
    X_test_transformed = rocket.transform(X_test)
    return X_train_transformed, X_test_transformed,y_train,y_test

In [703]:
X_train_transformed, X_test_transformed, y_train, y_test = RocketForVitals(vitals_windowed)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  vitals_windowed['charttime'] = pd.to_datetime(vitals_windowed['charttime'])


X_train shape:  (372, 11, 48) 
y_train shape:  (372,)


In [704]:
X_train_transformed.shape, y_train.shape

((372, 200), (372,))

In [705]:
def printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test):
    clf = LogisticRegression(random_state=42, max_iter=1000)
    cv_scores = cross_val_score(clf, X_train_transformed, y_train, cv=5)
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    
    clf.fit(X_train_transformed, y_train)
    y_pred = clf.predict(X_test_transformed)
    #accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    accuracy = accuracy_score(y_test, y_pred)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Accuracy:", accuracy)
    #print("Accuracy:", accuracy)
    print("\nClassification report:")
    print(classification_report(y_test, y_pred))

In [706]:
printEvaluationScores(X_train_transformed, X_test_transformed, y_train, y_test)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores: [0.54666667 0.6        0.71621622 0.51351351 0.59459459]
Mean cross-validation score: 0.5941981981981982
Precision: 0.5
Recall: 0.6
F1 Score: 0.5454545454545454
Accuracy: 0.574468085106383

Classification report:
              precision    recall  f1-score   support

           0       0.65      0.56      0.60        54
           1       0.50      0.60      0.55        40

    accuracy                           0.57        94
   macro avg       0.58      0.58      0.57        94
weighted avg       0.59      0.57      0.58        94



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Multimodal Approach

####  labs and vitals

In [697]:
dd = filtered_vitals_windowed[['icustay_id','ckd']].drop_duplicates(subset=['icustay_id'])
dd['ckd'].value_counts()

0    3778
1     233
Name: ckd, dtype: int64

In [696]:
filtered_vitals_windowed = vitals_windowed[vitals_windowed['icustay_id'].isin(filtered_icustay_ids)]

In [695]:
dd.shape, dd['icustay_id'].nunique(),filtered_labs_windowed['icustay_id'].nunique()

((11271, 2), 11271, 4011)

In [678]:
# LABS
feature_labs = [ 'albumin', 'bicarbonate', 'creatinine', 'chloride', 'hematocrit', 'hemoglobin', 'potassium', 'sodium', 'bun', 'wbc', 'bacteria']
grouped_data_labs = filtered_labs_windowed.sort_values(['icustay_id', 'charttime']).groupby('icustay_id')[feature_labs + ['ckd']]

# VITALS
feature_columns = [ 'sysbp', 'diasbp', 'meanbp', 'spo2', 'glucose', 'rbc', 'specificgravity', 'appetite_median']
grouped_data_vitals = vitals_windowed.sort_values(['icustay_id', 'charttime']).groupby('icustay_id')[feature_columns + ['ckd']]

# Get unique icustay_id values
icustay_ids = grouped_data_labs.groups.keys()

# Split icustay_id into training and test sets
train_size = int(0.8 * len(icustay_ids))
train_ids = list(icustay_ids)[:train_size]
test_ids = list(icustay_ids)[train_size:]
test_ids.remove(278494)

# Prepare the data for training (Labs)
X_train_labs = []
X_test_labs = []
y_train_labs = []
y_test_labs = []

for icustay_id, group in grouped_data_labs:
    if icustay_id in train_ids:
        lab_values = group[feature_labs].values.T
        X_train_labs.append(lab_values)
        y_train_labs.append(group['ckd'].iloc[0])
    elif icustay_id in test_ids:
        lab_values = group[feature_labs].values.T
        X_test_labs.append(lab_values)
        y_test_labs.append(group['ckd'].iloc[0])

X_train_labs = np.array(X_train_labs)
X_test_labs = np.array(X_test_labs)
y_train_labs = np.array(y_train_labs)
y_test_labs = np.array(y_test_labs)

n_samples, n_features, n_channels = X_train_labs.shape
X_2d = X_train_labs.reshape((n_samples, n_features * n_channels))
rus = RandomUnderSampler(random_state=42)
X_resampled_labs, y_resampled_labs = rus.fit_resample(X_2d, y_train_labs)
X_resampled_labs = X_resampled_labs.reshape((X_resampled_labs.shape[0], n_features, n_channels))
X_train_labs = X_resampled_labs
y_train_labs = y_resampled_labs

# Prepare the data for training (Vitals)
X_train_vitals = []
X_test_vitals = []
y_train_vitals = []
y_test_vitals = []

for icustay_id, group in grouped_data_vitals:
    if icustay_id in train_ids:
        vital_values = group[feature_columns].values.T
        X_train_vitals.append(vital_values)
        y_train_vitals.append(group['ckd'].iloc[0])
    elif icustay_id in test_ids:
        vital_values = group[feature_columns].values.T
        X_test_vitals.append(vital_values)
        y_test_vitals.append(group['ckd'].iloc[0])

X_train_vitals = np.array(X_train_vitals)
X_test_vitals = np.array(X_test_vitals)
y_train_vitals = np.array(y_train_vitals)
y_test_vitals = np.array(y_test_vitals)

n_samples, n_features, n_channels = X_train_vitals.shape
X_2d = X_train_vitals.reshape((n_samples, n_features * n_channels))
rus = RandomUnderSampler(random_state=42)
X_resampled_vitals, y_resampled_vitals = rus.fit_resample(X_2d, y_train_vitals)
X_resampled_vitals = X_resampled_vitals.reshape((X_resampled_vitals.shape[0], n_features, n_channels))
X_train_vitals = X_resampled_vitals
y_train_vitals = y_resampled_vitals

In [679]:
#Rocket and LR


#Labs
rocket_lab = Rocket(num_kernels=100, random_state=42)
rocket_lab.fit(X_train_labs)
X_train_transformed_labs = rocket_lab.transform(X_train_labs)
X_test_transformed_labs = rocket_lab.transform(X_test_labs) 
clf_lab = LogisticRegression(random_state=42, max_iter=1000)
clf_lab.fit(X_train_transformed_labs,y_train_labs)

#Vitals
rocket_vital = Rocket(num_kernels=100, random_state=42)
rocket_vital.fit(X_train_vitals)
X_train_transformed_vital = rocket_vital.transform(X_train_vitals)
X_test_transformed_vital = rocket_vital.transform(X_test_vitals)

clf_vital = LogisticRegression(random_state=42, max_iter=1000)
clf_vital.fit(X_train_transformed_vital,y_train_vitals)

LogisticRegression(max_iter=1000, random_state=42)

#### Static model

In [534]:
# Split the merged_table into train and test data
train_data = merged_table.loc[merged_table['icustay_id'].isin(train_ids)]
test_data = merged_table.loc[merged_table['icustay_id'].isin(test_ids)]

In [535]:
test_ids_not_in_table = set(test_ids) - set(merged_table['icustay_id'])
print(test_ids_not_in_table)

set()


In [536]:
test_data.shape,len(test_ids)

((607, 30), 607)

In [537]:
train_data.shape

(2417, 30)

In [538]:
train_data.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'ckd_ethnicity', 'heartrate', 'sysbp', 'diasbp', 'meanbp',
       'resprate', 'tempc', 'spo2', 'specificgravity', 'appetite_median',
       'gender', 'ethnicity_grouped', 'label_cor_art', 'diabetes_mellitus',
       'ckd', 'anemia_flag', 'age_group'],
      dtype='object')

In [539]:
X_train_static=train_data['gender', 'ethnicity_grouped', 'label_cor_art', 'diabetes_mellitus',
       'ckd', 'anemia_flag', 'age_group']    #,'ethnicity_grouped'
y_train_static=train_data['ckd']
X_train_static=pd.get_dummies(X_train_static)

In [540]:
X_test_static=test_data.drop(['icustay_id','ckd','platelet',
       'ptt', 'ckd_ethnicity', 'heartrate',
       'resprate', 'tempc'],axis=1)     #,'ethnicity_grouped'
y_test_static=test_data['ckd']
X_test_static=pd.get_dummies(X_test_static)

In [546]:
rus = RandomUnderSampler(random_state=42)
X_resampled_static, y_resampled_static = rus.fit_resample(X_train_static, y_train_static)
#X_resampled_vitals = X_resampled_vitals.reshape((X_resampled_vitals.shape[0], n_features, n_channels))
X_train_static = X_resampled_static
y_train_static = y_resampled_static
rf_same.fit(X_train_static,y_train_static)

RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=5, n_estimators=300)

In [547]:
y_train_static.shape,y_train_labs.shape

((262,), (262,))

#### Static with just demographics

In [564]:
# Split the merged_table into train and test data
train_data = merged_table.loc[merged_table['icustay_id'].isin(train_ids)]
test_data = merged_table.loc[merged_table['icustay_id'].isin(test_ids)]

test_ids_not_in_table = set(test_ids) - set(merged_table['icustay_id'])
print(test_ids_not_in_table)

X_train_static=train_data[['gender', 'ethnicity_grouped', 'label_cor_art', 'diabetes_mellitus', 'anemia_flag', 'age_group']]
y_train_static=train_data['ckd']
X_train_static=pd.get_dummies(X_train_static)

X_test_static=test_data[['gender', 'ethnicity_grouped', 'label_cor_art', 'diabetes_mellitus', 'anemia_flag', 'age_group']]
y_test_static=test_data['ckd']
X_test_static=pd.get_dummies(X_test_static)

rus = RandomUnderSampler(random_state=42)
X_resampled_static, y_resampled_static = rus.fit_resample(X_train_static, y_train_static)
#X_resampled_vitals = X_resampled_vitals.reshape((X_resampled_vitals.shape[0], n_features, n_channels))
X_train_static = X_resampled_static
y_train_static = y_resampled_static
rf_same.fit(X_train_static,y_train_static)

set()


RandomForestClassifier(max_depth=10, max_features='sqrt', min_samples_leaf=4,
                       min_samples_split=5, n_estimators=300)

In [571]:
def soft_voting(clf_lab, clf_vital,clf3, weights, X_test_transformed_lab, X_test_transformed_vital, y_test_labs,y_test_vitals,Xstatic,ystatic):
    # Obtain the probability estimates for each class
    prob_lab = clf_lab.predict_proba(X_test_transformed_lab)
    prob_vital = clf_vital.predict_proba(X_test_transformed_vital)
    prob_static = clf3.predict_proba(Xstatic)

    # Combine the probability estimates using weighted averaging
    weighted_prob = (weights[0] * prob_lab + weights[1] * prob_vital  +  weights[2] * prob_static) / np.sum(weights)
    #print(weighted_prob)

    # Make the final prediction based on the highest probability
    y_pred = np.argmax(weighted_prob, axis=1)
    
    #print(np.sum(y_test_labs))

    # Calculate accuracy
    acc = np.sum(y_pred == y_test_labs)/len(y_pred)
    #accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test_labs, y_pred)
    recall = recall_score(y_test_labs, y_pred)
    f1 = f1_score(y_test_labs, y_pred)
    accuracy = accuracy_score(y_test_labs, y_pred)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Accuracy:", accuracy)
    #print("Accuracy:", accuracy)
    #print(acc)

    return y_pred

#### Obtaining Weights

In [572]:
def printEvaluationScores(X_train_static, y_train_static):
    cv_scores = cross_val_score(rf_same, X_train_static, y_train_static, cv=5, scoring='f1')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    cv_scores = cross_val_score(rf_same, X_train_static, y_train_static, cv=5, scoring='precision')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    cv_scores = cross_val_score(rf_same, X_train_static, y_train_static, cv=5, scoring='recall')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    cv_scores = cross_val_score(rf_same, X_train_static, y_train_static, cv=5, scoring='accuracy')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
printEvaluationScores(X_train_static, y_train_static)

Cross-validation scores: [0.80701754 0.6969697  0.78571429 0.71698113 0.6875    ]
Mean cross-validation score: 0.7388365317238207
Cross-validation scores: [0.75       0.58974359 0.6875     0.7037037  0.6       ]
Mean cross-validation score: 0.6661894586894588
Cross-validation scores: [0.92307692 0.85185185 0.84615385 0.73076923 0.96153846]
Mean cross-validation score: 0.8626780626780628
Cross-validation scores: [0.77358491 0.64150943 0.76923077 0.73076923 0.61538462]
Mean cross-validation score: 0.7060957910014514


In [582]:
def printEvaluationScores(X_train_transformed_labs, y_train_labs):
    cv_scores = cross_val_score(rf_same, X_train_transformed_labs,y_train_labs, cv=5, scoring='f1')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    cv_scores = cross_val_score(rf_same,X_train_transformed_labs,y_train_labs, cv=5, scoring='precision')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    cv_scores = cross_val_score(rf_same, X_train_transformed_labs, y_train_labs, cv=5, scoring='recall')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    cv_scores = cross_val_score(rf_same, X_train_transformed_labs,y_train_labs, cv=5, scoring='accuracy')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
printEvaluationScores(X_train_transformed_labs, y_train_labs)

Cross-validation scores: [0.72131148 0.64615385 0.69444444 0.72463768 0.68965517]
Mean cross-validation score: 0.695240523916268
Cross-validation scores: [0.63157895 0.53846154 0.55555556 0.55555556 0.64705882]
Mean cross-validation score: 0.5856420840940965
Cross-validation scores: [0.80769231 0.74074074 1.         0.92307692 0.80769231]
Mean cross-validation score: 0.8558404558404558
Cross-validation scores: [0.66037736 0.49056604 0.57692308 0.63461538 0.69230769]
Mean cross-validation score: 0.6109579100145138


In [580]:
def printEvaluationScores(X_train_transformed_vital, y_train_vitals):
    cv_scores = cross_val_score(rf_same, X_train_transformed_vital, y_train_vitals, cv=5, scoring='f1')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    cv_scores = cross_val_score(rf_same, X_train_transformed_vital,y_train_vitals, cv=5, scoring='precision')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    cv_scores = cross_val_score(rf_same, X_train_transformed_vital,y_train_vitals, cv=5, scoring='recall')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
    cv_scores = cross_val_score(rf_same, X_train_transformed_vital,y_train_vitals, cv=5, scoring='accuracy')
    print("Cross-validation scores:", cv_scores)
    print("Mean cross-validation score:", np.mean(cv_scores))
printEvaluationScores(X_train_transformed_vital, y_train_vitals)

Cross-validation scores: [0.77966102 0.65671642 0.71428571 0.68656716 0.73684211]
Mean cross-validation score: 0.7148144837175155
Cross-validation scores: [0.71875    0.575      0.57142857 0.58139535 0.63636364]
Mean cross-validation score: 0.6165875113258834
Cross-validation scores: [0.88461538 0.88888889 0.96153846 0.92307692 0.84615385]
Mean cross-validation score: 0.9008547008547009
Cross-validation scores: [0.75471698 0.58490566 0.59615385 0.61538462 0.65384615]
Mean cross-validation score: 0.6410014513788098


In [575]:
weight_l=0.59/(1-0.59)
weight_v=0.62/(1-0.62)
weight_s=0.67/(1-0.67)

In [576]:
soft_voting(clf_lab=clf_lab, clf_vital=clf_vital,clf3=rf_same, weights=[weight_l,weight_v,weight_s],X_test_transformed_lab=X_test_transformed_labs, X_test_transformed_vital=X_test_transformed_vital, y_test_labs=y_test_labs, y_test_vitals=y_test_vitals,Xstatic=X_test_static,ystatic=y_test_static)

Precision: 0.07553956834532374
Recall: 0.84
F1 Score: 0.13861386138613863
Accuracy: 0.5700164744645799


array([1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0,
       0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0,
       0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0, 1,
       0, 0, 1, 0, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0,
       0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 0,
       1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 0, 0, 0,
       1, 0, 0, 1, 1, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,
       1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 0, 0,
       1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 1, 0, 0,