In [1]:
import psycopg2
from datetime import timedelta
from sqlalchemy import create_engine
import psycopg2
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score,f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sktime.transformations.panel.rocket import Rocket
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
import psycopg2

conn = psycopg2.connect(
    host="localhost",
    database="mimic",
    user="postgres",
    password="postgres"
)

cur = conn.cursor()

cur.execute("SELECT version();")
print(cur.fetchone())

('PostgreSQL 15.2, compiled by Visual C++ build 1914, 64-bit',)


In [3]:
# Connect to db
conn = psycopg2.connect(host='localhost', dbname='mimic', user='postgres', password='postgres', options='-c search_path=mimiciii')
cur = conn.cursor() 

# Read in table with patients & admissions (inner join on subject_id) and icu_stays (inner joinon subject_id and hadm_id)
icustay_details = pd.read_sql_query("SELECT * FROM mimiciii.flicu_icustay_detail;", conn)

# Read in vital and lab signs
pivoted_vital = pd.read_sql_query("SELECT * FROM mimiciii.pivoted_vital;", conn)
pivoted_lab = pd.read_sql_query("SELECT * FROM mimiciii.ckd_pivoted_lab;", conn)

# Close the cursor and connection to so the server can allocate bandwidth to other requests
cur.close()
conn.close()

  icustay_details = pd.read_sql_query("SELECT * FROM mimiciii.flicu_icustay_detail;", conn)
  pivoted_vital = pd.read_sql_query("SELECT * FROM mimiciii.pivoted_vital;", conn)
  pivoted_lab = pd.read_sql_query("SELECT * FROM mimiciii.ckd_pivoted_lab;", conn)


In [4]:
WINDOW_LENGTH = 96

In [5]:
data= icustay_details.copy()
data = data[data.los_icu >= WINDOW_LENGTH/24.0]

In [6]:
filtered_icustay_ids = pd.DataFrame(data['icustay_id'].unique(), columns=['icustay_id'])

In [7]:
# Drop measurements with no belonging icustay_id
pivoted_vital = pivoted_vital.dropna(subset=['icustay_id'])
pivoted_lab = pivoted_lab.dropna(subset=['icustay_id'])

# Cast icustay_id types to int
pivoted_vital['icustay_id'] = pivoted_vital['icustay_id'].astype(int)
pivoted_lab['icustay_id'] = pivoted_lab['icustay_id'].astype(int)

# Keep only values of patients in previously filtered icustay_ids in labs and vitals
pivoted_vital = pivoted_vital.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
pivoted_lab = pivoted_lab.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()

In [8]:
# Min of each lab and vitals
icustay_ids_charttime_min_lab = pivoted_lab[["icustay_id", "charttime"]][pivoted_lab.groupby("icustay_id")["charttime"].rank(ascending=1,method='dense') == 1]
icustay_ids_charttime_min_vital = pivoted_vital[["icustay_id", "charttime"]][pivoted_vital.groupby("icustay_id")["charttime"].rank(ascending=1,method='dense') == 1]
# Min of both combined
icustay_ids_charttime_min_vital_lab = pd.concat([icustay_ids_charttime_min_lab, icustay_ids_charttime_min_vital], ignore_index=True)
icustay_ids_charttime_min_vital_lab = icustay_ids_charttime_min_vital_lab[["icustay_id", "charttime"]][icustay_ids_charttime_min_vital_lab.groupby("icustay_id")["charttime"].rank(ascending=1,method='dense') == 1]

# Max of each lab and vitals
icustay_ids_charttime_max_lab = pivoted_lab[["icustay_id", "charttime"]][pivoted_lab.groupby("icustay_id")["charttime"].rank(ascending=0,method='dense') == 1]
icustay_ids_charttime_max_vital = pivoted_vital[["icustay_id", "charttime"]][pivoted_vital.groupby("icustay_id")["charttime"].rank(ascending=0,method='dense') == 1]
# Max of both combined
icustay_ids_charttime_max_vital_lab = pd.concat([icustay_ids_charttime_max_lab, icustay_ids_charttime_max_vital], ignore_index=True)
icustay_ids_charttime_max_vital_lab = icustay_ids_charttime_max_vital_lab[["icustay_id", "charttime"]][icustay_ids_charttime_max_vital_lab.groupby("icustay_id")["charttime"].rank(ascending=0,method='dense') == 1]

In [9]:
# Find for which icustay_ids there exist at least WINDOW_LENGTH of data
icustay_ids_vital_lab_charttime_min_max = pd.concat([icustay_ids_charttime_max_vital_lab, icustay_ids_charttime_min_vital_lab], ignore_index=True)
time_window = timedelta(days=4, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=WINDOW_LENGTH, weeks=0)
is_time_diff_bigger_window_lab = icustay_ids_vital_lab_charttime_min_max.groupby(['icustay_id'])['charttime'].transform(lambda x: (x.max()-x.min())) >= time_window

icustay_ids_vital_lab_charttime_min_max_filtered = icustay_ids_vital_lab_charttime_min_max[is_time_diff_bigger_window_lab]
print("Unique icu stays in icustay_ids_vital_lab_charttime_min_max_filtered after filtering", icustay_ids_vital_lab_charttime_min_max_filtered['icustay_id'].nunique())

# Keep only icustay ids for which at least WINDOW_LENGTH of data exists
icustay_ids_time_filtered = pd.DataFrame(icustay_ids_vital_lab_charttime_min_max_filtered['icustay_id'].unique(), columns=['icustay_id'])
print("Unique icu stays in icustay_ids_time_filtered: ", icustay_ids_time_filtered['icustay_id'].nunique())

Unique icu stays in icustay_ids_vital_lab_charttime_min_max_filtered after filtering 8409
Unique icu stays in icustay_ids_time_filtered:  8409


In [10]:
filtered_icustay_ids = filtered_icustay_ids.merge(icustay_ids_time_filtered, on='icustay_id', how='inner').drop_duplicates()

In [11]:
demographics_filtered = data.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print("Number of ICU stays demographics: ", demographics_filtered['icustay_id'].nunique())

vital_filtered = pivoted_vital.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print("Number of ICU stays vitals: ", vital_filtered['icustay_id'].nunique())

lab_filtered = pivoted_lab.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print("Number of ICU stays labs: ", lab_filtered['icustay_id'].nunique())

Number of ICU stays demographics:  8409
Number of ICU stays vitals:  8409
Number of ICU stays labs:  8409


In [12]:
vital_filtered = vital_filtered.merge(lab_filtered[['icustay_id', 'charttime']], on=['icustay_id', 'charttime'], how='outer').drop_duplicates()
print("Number of ICU stays in lab_filtered: ", vital_filtered['icustay_id'].nunique())
lab_filtered = lab_filtered.merge(vital_filtered[['icustay_id', 'charttime']], on=['icustay_id', 'charttime'], how='outer').drop_duplicates()
print("Number of ICU stays in lab_filtered: ", lab_filtered['icustay_id'].nunique())

Number of ICU stays in lab_filtered:  8409
Number of ICU stays in lab_filtered:  8409


In [13]:
vital_resampled = vital_filtered.copy()

# Resample from the end of the time series (how="last")
vital_resampled = vital_resampled.assign(charttime=vital_resampled.charttime.dt.round('H'))

# Resample from the beginning of the time series
vital_resampled = vital_resampled.set_index('charttime').groupby('icustay_id').resample('1H', origin="start").median().drop(['icustay_id'], axis = 1).reset_index()

# Forward and backwards fill (use lambda function instead of directly applying it to groupby otherwise results from one group are carreid forward to another group...BAD)
# Fill NaNs (-1)
vital_col = vital_resampled.columns.drop(['icustay_id', 'charttime'])
vital_resampled = vital_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[vital_col].transform(lambda x: x.ffill().bfill()).fillna(value=vital_resampled[['icustay_id', 'charttime', 'heartrate', 'sysbp', 'diasbp', 'meanbp','resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity','pedaledema', 'appetite_median']].median()).reset_index()


  vital_resampled = vital_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[vital_col].transform(lambda x: x.ffill().bfill()).fillna(value=vital_resampled[['icustay_id', 'charttime', 'heartrate', 'sysbp', 'diasbp', 'meanbp','resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity','pedaledema', 'appetite_median']].median()).reset_index()


In [14]:
lab_resampled = lab_filtered.copy()
# Cut out minutes and hours, so that the resampling of the 8h takes the same time span as the 1h samples (for vitals)
lab_resampled = lab_resampled.assign(charttime=lab_resampled.charttime.dt.round('H'))
# Resample from the end of the time series 
#lab_resampled = lab_resampled.set_index('charttime').groupby('icustay_id').resample('8h', origin="end").median().drop(['icustay_id'], axis = 1).reset_index()
lab_resampled = lab_resampled.set_index('charttime').groupby('icustay_id').resample('8h', origin="start").median().drop(['icustay_id'], axis = 1).reset_index()

# Forward and backwards fill (use transform instead of direct groupby otherwise results from one group are carreid forward to another group...BAD)
# Fill NaNs (-1 or 0 or mean!?)
lab_col = lab_resampled.columns.drop(['icustay_id', 'charttime'])
lab_resampled = lab_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[lab_col].transform(lambda x: x.ffill().bfill()).fillna(value=lab_resampled[['icustay_id', 'subject_id', 'charttime', 'aniongap', 'albumin', 'bands','bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose','hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt','inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria']].median()).reset_index()

print(lab_resampled.isnull().sum().sum())

  lab_resampled = lab_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[lab_col].transform(lambda x: x.ffill().bfill()).fillna(value=lab_resampled[['icustay_id', 'subject_id', 'charttime', 'aniongap', 'albumin', 'bands','bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose','hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt','inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria']].median()).reset_index()


730


In [15]:
delta_t_data = timedelta(days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=WINDOW_LENGTH, weeks=0)
demographics_windowed = demographics_filtered.copy()
demographics_windowed['predtime'] = demographics_windowed.intime + delta_t_data
demographics_windowed['delta_t_pred'] = demographics_windowed.outtime - demographics_windowed.predtime

demographics_windowed[['subject_id', 'icustay_id', 'intime', 'predtime', 'delta_t_pred']].head(5)

Unnamed: 0,subject_id,icustay_id,intime,predtime,delta_t_pred
0,334,214236,2136-01-16 10:56:48,2136-01-20 10:56:48,10 days 07:21:18
1,2005,285731,2163-06-23 11:28:06,2163-06-27 11:28:06,5 days 08:45:56
2,12174,284866,2118-10-30 16:48:57,2118-11-03 16:48:57,13 days 00:44:12
3,13535,205010,2196-10-10 22:03:14,2196-10-14 22:03:14,88 days 19:52:36
4,21824,241223,2107-07-07 20:58:00,2107-07-11 20:58:00,31 days 15:33:00


In [16]:
cut_icustay_ids = pd.DataFrame(demographics_windowed['icustay_id'].unique(), columns=['icustay_id'])
print("Number of ICU stays: ", cut_icustay_ids['icustay_id'].count())

vitals_cut = vital_resampled.merge(cut_icustay_ids, on='icustay_id', how='right')
print("Number of ICU stays in vitals_cut: ", vitals_cut['icustay_id'].nunique())

labs_cut = lab_resampled.merge(cut_icustay_ids, on='icustay_id', how='right')
print("Number of ICU stays in labs_cut: ", labs_cut['icustay_id'].nunique())


Number of ICU stays:  8409
Number of ICU stays in vitals_cut:  8409
Number of ICU stays in labs_cut:  8409


In [17]:
vitals_windowed = vital_resampled.merge(demographics_windowed[['icustay_id', 'predtime', 'delta_t_pred']], on='icustay_id', how='right')
vitals_windowed = vitals_windowed[vitals_windowed.charttime < vitals_windowed.predtime]
print("Number of ICU stays in vitals_windowed: ", vitals_windowed['icustay_id'].nunique())

labs_windowed = lab_resampled.merge(demographics_windowed[['icustay_id', 'predtime', 'delta_t_pred']], on='icustay_id', how='right')
labs_windowed = labs_windowed[labs_windowed.charttime < labs_windowed.predtime]
print("Number of ICU stays in labs_windowed: ", labs_windowed['icustay_id'].nunique())

windowed_icustay_ids = pd.DataFrame(pd.concat([vitals_windowed['icustay_id'], labs_windowed['icustay_id']]).unique(), columns=['icustay_id'])
demographics_windowed = demographics_windowed.merge(windowed_icustay_ids, on='icustay_id', how='right')

Number of ICU stays in vitals_windowed:  8405
Number of ICU stays in labs_windowed:  8405


In [18]:
vitals_windowed['ckd'] = vitals_windowed['icustay_id'].map(demographics_windowed.set_index('icustay_id')['ckd'])

In [19]:
vitals_windowed =vitals_windowed.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[vital_col].transform(lambda x: x.ffill().bfill()).fillna(-1).reset_index()

In [20]:
labs_windowed['ckd'] = labs_windowed['icustay_id'].map(demographics_windowed.set_index('icustay_id')['ckd'])

In [21]:
print("Number of ICU stays demographics: ", demographics_windowed['icustay_id'].nunique())
print("Number of CKD demographics:")
dd = demographics_windowed[['icustay_id','ckd']].drop_duplicates(subset=['icustay_id'])
print(dd['ckd'].value_counts())

print("Number of ICU stays vitals: ", vitals_windowed['icustay_id'].nunique())
print("Number of CKD vitals:")
dd = vitals_windowed[['icustay_id','ckd']].drop_duplicates(subset=['icustay_id'])
print(dd['ckd'].value_counts())

print("Number of ICU stays labs: ", labs_windowed['icustay_id'].nunique())
print("Number of CKD labs:")
dd = labs_windowed[['icustay_id','ckd']].drop_duplicates(subset=['icustay_id'])
print(dd['ckd'].value_counts())

Number of ICU stays demographics:  8405
Number of CKD demographics:
0    7868
1     537
Name: ckd, dtype: int64
Number of ICU stays vitals:  8405
Number of CKD vitals:
0    7868
1     537
Name: ckd, dtype: int64
Number of ICU stays labs:  8405
Number of CKD labs:
0    7868
1     537
Name: ckd, dtype: int64


In [22]:
def aggregate_dataframe(df, groupby_key, columns_to_aggregate):
    df = df.replace(-1, np.nan)
    result = df.groupby(groupby_key)[columns_to_aggregate].mean().reset_index()    
    return result

In [23]:
columns_to_merge = ['icustay_id', 'ckd','ethnicity_grouped']
df_cols_vitals = ['heartrate', 'sysbp','diasbp','meanbp','resprate','tempc','spo2','specificgravity','pedaledema','appetite_median']
df_agg_vitals = aggregate_dataframe(vitals_windowed, 'icustay_id', df_cols_vitals)
df_agg_vitals = df_agg_vitals.merge(demographics_windowed[columns_to_merge], on='icustay_id', how='inner')
df_agg_vitals['ckd_ethnicity'] = df_agg_vitals['ckd'].astype(str).str.cat(df_agg_vitals['ethnicity_grouped'].astype(str))

df_cols_labs = ['albumin','bacteria','glucose','bun','creatinine','sodium','potassium','hemoglobin','wbc','hematocrit','platelet','ptt']
df_agg_labs = aggregate_dataframe(labs_windowed, 'icustay_id', df_cols_labs)
df_agg_labs = df_agg_labs.merge(demographics_windowed[columns_to_merge], on='icustay_id', how='inner')
df_agg_labs['ckd_ethnicity'] = df_agg_labs['ckd'].astype(str).str.cat(df_agg_labs['ethnicity_grouped'].astype(str))

print("Vitals unique icustay id: ",len(df_agg_vitals['icustay_id'].unique()),"\nLabs unique icustay id: ",len(df_agg_labs['icustay_id'].unique()),"\nDemographics unique icustay id: ",len(demographics_windowed['icustay_id'].unique()))

Vitals unique icustay id:  8405 
Labs unique icustay id:  8405 
Demographics unique icustay id:  8405


In [24]:
df_agg_vitals_new=df_agg_vitals.drop(['ckd','ethnicity_grouped','ckd_ethnicity','pedaledema'],axis=1)
df_agg_labs_new=df_agg_labs.drop(['ckd','ethnicity_grouped','ckd_ethnicity'],axis=1)

In [25]:
merged_table_org = df_agg_labs_new.merge(df_agg_vitals_new, on='icustay_id', how='inner').merge(demographics_windowed, on='icustay_id', how='inner')

In [26]:
merged_table =merged_table_org.copy()

Table names : 
- demographics_windowed
- labs_windowed
- vitals_windowed
- df_agg_vitals
- df_agg_labs
- merged_table_org

In [27]:
# Calculate the difference between max and min charttime for labs_windowed
labs_diff = labs_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min())

# Calculate the difference between max and min charttime for vitals_windowed
vitals_diff = vitals_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min())

# Filter the icustay_id where the difference is grater than or equal to Window_length in both labs and vitals
filtered_icustay_ids = labs_diff[(labs_diff == pd.Timedelta(hours=WINDOW_LENGTH)) & (vitals_diff == pd.Timedelta(hours=WINDOW_LENGTH))].index.tolist()

# Print the length of icustay_id
print(len(filtered_icustay_ids))

3038


In [28]:
merged_table_filtered= merged_table[merged_table['icustay_id'].isin(filtered_icustay_ids)]

In [29]:
merged_table_filtered.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc',
       'spo2', 'specificgravity', 'appetite_median', 'subject_id', 'hadm_id',
       'gender', 'dod', 'admittime', 'dischtime', 'los_hospital',
       'admission_age', 'ethnicity', 'ethnicity_grouped',
       'hospital_expire_flag', 'hospstay_seq', 'first_hosp_stay', 'intime',
       'outtime', 'los_icu', 'icustay_seq', 'first_icu_stay_current_hosp',
       'first_icu_stay_patient', 'first_careunit', 'deathtime_icu',
       'label_death_icu', 'label_cor_art', 'diabetes_mellitus', 'ckd',
       'anemia_flag', 'predtime', 'delta_t_pred'],
      dtype='object')

In [30]:
merged_table_filtered=merged_table_filtered.drop(['subject_id','hadm_id','dod','admittime', 'dischtime','los_hospital','ethnicity','hospital_expire_flag','hospstay_seq', 'first_hosp_stay', 'intime','outtime', 'los_icu', 'icustay_seq', 'first_icu_stay_current_hosp','first_icu_stay_patient', 'first_careunit', 'deathtime_icu','label_death_icu', 'predtime', 'delta_t_pred'],axis=1)

In [31]:
age_ranges = [0, 9, 19, 29, 39, 49, 59, 69, 79, 89, 400]
age_labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90+']
merged_table_filtered['age_group'] = pd.cut(merged_table_filtered['admission_age'], bins=age_ranges, labels=age_labels, right=False)
merged_table_filtered=merged_table_filtered.drop('admission_age',axis=1)

In [32]:
def evaluationCV(classifier,X, y):    
    cv_scores_pr = cross_val_score(classifier, X, y, cv=5, scoring='precision')    
    cv_scores_rc = cross_val_score(classifier, X, y, cv=5, scoring='recall')    
    cv_scores_f1 = cross_val_score(classifier,X, y, cv=5, scoring='f1')
    cv_scores_ac = cross_val_score(classifier, X, y, cv=5, scoring='accuracy') 
    
    print("Cross-validation scores Precision    :", cv_scores_pr)    
    print("Cross-validation scores Recall       :", cv_scores_rc)
    print("Cross-validation scores F1           :", cv_scores_f1)
    print("Cross-validation scores Accuracy     :", cv_scores_ac)
    
    print("Mean cross-validation score Precision:", np.mean(cv_scores_pr))
    print("Mean cross-validation score Recall   :", np.mean(cv_scores_rc))
    print("Mean cross-validation score F1       :", np.mean(cv_scores_f1))
    print("Mean cross-validation score Accuracy :", np.mean(cv_scores_ac))

In [33]:
def evaluationTest(classifier,X, y):  
    y_pred = classifier.predict(X)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    accuracy = accuracy_score(y, y_pred)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Accuracy:", accuracy)
    return pd.DataFrame(y_pred)    

In [34]:
def metricsReport(y,y_pred):
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    accuracy = accuracy_score(y, y_pred)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Accuracy:", accuracy)
    return precision, recall , f1, accuracy

#### Data & Class separation

In [35]:
X = merged_table_filtered.drop(['ckd','icustay_id'],axis=1)
y = merged_table_filtered['ckd']

#### Trial 1:  Random forest for static + aggregated timeseries

In [36]:
X_onehot = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.20, stratify=y, random_state=42)

In [37]:
y_train.value_counts()

0    2305
1     125
Name: ckd, dtype: int64

In [38]:
y_test.value_counts()

0    577
1     31
Name: ckd, dtype: int64

In [39]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

In [40]:
param_grid_rcv = {
    'n_estimators': randint(50, 500),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : randint(1, 10),
    'criterion' :['gini', 'entropy']
}

In [41]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2','auto']
}

In [42]:
#runder = RandomUnderSampler(random_state=42)
#X_resampled, y_resampled = runder.fit_resample(X_train, y_train)
rf_merged = RandomForestClassifier(random_state=42)

#grid_search = RandomizedSearchCV(estimator=rf_merged, param_distributions=param_grid_rcv, n_iter=100, cv=5, random_state=42)

#grid_search = GridSearchCV(rf_merged, param_grid, cv=5)
#grid_search.fit(X_train, y_train)
#grid_search.best_params_

In [43]:
evaluationCV(rf_merged,X_train, y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cross-validation scores Precision    : [0. 0. 0. 0. 0.]
Cross-validation scores Recall       : [0. 0. 0. 0. 0.]
Cross-validation scores F1           : [0. 0. 0. 0. 0.]
Cross-validation scores Accuracy     : [0.94855967 0.94444444 0.94855967 0.94855967 0.94650206]
Mean cross-validation score Precision: 0.0
Mean cross-validation score Recall   : 0.0
Mean cross-validation score F1       : 0.0
Mean cross-validation score Accuracy : 0.9473251028806585


In [44]:
rf_merged_2 = RandomForestClassifier(random_state=42,
                                     max_depth = None, 
                                     max_features = 'sqrt',
                                     min_samples_leaf = 1,
                                     min_samples_split = 1,
                                     n_estimators= 100)

rf_merged_2.fit(X_train, y_train)

In [45]:
evaluationCV(rf_merged_2,X_train, y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cross-validation scores Precision    : [0. 0. 0. 0. 0.]
Cross-validation scores Recall       : [0. 0. 0. 0. 0.]
Cross-validation scores F1           : [0. 0. 0. 0. 0.]
Cross-validation scores Accuracy     : [0.94855967 0.94444444 0.94855967 0.94855967 0.94650206]
Mean cross-validation score Precision: 0.0
Mean cross-validation score Recall   : 0.0
Mean cross-validation score F1       : 0.0
Mean cross-validation score Accuracy : 0.9473251028806585


In [46]:
y_pred_merged_2 = evaluationTest(rf_merged_2,X_test, y_test)

Precision: 0.5
Recall: 0.03225806451612903
F1 Score: 0.06060606060606061
Accuracy: 0.9490131578947368


In [47]:
y_pred_merged_2.value_counts()

0    606
1      2
dtype: int64

#### Trial 2 - RandomUnderSampler

In [48]:
X_top =X[['creatinine', 'specificgravity', 'heartrate', 'bun', 'spo2', 'tempc', 'platelet', 'diasbp', 'bacteria', 'meanbp']]
X_onehot = pd.get_dummies(X_top)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.20, stratify=y, random_state=42)

undersampler = RandomUnderSampler()
#undersampler = RandomUnderSampler(sampling_strategy={0: 250})
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

rf_ros = RandomForestClassifier(random_state=42,
                                max_depth = None, 
                                max_features = 'sqrt',
                                min_samples_leaf = 1,
                                min_samples_split = 1,
                                n_estimators= 100)

rf_ros.fit(X_resampled, y_resampled)
evaluationCV(rf_ros,X_resampled, y_resampled)

rf_ros.fit(X_resampled, y_resampled)
y_pred_ros = evaluationTest(rf_ros,X_test, y_test)
y_pred_ros.value_counts()

Cross-validation scores Precision    : [0.68965517 0.63636364 0.82758621 0.76923077 0.84210526]
Cross-validation scores Recall       : [0.8  0.84 0.96 0.8  0.64]
Cross-validation scores F1           : [0.74074074 0.72413793 0.88888889 0.78431373 0.72727273]
Cross-validation scores Accuracy     : [0.72 0.68 0.88 0.78 0.76]
Mean cross-validation score Precision: 0.752988209612529
Mean cross-validation score Recall   : 0.808
Mean cross-validation score F1       : 0.7730708026854071
Mean cross-validation score Accuracy : 0.7639999999999999
Precision: 0.15730337078651685
Recall: 0.9032258064516129
F1 Score: 0.2679425837320574
Accuracy: 0.7483552631578947


0    430
1    178
dtype: int64

In [49]:
cm = confusion_matrix(y_test, y_pred_ros)
print(cm)

[[427 150]
 [  3  28]]


In [50]:
X_onehot = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.20, stratify=y, random_state=42)

undersampler = RandomUnderSampler()
#undersampler = RandomUnderSampler(sampling_strategy={0: 250})
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

rf_ros = RandomForestClassifier(random_state=42,
                                max_depth = None, 
                                max_features = 'sqrt',
                                min_samples_leaf = 1,
                                min_samples_split = 1,
                                n_estimators= 100)

rf_ros.fit(X_resampled, y_resampled)
evaluationCV(rf_ros,X_resampled, y_resampled)

rf_ros.fit(X_resampled, y_resampled)
y_pred_ros = evaluationTest(rf_ros,X_test, y_test)
y_pred_ros.value_counts()

Cross-validation scores Precision    : [0.60606061 0.79310345 0.72413793 0.82608696 0.65384615]
Cross-validation scores Recall       : [0.8  0.92 0.84 0.76 0.68]
Cross-validation scores F1           : [0.68965517 0.85185185 0.77777778 0.79166667 0.66666667]
Cross-validation scores Accuracy     : [0.64 0.84 0.76 0.8  0.66]
Mean cross-validation score Precision: 0.7206470191477687
Mean cross-validation score Recall   : 0.8
Mean cross-validation score F1       : 0.7555236270753511
Mean cross-validation score Accuracy : 0.74
Precision: 0.15384615384615385
Recall: 0.967741935483871
F1 Score: 0.26548672566371684
Accuracy: 0.7269736842105263


0    413
1    195
dtype: int64

#### Trial 3: BRF

In [51]:
X_onehot = pd.get_dummies(X_top)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.20, stratify=y, random_state=42)

ros = SMOTE(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

brf = BalancedRandomForestClassifier(n_estimators=100)
brf.fit(X_resampled, y_resampled)

evaluationCV(brf,X_resampled, y_resampled)

y_pred_brf = evaluationTest(brf,X_test, y_test)
y_pred_brf.value_counts()















































































































































































































































Cross-validation scores Precision    : [0.93722944 0.94594595 0.95147679 0.94045175 0.9443299 ]
Cross-validation scores Recall       : [0.92624729 0.99349241 0.97830803 0.98698482 0.98915401]
Cross-validation scores F1           : [0.93275488 0.96698616 0.96493092 0.96398305 0.96515312]
Cross-validation scores Accuracy     : [0.93058568 0.96420824 0.96420824 0.96854664 0.96420824]
Mean cross-validation score Precision: 0.9438867637422843
Mean cross-validation score Recall   : 0.9748373101952279
Mean cross-validation score F1       : 0.9587616253349657
Mean cross-validation score Accuracy : 0.958351409978308
Precision: 0.25
Recall: 0.3225806451612903
F1 Score: 0.28169014084507044
Accuracy: 0.9161184210526315


0    568
1     40
dtype: int64

In [52]:
X_onehot = pd.get_dummies(X_top)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.20, stratify=y, random_state=42)

undersampler = RandomUnderSampler()
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

brf = BalancedRandomForestClassifier(n_estimators=100)
brf.fit(X_resampled, y_resampled)

evaluationCV(brf,X_resampled, y_resampled)

y_pred_brf = evaluationTest(brf,X_test, y_test)
y_pred_brf.value_counts()

































































Cross-validation scores Precision    : [0.80769231 0.80769231 0.73076923 0.67857143 0.75      ]
Cross-validation scores Recall       : [0.88 0.8  0.76 0.84 0.72]
Cross-validation scores F1           : [0.83018868 0.80769231 0.74509804 0.72727273 0.70588235]
Cross-validation scores Accuracy     : [0.82 0.86 0.76 0.74 0.7 ]
Mean cross-validation score Precision: 0.7549450549450549
Mean cross-validation score Recall   : 0.8
Mean cross-validation score F1       : 0.7632268212734361
Mean cross-validation score Accuracy : 0.776
Precision: 0.15675675675675677
Recall: 0.9354838709677419
F1 Score: 0.2685185185185185
Accuracy: 0.7401315789473685




0    423
1    185
dtype: int64

# Multimodal

demographics_windowed

merged_table

--------------------------------
static_demo_comorb

labs_windowed

vitals_windowed

In [54]:
age_ranges = [0, 9, 19, 29, 39, 49, 59, 69, 79, 89, 400]
age_labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90+']
demographics_windowed['age_group'] = pd.cut(merged_table['admission_age'], bins=age_ranges, labels=age_labels, right=False)

In [55]:
static_demo_comorb = demographics_windowed[['icustay_id','gender', 'ethnicity_grouped', 'label_cor_art', 'diabetes_mellitus', 'anemia_flag', 'age_group', 'ckd']]

In [56]:
# Get the unique icustay_id values from each DataFrame
icustay_id_df1 = set(static_demo_comorb['icustay_id'])
icustay_id_df2 = set(labs_windowed['icustay_id'])
icustay_id_df3 = set(vitals_windowed['icustay_id'])

# Check for missing icustay_id values
missing_from_df1 = icustay_id_df2.union(icustay_id_df3) - icustay_id_df1
missing_from_df2 = icustay_id_df1.union(icustay_id_df3) - icustay_id_df2
missing_from_df3 = icustay_id_df1.union(icustay_id_df2) - icustay_id_df3

# Print the missing icustay_id values
if missing_from_df1:
    print(f"Icustay_id missing from static_demo_comorb: {missing_from_df1}")
else:
    print("No icustay_id missing from static_demo_comorb")

if missing_from_df2:
    print(f"Icustay_id missing from labs_windowed: {missing_from_df2}")
else:
    print("No icustay_id missing from labs_windowed")

if missing_from_df3:
    print(f"Icustay_id missing from vitals_windowed: {missing_from_df3}")
else:
    print("No icustay_id missing from vitals_windowed")



No icustay_id missing from static_demo_comorb
No icustay_id missing from labs_windowed
No icustay_id missing from vitals_windowed


In [57]:
# Calculate the difference between max and min charttime for labs_windowed
labs_diff = labs_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min()).to_frame()
labs_diff.min()

charttime   2 days 16:00:00
dtype: timedelta64[ns]

In [58]:
# Calculate the difference between max and min charttime for vitals_windowed
vitals_diff = vitals_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min()).to_frame()
vitals_diff.min()

charttime   2 days 23:00:00
dtype: timedelta64[ns]

In [59]:
WINDOW_LENGTH_NEW = 96

In [60]:
filtered_icustay_ids = labs_diff[(labs_diff == pd.Timedelta(hours=WINDOW_LENGTH_NEW)) & (vitals_diff == pd.Timedelta(hours=WINDOW_LENGTH_NEW))].index.tolist()
print("Total : ",len(filtered_icustay_ids))
print(static_demo_comorb['ckd'].value_counts())

Total :  8405
0    7868
1     537
Name: ckd, dtype: int64


## Random Forest - Comorbidity & Demographics

In [61]:
def RandomForestForMulti(X,y):    
    X_onehot=pd.get_dummies(X)
    X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.2,stratify = y, random_state=42)
    
    rf_static_demo_comorb = RandomForestClassifier(n_estimators=300, random_state=42)
    
    #{'max_depth': None,
    # 'max_features': 'sqrt',
    # 'min_samples_leaf': 2,
    # 'min_samples_split': 2,
    # 'n_estimators': 200}
    
    rf_static_demo_comorb_best = RandomForestClassifier(n_estimators=200, 
                                                        max_depth=None,
                                                        min_samples_leaf=2,
                                                        min_samples_split=2,
                                                        max_features='sqrt',
                                                        random_state=42)
    
    rf_static_demo_comorb_best.fit(X_train, y_train)
    evaluationCV(rf_static_demo_comorb_best,X_train, y_train)
    
    y_pred = evaluationTest(rf_static_demo_comorb_best,X_test, y_test)
    print(y_pred.value_counts())

In [62]:
#static_demo_comorb_64 = static_demo_comorb[static_demo_comorb['icustay_id'].isin(filtered_icustay_ids)]

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

X=static_demo_comorb.drop(['ckd','icustay_id'],axis=1)
y=static_demo_comorb['ckd']

RandomForestForMulti(X,y)

  _warn_prf(average, modifier, msg_start, len(result))


Cross-validation scores Precision    : [0. 1. 1. 0. 1.]
Cross-validation scores Recall       : [0.         0.01162791 0.02325581 0.         0.02325581]
Cross-validation scores F1           : [0.         0.02298851 0.04545455 0.         0.04545455]
Cross-validation scores Accuracy     : [0.93457249 0.93680297 0.93754647 0.93605948 0.9375    ]
Mean cross-validation score Precision: 0.6
Mean cross-validation score Recall   : 0.011627906976744186
Mean cross-validation score F1       : 0.022779519331243465
Mean cross-validation score Accuracy : 0.9364962825278811
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Accuracy: 0.9363474122546104
0    1681
dtype: int64


  _warn_prf(average, modifier, msg_start, len(result))


#### Results with Comorbidity & Demographics
Mean cross-validation score Precision: 0.6

Mean cross-validation score Recall   : 0.011627906976744186

Mean cross-validation score F1       : 0.022779519331243465

Mean cross-validation score Accuracy : 0.9364962825278811

Precision: 0.0

Recall: 0.0

F1 Score: 0.0

Accuracy: 0.9363474122546104

In [63]:
X=static_demo_comorb.drop(['ckd','icustay_id'],axis=1)
y=static_demo_comorb['ckd']

undersampler = RandomUnderSampler()
X, y = undersampler.fit_resample(X, y)

RandomForestForMulti(X,y)

Cross-validation scores Precision    : [0.7260274  0.6091954  0.68571429 0.63076923 0.62711864]
Cross-validation scores Recall       : [0.61627907 0.61627907 0.55813953 0.47674419 0.43023256]
Cross-validation scores F1           : [0.66666667 0.61271676 0.61538462 0.54304636 0.51034483]
Cross-validation scores Accuracy     : [0.69186047 0.61046512 0.65116279 0.59883721 0.58479532]
Mean cross-validation score Precision: 0.6557649920220876
Mean cross-validation score Recall   : 0.5395348837209302
Mean cross-validation score F1       : 0.5896318460518327
Mean cross-validation score Accuracy : 0.6274241806065552
Precision: 0.717948717948718
Recall: 0.5233644859813084
F1 Score: 0.6054054054054053
Accuracy: 0.6604651162790698
0    137
1     78
dtype: int64


#### Results with undersampled - Demographics & Comorbidity

Mean cross-validation score Precision: 0.6155532512346373

Mean cross-validation score Recall   : 0.5813953488372092

Mean cross-validation score F1       : 0.5960233366420532

Mean cross-validation score Accuracy : 0.6110839113287094

Precision: 0.6170212765957447

Recall: 0.5420560747663551

F1 Score: 0.5771144278606966

Accuracy: 0.6046511627906976

## Time series - Data preparation

In [64]:
def print_unique_shape(grouped_data,feature_cols):
    previous_shape = []
    for _, group in grouped_data:
        group_values = group[feature_cols].values.T
        if group_values.shape not in previous_shape:        
            print(group_values.shape)
            previous_shape.append(group_values.shape)

In [65]:
def check_missing_and_extras(data_windowed,feature_cols, threshold):
    data_windowed_new = data_windowed.copy()
    df_counts = data_windowed_new.groupby('icustay_id').count()
    icustay_ids_more_records = df_counts[df_counts['charttime'] > threshold].index
    icustay_ids_less_records = df_counts[df_counts['charttime'] < threshold].index
    previous_shape = []
    
    for icustay_id in icustay_ids_more_records:
        df_grouped = data_windowed_new[data_windowed_new['icustay_id'] == icustay_id]
        if df_grouped.shape not in previous_shape:        
            print(f"There are records with more than {threshold} readings : {df_grouped.shape}")
            previous_shape.append(df_grouped.shape)
        
        # Check if the time span is more than 4 days
        if (df_grouped['charttime'].max() - df_grouped['charttime'].min()).days > 4:
            print(f"icustay_id: {icustay_id} has a time span of more than 4 days.")
        
        # Check for duplicate records
        if df_grouped.duplicated().sum() > 0:
            print(f"icustay_id: {icustay_id} has {df_grouped.duplicated().sum()} duplicate records.")
            
    for icustay_id in icustay_ids_less_records:
        print(f"icustay_id: {icustay_id} has a time span less than {threshold} records")


In [66]:
def backward_forward_fill(data_windowed, time_interval, threshold, feature_cols):    
    data_windowed_new = data_windowed.copy()
    df_counts = data_windowed_new.groupby('icustay_id').count()
    icustay_ids_fewer_records = df_counts[df_counts['charttime'] < threshold].index
    
    def resample_and_fill(df_grouped):
        df_grouped = df_grouped.set_index('charttime').resample(time_interval).ffill().bfill()
        return df_grouped
    
    for icustay_id in icustay_ids_fewer_records:
        df_grouped = data_windowed_new[data_windowed_new['icustay_id'] == icustay_id]
        df_filled = resample_and_fill(df_grouped)
        data_windowed_new.loc[data_windowed_new['icustay_id'] == icustay_id] = df_filled.reset_index()
        
    data_windowed_new['charttime'] = pd.to_datetime(data_windowed_new['charttime'])                  
    data_windowed_new.sort_values(['icustay_id', 'charttime'])
    data_grouped_new = data_windowed_new[['icustay_id'] + feature_cols + ['ckd']].groupby(['icustay_id']).head(threshold)
    
    data_grouped_new = data_grouped_new[['icustay_id'] + feature_cols + ['ckd']].groupby(['icustay_id'])
    
    return data_grouped_new

### Labs

In [67]:
feature_labs= ['aniongap', 'albumin', 'bands',
       'bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose',
       'hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt',
       'inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria']

labs_windowed['charttime'] = pd.to_datetime(labs_windowed['charttime'])                  
labs_windowed.sort_values(['icustay_id', 'charttime'])
#labs_grouped_for_check = labs_windowed[['icustay_id'] + feature_labs + ['ckd']].groupby(['icustay_id'])

Unnamed: 0,icustay_id,charttime,subject_id,aniongap,albumin,bands,bicarbonate,bilirubin,creatinine,chloride,...,ptt,inr,pt,sodium,bun,wbc,bacteria,ckd,predtime,delta_t_pred
185517,200017,2138-03-17 22:00:00,15909.0,19.0,2.6,1.0,19.0,4.6,1.0,111.0,...,36.0,1.3,14.5,144.0,29.0,5.2,2.0,0,2138-03-21 21:54:36,53 days 19:18:05
185518,200017,2138-03-18 06:00:00,15909.0,19.0,2.6,1.0,19.0,4.6,1.0,111.0,...,36.0,1.3,14.5,144.0,29.0,5.2,2.0,0,2138-03-21 21:54:36,53 days 19:18:05
185519,200017,2138-03-18 14:00:00,15909.0,19.0,2.6,1.0,19.0,4.6,1.0,111.0,...,36.0,1.3,14.5,144.0,29.0,5.2,2.0,0,2138-03-21 21:54:36,53 days 19:18:05
185520,200017,2138-03-18 22:00:00,15909.0,19.0,2.6,1.0,19.0,4.6,1.0,111.0,...,36.0,1.3,14.5,144.0,29.0,5.2,2.0,0,2138-03-21 21:54:36,53 days 19:18:05
185521,200017,2138-03-19 06:00:00,15909.0,18.0,2.6,1.0,22.0,5.2,1.0,107.0,...,36.0,1.3,14.5,142.0,29.0,5.2,2.0,0,2138-03-21 21:54:36,53 days 19:18:05
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
464589,299992,2105-11-24 06:00:00,13083.0,13.0,2.6,3.0,25.0,0.3,0.6,112.0,...,25.1,1.0,12.3,146.0,10.0,12.2,1.0,0,2105-11-25 21:14:34,16 days 19:31:36
464590,299992,2105-11-24 14:00:00,13083.0,13.0,2.6,3.0,25.0,0.3,0.6,112.0,...,25.1,1.0,12.3,146.0,10.0,12.2,1.0,0,2105-11-25 21:14:34,16 days 19:31:36
464591,299992,2105-11-24 22:00:00,13083.0,15.0,2.6,3.0,26.0,0.3,0.6,109.0,...,22.8,1.0,12.3,146.0,11.0,8.7,1.0,0,2105-11-25 21:14:34,16 days 19:31:36
464592,299992,2105-11-25 06:00:00,13083.0,15.0,2.6,3.0,26.0,0.3,0.6,109.0,...,22.8,1.0,12.3,146.0,11.0,8.7,1.0,0,2105-11-25 21:14:34,16 days 19:31:36


In [68]:
check_missing_and_extras(labs_windowed,feature_labs, 12)

There are records with more than 12 readings : (13, 26)
There are records with more than 12 readings : (14, 26)
icustay_id: 205858 has a time span less than 12 records
icustay_id: 208605 has a time span less than 12 records
icustay_id: 213950 has a time span less than 12 records
icustay_id: 218035 has a time span less than 12 records
icustay_id: 219817 has a time span less than 12 records
icustay_id: 222223 has a time span less than 12 records
icustay_id: 238595 has a time span less than 12 records
icustay_id: 239959 has a time span less than 12 records
icustay_id: 242499 has a time span less than 12 records
icustay_id: 242950 has a time span less than 12 records
icustay_id: 250732 has a time span less than 12 records
icustay_id: 252983 has a time span less than 12 records
icustay_id: 264963 has a time span less than 12 records
icustay_id: 268945 has a time span less than 12 records
icustay_id: 286918 has a time span less than 12 records
icustay_id: 286937 has a time span less than 12 

In [69]:
labs_windowed_filled = backward_forward_fill(labs_windowed, '8H', 12, feature_labs)

In [70]:
print_unique_shape(labs_windowed_filled,feature_labs)

  for _, group in grouped_data:


(20, 12)


### Vitals

In [71]:
feature_vitals = ['heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 
                  'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity', 'appetite_median']

vitals_windowed['charttime'] = pd.to_datetime(vitals_windowed['charttime']) 
vitals_windowed.sort_values(['icustay_id', 'charttime','ckd'])
vital_grouped = vitals_windowed[['icustay_id'] + feature_vitals + ['ckd']].groupby(['icustay_id'])

In [72]:
check_missing_and_extras(vitals_windowed,feature_vitals, 96)

There are records with more than 96 readings : (98, 15)
There are records with more than 96 readings : (97, 15)
There are records with more than 96 readings : (100, 15)
There are records with more than 96 readings : (99, 15)
There are records with more than 96 readings : (106, 15)
There are records with more than 96 readings : (105, 15)
There are records with more than 96 readings : (108, 15)
There are records with more than 96 readings : (104, 15)
There are records with more than 96 readings : (102, 15)
There are records with more than 96 readings : (101, 15)
There are records with more than 96 readings : (107, 15)
There are records with more than 96 readings : (103, 15)
icustay_id: 200033 has a time span less than 96 records
icustay_id: 200103 has a time span less than 96 records
icustay_id: 200191 has a time span less than 96 records
icustay_id: 200325 has a time span less than 96 records
icustay_id: 200487 has a time span less than 96 records
icustay_id: 200488 has a time span less

In [73]:
vitals_windowed_new = backward_forward_fill(vitals_windowed, '1H', 96, feature_vitals)

In [74]:
print_unique_shape(vitals_windowed_new,feature_vitals)

  for _, group in grouped_data:


(11, 96)


## Rocket - Time series model

In [94]:
def RocketMulti(grouped_data,feature_columns,resampling=False):     
    X = []
    y = []
    for _, group in grouped_data:
        group_values = group[feature_columns].values.T
        num_timestamps = group_values.shape[1]
        X.append(group_values)
        y.append(group['ckd'].iloc[0])
          
    X = np.array(X)
    y = np.array(y)
    
    n_samples, n_features, n_channels = X.shape
    X_2d = X.reshape((n_samples, n_features*n_channels))
    
    X_train_2d, X_test_2d, y_train, y_test = train_test_split(X_2d, y, test_size=0.2, stratify=y)
    
    if (resampling == True):
        undersampler = RandomUnderSampler()
        X_train_2d, y_train = undersampler.fit_resample(X_train_2d, y_train)
    
    X_train = X_train_2d.reshape((X_train_2d.shape[0], n_features, n_channels))
    X_test = X_test_2d.reshape((X_test_2d.shape[0], n_features, n_channels))

    print("X_train shape: ",X_train.shape,"\ny_train shape: ",y_train.shape)
    
    rocket = Rocket(num_kernels=100, random_state=42)
    rocket.fit(X_train)
    
    X_train_transformed = rocket.transform(X_train)
    X_test_transformed = rocket.transform(X_test)

    # Reshape transformed data back into 2D for logistic regression
    X_train_transformed_2d = X_train_transformed.values.reshape((X_train_transformed.shape[0], -1))
    X_test_transformed_2d = X_test_transformed.values.reshape((X_test_transformed.shape[0], -1))
    
    clf = LogisticRegression(random_state=42, max_iter=1000)
    clf.fit(X_train_transformed_2d, y_train)
    
    evaluationCV(clf,X_train_transformed_2d, y_train)
    y_pred = evaluationTest(clf,X_test_transformed_2d, y_test)
    
    return y_pred


In [95]:
y_pred_labs_1 = RocketMulti(labs_windowed_filled,feature_labs)

  for _, group in grouped_data:


X_train shape:  (6710, 20, 12) 
y_train shape:  (6710,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores Precision    : [0.33333333 0.16666667 0.         0.         0.        ]
Cross-validation scores Recall       : [0.01176471 0.01176471 0.         0.         0.        ]
Cross-validation scores F1           : [0.02272727 0.02197802 0.         0.         0.        ]
Cross-validation scores Accuracy     : [0.93591654 0.93368107 0.93442623 0.93293592 0.93442623]
Mean cross-validation score Precision: 0.1
Mean cross-validation score Recall   : 0.004705882352941176
Mean cross-validation score F1       : 0.00894105894105894
Mean cross-validation score Accuracy : 0.9342771982116245
Precision: 0.6666666666666666
Recall: 0.018691588785046728
F1 Score: 0.03636363636363636
Accuracy: 0.9368295589988082


In [96]:
y_pred_labs_2 = RocketMulti(labs_windowed_filled,feature_labs, resampling = True)

  for _, group in grouped_data:


X_train shape:  (856, 20, 12) 
y_train shape:  (856,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores Precision    : [0.62068966 0.5890411  0.59090909 0.55913978 0.52631579]
Cross-validation scores Recall       : [0.62790698 0.50588235 0.61176471 0.60465116 0.46511628]
Cross-validation scores F1           : [0.62427746 0.5443038  0.60115607 0.58100559 0.49382716]
Cross-validation scores Accuracy     : [0.62209302 0.57894737 0.59649123 0.56140351 0.52046784]
Mean cross-validation score Precision: 0.5772190832783672
Mean cross-validation score Recall   : 0.5630642954856361
Mean cross-validation score F1       : 0.5689140141131842
Mean cross-validation score Accuracy : 0.5758805929552564
Precision: 0.09452054794520548
Recall: 0.6448598130841121
F1 Score: 0.16487455197132617
Accuracy: 0.5834326579261025


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [97]:
y_pred_vitals_1 = RocketMulti(vitals_windowed_new,feature_vitals)

  for _, group in grouped_data:


X_train shape:  (5792, 11, 96) 
y_train shape:  (5792,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please als

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores Precision    : [0. 0. 1. 0. 0.]
Cross-validation scores Recall       : [0.         0.         0.01538462 0.         0.        ]
Cross-validation scores F1           : [0.         0.         0.03030303 0.         0.        ]
Cross-validation scores Accuracy     : [0.94305436 0.94305436 0.9447323  0.94386874 0.94214162]
Mean cross-validation score Precision: 0.2
Mean cross-validation score Recall   : 0.003076923076923077
Mean cross-validation score F1       : 0.0060606060606060615
Mean cross-validation score Accuracy : 0.9433702748334353
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Accuracy: 0.9434092477570738


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, msg_start, len(result))


In [98]:
y_pred_vitals_2 = RocketMulti(vitals_windowed_new,feature_vitals, resampling=True)

  for _, group in grouped_data:


X_train shape:  (654, 11, 96) 
y_train shape:  (654,)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-validation scores Precision    : [0.62666667 0.5625     0.64864865 0.67142857 0.63934426]
Cross-validation scores Recall       : [0.72307692 0.69230769 0.72727273 0.71212121 0.6       ]
Cross-validation scores F1           : [0.67142857 0.62068966 0.68571429 0.69117647 0.61904762]
Cross-validation scores Accuracy     : [0.64885496 0.58015267 0.66412214 0.67938931 0.63076923]
Mean cross-validation score Precision: 0.6297176298077938
Mean cross-validation score Recall   : 0.6909557109557108
Mean cross-validation score F1       : 0.6576113203902251
Mean cross-validation score Accuracy : 0.6406576629477393
Precision: 0.09384615384615384
Recall: 0.7439024390243902
F1 Score: 0.16666666666666666
Accuracy: 0.5790200138026225


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
