In [1]:
import psycopg2
from datetime import timedelta
from sqlalchemy import create_engine
import psycopg2
import pandas as pd
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score, recall_score,f1_score, accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
from sktime.transformations.panel.rocket import Rocket
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import VotingClassifier
from sklearn.pipeline import Pipeline

from imblearn.ensemble import BalancedRandomForestClassifier
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler

In [2]:
import psycopg2

conn = psycopg2.connect(
    host="localhost",
    database="mimic",
    user="postgres",
    password="postgres"
)

cur = conn.cursor()

cur.execute("SELECT version();")
print(cur.fetchone())

('PostgreSQL 15.2, compiled by Visual C++ build 1914, 64-bit',)


In [3]:
# Connect to db
conn = psycopg2.connect(host='localhost', dbname='mimic', user='postgres', password='postgres', options='-c search_path=mimiciii')
cur = conn.cursor() 

# Read in table with patients & admissions (inner join on subject_id) and icu_stays (inner joinon subject_id and hadm_id)
icustay_details = pd.read_sql_query("SELECT * FROM mimiciii.flicu_icustay_detail;", conn)

# Read in vital and lab signs
pivoted_vital = pd.read_sql_query("SELECT * FROM mimiciii.pivoted_vital;", conn)
pivoted_lab = pd.read_sql_query("SELECT * FROM mimiciii.ckd_pivoted_lab;", conn)

# Close the cursor and connection to so the server can allocate bandwidth to other requests
cur.close()
conn.close()

  icustay_details = pd.read_sql_query("SELECT * FROM mimiciii.flicu_icustay_detail;", conn)
  pivoted_vital = pd.read_sql_query("SELECT * FROM mimiciii.pivoted_vital;", conn)
  pivoted_lab = pd.read_sql_query("SELECT * FROM mimiciii.ckd_pivoted_lab;", conn)


In [4]:
WINDOW_LENGTH = 96

In [5]:
data= icustay_details.copy()
data = data[data.los_icu >= WINDOW_LENGTH/24.0]

In [6]:
filtered_icustay_ids = pd.DataFrame(data['icustay_id'].unique(), columns=['icustay_id'])

In [7]:
# Drop measurements with no belonging icustay_id
pivoted_vital = pivoted_vital.dropna(subset=['icustay_id'])
pivoted_lab = pivoted_lab.dropna(subset=['icustay_id'])

# Cast icustay_id types to int
pivoted_vital['icustay_id'] = pivoted_vital['icustay_id'].astype(int)
pivoted_lab['icustay_id'] = pivoted_lab['icustay_id'].astype(int)

# Keep only values of patients in previously filtered icustay_ids in labs and vitals
pivoted_vital = pivoted_vital.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
pivoted_lab = pivoted_lab.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()

In [8]:
# Min of each lab and vitals
icustay_ids_charttime_min_lab = pivoted_lab[["icustay_id", "charttime"]][pivoted_lab.groupby("icustay_id")["charttime"].rank(ascending=1,method='dense') == 1]
icustay_ids_charttime_min_vital = pivoted_vital[["icustay_id", "charttime"]][pivoted_vital.groupby("icustay_id")["charttime"].rank(ascending=1,method='dense') == 1]
# Min of both combined
icustay_ids_charttime_min_vital_lab = pd.concat([icustay_ids_charttime_min_lab, icustay_ids_charttime_min_vital], ignore_index=True)
icustay_ids_charttime_min_vital_lab = icustay_ids_charttime_min_vital_lab[["icustay_id", "charttime"]][icustay_ids_charttime_min_vital_lab.groupby("icustay_id")["charttime"].rank(ascending=1,method='dense') == 1]

# Max of each lab and vitals
icustay_ids_charttime_max_lab = pivoted_lab[["icustay_id", "charttime"]][pivoted_lab.groupby("icustay_id")["charttime"].rank(ascending=0,method='dense') == 1]
icustay_ids_charttime_max_vital = pivoted_vital[["icustay_id", "charttime"]][pivoted_vital.groupby("icustay_id")["charttime"].rank(ascending=0,method='dense') == 1]
# Max of both combined
icustay_ids_charttime_max_vital_lab = pd.concat([icustay_ids_charttime_max_lab, icustay_ids_charttime_max_vital], ignore_index=True)
icustay_ids_charttime_max_vital_lab = icustay_ids_charttime_max_vital_lab[["icustay_id", "charttime"]][icustay_ids_charttime_max_vital_lab.groupby("icustay_id")["charttime"].rank(ascending=0,method='dense') == 1]

In [9]:
# Find for which icustay_ids there exist at least WINDOW_LENGTH of data
icustay_ids_vital_lab_charttime_min_max = pd.concat([icustay_ids_charttime_max_vital_lab, icustay_ids_charttime_min_vital_lab], ignore_index=True)
time_window = timedelta(days=4, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=WINDOW_LENGTH, weeks=0)
is_time_diff_bigger_window_lab = icustay_ids_vital_lab_charttime_min_max.groupby(['icustay_id'])['charttime'].transform(lambda x: (x.max()-x.min())) >= time_window

icustay_ids_vital_lab_charttime_min_max_filtered = icustay_ids_vital_lab_charttime_min_max[is_time_diff_bigger_window_lab]
print("Unique icu stays in icustay_ids_vital_lab_charttime_min_max_filtered after filtering", icustay_ids_vital_lab_charttime_min_max_filtered['icustay_id'].nunique())

# Keep only icustay ids for which at least WINDOW_LENGTH of data exists
icustay_ids_time_filtered = pd.DataFrame(icustay_ids_vital_lab_charttime_min_max_filtered['icustay_id'].unique(), columns=['icustay_id'])
print("Unique icu stays in icustay_ids_time_filtered: ", icustay_ids_time_filtered['icustay_id'].nunique())

Unique icu stays in icustay_ids_vital_lab_charttime_min_max_filtered after filtering 8409
Unique icu stays in icustay_ids_time_filtered:  8409


In [10]:
filtered_icustay_ids = filtered_icustay_ids.merge(icustay_ids_time_filtered, on='icustay_id', how='inner').drop_duplicates()

In [11]:
demographics_filtered = data.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print("Number of ICU stays demographics: ", demographics_filtered['icustay_id'].nunique())

vital_filtered = pivoted_vital.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print("Number of ICU stays vitals: ", vital_filtered['icustay_id'].nunique())

lab_filtered = pivoted_lab.merge(filtered_icustay_ids, on='icustay_id', how='right').drop_duplicates()
print("Number of ICU stays labs: ", lab_filtered['icustay_id'].nunique())

Number of ICU stays demographics:  8409
Number of ICU stays vitals:  8409
Number of ICU stays labs:  8409


In [12]:
vital_filtered = vital_filtered.merge(lab_filtered[['icustay_id', 'charttime']], on=['icustay_id', 'charttime'], how='outer').drop_duplicates()
print("Number of ICU stays in lab_filtered: ", vital_filtered['icustay_id'].nunique())
lab_filtered = lab_filtered.merge(vital_filtered[['icustay_id', 'charttime']], on=['icustay_id', 'charttime'], how='outer').drop_duplicates()
print("Number of ICU stays in lab_filtered: ", lab_filtered['icustay_id'].nunique())

Number of ICU stays in lab_filtered:  8409
Number of ICU stays in lab_filtered:  8409


In [13]:
vital_resampled = vital_filtered.copy()

# Resample from the end of the time series (how="last")
vital_resampled = vital_resampled.assign(charttime=vital_resampled.charttime.dt.round('H'))

# Resample from the beginning of the time series
vital_resampled = vital_resampled.set_index('charttime').groupby('icustay_id').resample('1H', origin="start").median().drop(['icustay_id'], axis = 1).reset_index()

# Forward and backwards fill (use lambda function instead of directly applying it to groupby otherwise results from one group are carreid forward to another group...BAD)
# Fill NaNs (-1)
vital_col = vital_resampled.columns.drop(['icustay_id', 'charttime'])
vital_resampled = vital_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[vital_col].transform(lambda x: x.ffill().bfill()).fillna(value=vital_resampled[['icustay_id', 'charttime', 'heartrate', 'sysbp', 'diasbp', 'meanbp','resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity','pedaledema', 'appetite_median']].median()).reset_index()


  vital_resampled = vital_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[vital_col].transform(lambda x: x.ffill().bfill()).fillna(value=vital_resampled[['icustay_id', 'charttime', 'heartrate', 'sysbp', 'diasbp', 'meanbp','resprate', 'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity','pedaledema', 'appetite_median']].median()).reset_index()


In [14]:
lab_resampled = lab_filtered.copy()
# Cut out minutes and hours, so that the resampling of the 8h takes the same time span as the 1h samples (for vitals)
lab_resampled = lab_resampled.assign(charttime=lab_resampled.charttime.dt.round('H'))
# Resample from the end of the time series 
#lab_resampled = lab_resampled.set_index('charttime').groupby('icustay_id').resample('8h', origin="end").median().drop(['icustay_id'], axis = 1).reset_index()
lab_resampled = lab_resampled.set_index('charttime').groupby('icustay_id').resample('8h', origin="start").median().drop(['icustay_id'], axis = 1).reset_index()

# Forward and backwards fill (use transform instead of direct groupby otherwise results from one group are carreid forward to another group...BAD)
# Fill NaNs (-1 or 0 or mean!?)
lab_col = lab_resampled.columns.drop(['icustay_id', 'charttime'])
lab_resampled = lab_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[lab_col].transform(lambda x: x.ffill().bfill()).fillna(value=lab_resampled[['icustay_id', 'subject_id', 'charttime', 'aniongap', 'albumin', 'bands','bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose','hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt','inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria']].median()).reset_index()

print(lab_resampled.isnull().sum().sum())

  lab_resampled = lab_resampled.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[lab_col].transform(lambda x: x.ffill().bfill()).fillna(value=lab_resampled[['icustay_id', 'subject_id', 'charttime', 'aniongap', 'albumin', 'bands','bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose','hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt','inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria']].median()).reset_index()


730


In [15]:
delta_t_data = timedelta(days=0, seconds=0, microseconds=0, milliseconds=0, minutes=0, hours=WINDOW_LENGTH, weeks=0)
demographics_windowed = demographics_filtered.copy()
demographics_windowed['predtime'] = demographics_windowed.intime + delta_t_data
demographics_windowed['delta_t_pred'] = demographics_windowed.outtime - demographics_windowed.predtime

demographics_windowed[['subject_id', 'icustay_id', 'intime', 'predtime', 'delta_t_pred']].head(5)

Unnamed: 0,subject_id,icustay_id,intime,predtime,delta_t_pred
0,334,214236,2136-01-16 10:56:48,2136-01-20 10:56:48,10 days 07:21:18
1,2005,285731,2163-06-23 11:28:06,2163-06-27 11:28:06,5 days 08:45:56
2,12174,284866,2118-10-30 16:48:57,2118-11-03 16:48:57,13 days 00:44:12
3,13535,205010,2196-10-10 22:03:14,2196-10-14 22:03:14,88 days 19:52:36
4,21824,241223,2107-07-07 20:58:00,2107-07-11 20:58:00,31 days 15:33:00


In [16]:
cut_icustay_ids = pd.DataFrame(demographics_windowed['icustay_id'].unique(), columns=['icustay_id'])
print("Number of ICU stays: ", cut_icustay_ids['icustay_id'].count())

vitals_cut = vital_resampled.merge(cut_icustay_ids, on='icustay_id', how='right')
print("Number of ICU stays in vitals_cut: ", vitals_cut['icustay_id'].nunique())

labs_cut = lab_resampled.merge(cut_icustay_ids, on='icustay_id', how='right')
print("Number of ICU stays in labs_cut: ", labs_cut['icustay_id'].nunique())


Number of ICU stays:  8409
Number of ICU stays in vitals_cut:  8409
Number of ICU stays in labs_cut:  8409


In [17]:
vitals_windowed = vital_resampled.merge(demographics_windowed[['icustay_id', 'predtime', 'delta_t_pred']], on='icustay_id', how='right')
vitals_windowed = vitals_windowed[vitals_windowed.charttime < vitals_windowed.predtime]
print("Number of ICU stays in vitals_windowed: ", vitals_windowed['icustay_id'].nunique())

labs_windowed = lab_resampled.merge(demographics_windowed[['icustay_id', 'predtime', 'delta_t_pred']], on='icustay_id', how='right')
labs_windowed = labs_windowed[labs_windowed.charttime < labs_windowed.predtime]
print("Number of ICU stays in labs_windowed: ", labs_windowed['icustay_id'].nunique())

windowed_icustay_ids = pd.DataFrame(pd.concat([vitals_windowed['icustay_id'], labs_windowed['icustay_id']]).unique(), columns=['icustay_id'])
demographics_windowed = demographics_windowed.merge(windowed_icustay_ids, on='icustay_id', how='right')

Number of ICU stays in vitals_windowed:  8405
Number of ICU stays in labs_windowed:  8405


In [18]:
vitals_windowed['ckd'] = vitals_windowed['icustay_id'].map(demographics_windowed.set_index('icustay_id')['ckd'])

In [19]:
vitals_windowed =vitals_windowed.set_index(['icustay_id', 'charttime']).groupby('icustay_id')[vital_col].transform(lambda x: x.ffill().bfill()).fillna(-1).reset_index()

In [20]:
labs_windowed['ckd'] = labs_windowed['icustay_id'].map(demographics_windowed.set_index('icustay_id')['ckd'])

In [21]:
print("Number of ICU stays demographics: ", demographics_windowed['icustay_id'].nunique())
print("Number of CKD demographics:")
dd = demographics_windowed[['icustay_id','ckd']].drop_duplicates(subset=['icustay_id'])
print(dd['ckd'].value_counts())

print("Number of ICU stays vitals: ", vitals_windowed['icustay_id'].nunique())
print("Number of CKD vitals:")
dd = vitals_windowed[['icustay_id','ckd']].drop_duplicates(subset=['icustay_id'])
print(dd['ckd'].value_counts())

print("Number of ICU stays labs: ", labs_windowed['icustay_id'].nunique())
print("Number of CKD labs:")
dd = labs_windowed[['icustay_id','ckd']].drop_duplicates(subset=['icustay_id'])
print(dd['ckd'].value_counts())

Number of ICU stays demographics:  8405
Number of CKD demographics:
0    7868
1     537
Name: ckd, dtype: int64
Number of ICU stays vitals:  8405
Number of CKD vitals:
0    7868
1     537
Name: ckd, dtype: int64
Number of ICU stays labs:  8405
Number of CKD labs:
0    7868
1     537
Name: ckd, dtype: int64


In [22]:
def aggregate_dataframe(df, groupby_key, columns_to_aggregate):
    df = df.replace(-1, np.nan)
    result = df.groupby(groupby_key)[columns_to_aggregate].mean().reset_index()    
    return result

In [23]:
columns_to_merge = ['icustay_id', 'ckd','ethnicity_grouped']
df_cols_vitals = ['heartrate', 'sysbp','diasbp','meanbp','resprate','tempc','spo2','specificgravity','pedaledema','appetite_median']
df_agg_vitals = aggregate_dataframe(vitals_windowed, 'icustay_id', df_cols_vitals)
df_agg_vitals = df_agg_vitals.merge(demographics_windowed[columns_to_merge], on='icustay_id', how='inner')
df_agg_vitals['ckd_ethnicity'] = df_agg_vitals['ckd'].astype(str).str.cat(df_agg_vitals['ethnicity_grouped'].astype(str))

df_cols_labs = ['albumin','bacteria','glucose','bun','creatinine','sodium','potassium','hemoglobin','wbc','hematocrit','platelet','ptt']
df_agg_labs = aggregate_dataframe(labs_windowed, 'icustay_id', df_cols_labs)
df_agg_labs = df_agg_labs.merge(demographics_windowed[columns_to_merge], on='icustay_id', how='inner')
df_agg_labs['ckd_ethnicity'] = df_agg_labs['ckd'].astype(str).str.cat(df_agg_labs['ethnicity_grouped'].astype(str))

print("Vitals unique icustay id: ",len(df_agg_vitals['icustay_id'].unique()),"\nLabs unique icustay id: ",len(df_agg_labs['icustay_id'].unique()),"\nDemographics unique icustay id: ",len(demographics_windowed['icustay_id'].unique()))

Vitals unique icustay id:  8405 
Labs unique icustay id:  8405 
Demographics unique icustay id:  8405


In [24]:
df_agg_vitals_new=df_agg_vitals.drop(['ckd','ethnicity_grouped','ckd_ethnicity','pedaledema'],axis=1)
df_agg_labs_new=df_agg_labs.drop(['ckd','ethnicity_grouped','ckd_ethnicity'],axis=1)

In [25]:
merged_table_org = df_agg_labs_new.merge(df_agg_vitals_new, on='icustay_id', how='inner').merge(demographics_windowed, on='icustay_id', how='inner')

In [26]:
merged_table =merged_table_org.copy()

Table names : 
- demographics_windowed
- labs_windowed
- vitals_windowed
- df_agg_vitals
- df_agg_labs
- merged_table_org

In [27]:
# Calculate the difference between max and min charttime for labs_windowed
labs_diff = labs_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min())

# Calculate the difference between max and min charttime for vitals_windowed
vitals_diff = vitals_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min())

# Filter the icustay_id where the difference is grater than or equal to Window_length in both labs and vitals
filtered_icustay_ids = labs_diff[(labs_diff == pd.Timedelta(hours=WINDOW_LENGTH)) & (vitals_diff == pd.Timedelta(hours=WINDOW_LENGTH))].index.tolist()

# Print the length of icustay_id
print(len(filtered_icustay_ids))

3038


In [28]:
merged_table_filtered= merged_table[merged_table['icustay_id'].isin(filtered_icustay_ids)]

In [29]:
merged_table_filtered.columns

Index(['icustay_id', 'albumin', 'bacteria', 'glucose', 'bun', 'creatinine',
       'sodium', 'potassium', 'hemoglobin', 'wbc', 'hematocrit', 'platelet',
       'ptt', 'heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 'tempc',
       'spo2', 'specificgravity', 'appetite_median', 'subject_id', 'hadm_id',
       'gender', 'dod', 'admittime', 'dischtime', 'los_hospital',
       'admission_age', 'ethnicity', 'ethnicity_grouped',
       'hospital_expire_flag', 'hospstay_seq', 'first_hosp_stay', 'intime',
       'outtime', 'los_icu', 'icustay_seq', 'first_icu_stay_current_hosp',
       'first_icu_stay_patient', 'first_careunit', 'deathtime_icu',
       'label_death_icu', 'label_cor_art', 'diabetes_mellitus', 'ckd',
       'anemia_flag', 'predtime', 'delta_t_pred'],
      dtype='object')

In [30]:
merged_table_filtered=merged_table_filtered.drop(['subject_id','hadm_id','dod','admittime', 'dischtime','los_hospital','ethnicity','hospital_expire_flag','hospstay_seq', 'first_hosp_stay', 'intime','outtime', 'los_icu', 'icustay_seq', 'first_icu_stay_current_hosp','first_icu_stay_patient', 'first_careunit', 'deathtime_icu','label_death_icu', 'predtime', 'delta_t_pred'],axis=1)

In [31]:
age_ranges = [0, 9, 19, 29, 39, 49, 59, 69, 79, 89, 400]
age_labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90+']
merged_table_filtered['age_group'] = pd.cut(merged_table_filtered['admission_age'], bins=age_ranges, labels=age_labels, right=False)
merged_table_filtered=merged_table_filtered.drop('admission_age',axis=1)

In [32]:
def evaluationCV(classifier,X, y):    
    cv_scores_pr = cross_val_score(classifier, X, y, cv=5, scoring='precision')    
    cv_scores_rc = cross_val_score(classifier, X, y, cv=5, scoring='recall')    
    cv_scores_f1 = cross_val_score(classifier,X, y, cv=5, scoring='f1')
    cv_scores_ac = cross_val_score(classifier, X, y, cv=5, scoring='accuracy') 
    
    print("Cross-validation scores Precision    :", cv_scores_pr)    
    print("Cross-validation scores Recall       :", cv_scores_rc)
    print("Cross-validation scores F1           :", cv_scores_f1)
    print("Cross-validation scores Accuracy     :", cv_scores_ac)
    
    print("Mean cross-validation score Precision:", np.mean(cv_scores_pr))
    print("Mean cross-validation score Recall   :", np.mean(cv_scores_rc))
    print("Mean cross-validation score F1       :", np.mean(cv_scores_f1))
    print("Mean cross-validation score Accuracy :", np.mean(cv_scores_ac))

In [33]:
def evaluationTest(classifier,X, y):  
    y_pred = classifier.predict(X)
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    accuracy = accuracy_score(y, y_pred)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Accuracy:", accuracy)
    return pd.DataFrame(y_pred)    

In [34]:
def metricsReport(y,y_pred):
    precision = precision_score(y, y_pred)
    recall = recall_score(y, y_pred)
    f1 = f1_score(y, y_pred)
    accuracy = accuracy_score(y, y_pred)
    print("Precision:", precision)
    print("Recall:", recall)
    print("F1 Score:", f1)
    print("Accuracy:", accuracy)
    return precision, recall , f1, accuracy

#### Data & Class separation

In [35]:
X = merged_table_filtered.drop(['ckd','icustay_id'],axis=1)
y = merged_table_filtered['ckd']

#### Trial 1:  Random forest for static + aggregated timeseries

In [36]:
X_onehot = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.20, stratify=y, random_state=42)

In [37]:
y_train.value_counts()

0    2305
1     125
Name: ckd, dtype: int64

In [38]:
y_test.value_counts()

0    577
1     31
Name: ckd, dtype: int64

In [39]:
from scipy.stats import randint
from sklearn.model_selection import RandomizedSearchCV

In [40]:
param_grid_rcv = {
    'n_estimators': randint(50, 500),
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : randint(1, 10),
    'criterion' :['gini', 'entropy']
}

In [41]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 6],
    'max_features': ['sqrt', 'log2','auto']
}

In [42]:
#runder = RandomUnderSampler(random_state=42)
#X_resampled, y_resampled = runder.fit_resample(X_train, y_train)
rf_merged = RandomForestClassifier(random_state=42)

#grid_search = RandomizedSearchCV(estimator=rf_merged, param_distributions=param_grid_rcv, n_iter=100, cv=5, random_state=42)

#grid_search = GridSearchCV(rf_merged, param_grid, cv=5)
#grid_search.fit(X_train, y_train)
#grid_search.best_params_

In [43]:
evaluationCV(rf_merged,X_train, y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cross-validation scores Precision    : [0. 0. 0. 0. 0.]
Cross-validation scores Recall       : [0. 0. 0. 0. 0.]
Cross-validation scores F1           : [0. 0. 0. 0. 0.]
Cross-validation scores Accuracy     : [0.94855967 0.94444444 0.94855967 0.94855967 0.94650206]
Mean cross-validation score Precision: 0.0
Mean cross-validation score Recall   : 0.0
Mean cross-validation score F1       : 0.0
Mean cross-validation score Accuracy : 0.9473251028806585


In [44]:
rf_merged_2 = RandomForestClassifier(random_state=42,
                                     max_depth = None, 
                                     max_features = 'sqrt',
                                     min_samples_leaf = 1,
                                     min_samples_split = 1,
                                     n_estimators= 100)

rf_merged_2.fit(X_train, y_train)

In [45]:
evaluationCV(rf_merged_2,X_train, y_train)

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Cross-validation scores Precision    : [0. 0. 0. 0. 0.]
Cross-validation scores Recall       : [0. 0. 0. 0. 0.]
Cross-validation scores F1           : [0. 0. 0. 0. 0.]
Cross-validation scores Accuracy     : [0.94855967 0.94444444 0.94855967 0.94855967 0.94650206]
Mean cross-validation score Precision: 0.0
Mean cross-validation score Recall   : 0.0
Mean cross-validation score F1       : 0.0
Mean cross-validation score Accuracy : 0.9473251028806585


In [46]:
y_pred_merged_2 = evaluationTest(rf_merged_2,X_test, y_test)

Precision: 0.5
Recall: 0.03225806451612903
F1 Score: 0.06060606060606061
Accuracy: 0.9490131578947368


In [47]:
y_pred_merged_2.value_counts()

0    606
1      2
dtype: int64

#### Trial 2 - RandomUnderSampler

In [48]:
X_top =X[['creatinine', 'specificgravity', 'heartrate', 'bun', 'spo2', 'tempc', 'platelet', 'diasbp', 'bacteria', 'meanbp']]
X_onehot = pd.get_dummies(X_top)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.20, stratify=y, random_state=42)

undersampler = RandomUnderSampler()
#undersampler = RandomUnderSampler(sampling_strategy={0: 250})
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

rf_ros = RandomForestClassifier(random_state=42,
                                max_depth = None, 
                                max_features = 'sqrt',
                                min_samples_leaf = 1,
                                min_samples_split = 1,
                                n_estimators= 100)

rf_ros.fit(X_resampled, y_resampled)
evaluationCV(rf_ros,X_resampled, y_resampled)

rf_ros.fit(X_resampled, y_resampled)
y_pred_ros = evaluationTest(rf_ros,X_test, y_test)
y_pred_ros.value_counts()

Cross-validation scores Precision    : [0.72413793 0.85185185 0.78571429 0.84       0.7826087 ]
Cross-validation scores Recall       : [0.84 0.92 0.88 0.84 0.72]
Cross-validation scores F1           : [0.77777778 0.88461538 0.83018868 0.84       0.75      ]
Cross-validation scores Accuracy     : [0.76 0.88 0.82 0.84 0.76]
Mean cross-validation score Precision: 0.7968625528505588
Mean cross-validation score Recall   : 0.8400000000000001
Mean cross-validation score F1       : 0.8165163683276889
Mean cross-validation score Accuracy : 0.8119999999999999
Precision: 0.14285714285714285
Recall: 0.967741935483871
F1 Score: 0.24896265560165973
Accuracy: 0.7023026315789473


0    398
1    210
dtype: int64

In [49]:
cm = confusion_matrix(y_test, y_pred_ros)
print(cm)

[[397 180]
 [  1  30]]


In [50]:
X_onehot = pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.20, stratify=y, random_state=42)

undersampler = RandomUnderSampler()
#undersampler = RandomUnderSampler(sampling_strategy={0: 250})
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

rf_ros = RandomForestClassifier(random_state=42,
                                max_depth = None, 
                                max_features = 'sqrt',
                                min_samples_leaf = 1,
                                min_samples_split = 1,
                                n_estimators= 100)

rf_ros.fit(X_resampled, y_resampled)
evaluationCV(rf_ros,X_resampled, y_resampled)

rf_ros.fit(X_resampled, y_resampled)
y_pred_ros = evaluationTest(rf_ros,X_test, y_test)
y_pred_ros.value_counts()

Cross-validation scores Precision    : [0.72727273 0.71875    0.9047619  0.59459459 0.64285714]
Cross-validation scores Recall       : [0.96 0.92 0.76 0.88 0.72]
Cross-validation scores F1           : [0.82758621 0.80701754 0.82608696 0.70967742 0.67924528]
Cross-validation scores Accuracy     : [0.8  0.78 0.84 0.64 0.66]
Mean cross-validation score Precision: 0.7176472738972739
Mean cross-validation score Recall   : 0.8479999999999999
Mean cross-validation score F1       : 0.7699226819303293
Mean cross-validation score Accuracy : 0.744
Precision: 0.1477832512315271
Recall: 0.967741935483871
F1 Score: 0.25641025641025644
Accuracy: 0.7138157894736842


0    405
1    203
dtype: int64

In [120]:
def cross_val_with_sampling(clf, X,y, resampling = None):
    from sklearn.model_selection import StratifiedKFold
    
    X = X.values
    y = y.values
    
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    for train_index, test_index in kfold.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        if resampling == None:
            print("No resampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
        elif (resampling.lower() == 'under'):
            sampler = RandomUnderSampler(random_state=42)
            X_train, y_train = sampler.fit_resample(X_train, y_train)
            print("Under sampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
        elif (resamplg.lower() == 'over'):
            sampler = SMOTE(random_state=42)
            X_train, y_train = sampler.fit_resample(X_train, y_train)
            print("Over sampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
        
        clf.fit(X_train, y_train)
        y_pred_validation = evaluationTest(clf, X_test, y_test)
    
    return clf, y_pred_validation

In [122]:
X = merged_table_filtered.drop(['ckd','icustay_id'],axis=1)
y = merged_table_filtered['ckd']

In [123]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, stratify=y, random_state=42)
X_onehot_train = pd.get_dummies(X_train)
X_onehot_test = pd.get_dummies(X_test)

rf_1, y_val = cross_val_with_sampling(rf_ros, X_onehot_train,y_train, resampling = "under")
print("----------------------TEST-------------------------------------")
evaluationTest(rf_1, X_onehot_test, y_test)

Under sampling: Train: 200 Test: 486
Precision: 0.11392405063291139
Recall: 0.72
F1 Score: 0.19672131147540986
Accuracy: 0.6975308641975309
Under sampling: Train: 200 Test: 486
Precision: 0.13924050632911392
Recall: 0.88
F1 Score: 0.24043715846994534
Accuracy: 0.7139917695473251
Under sampling: Train: 200 Test: 486
Precision: 0.14838709677419354
Recall: 0.92
F1 Score: 0.2555555555555556
Accuracy: 0.7242798353909465
Under sampling: Train: 200 Test: 486
Precision: 0.11864406779661017
Recall: 0.84
F1 Score: 0.2079207920792079
Accuracy: 0.6707818930041153
Under sampling: Train: 200 Test: 486
Precision: 0.15328467153284672
Recall: 0.84
F1 Score: 0.25925925925925924
Accuracy: 0.7530864197530864
----------------------TEST-------------------------------------
Precision: 0.15025906735751296
Recall: 0.9354838709677419
F1 Score: 0.25892857142857145
Accuracy: 0.7269736842105263


Unnamed: 0,0
0,0
1,0
2,1
3,0
4,1
...,...
603,0
604,1
605,0
606,0


#### Trial 3: BRF

In [59]:
X_onehot = pd.get_dummies(X_top)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.20, stratify=y, random_state=42)

ros = SMOTE(random_state=42)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

brf = BalancedRandomForestClassifier(n_estimators=100)
brf.fit(X_resampled, y_resampled)

evaluationCV(brf,X_resampled, y_resampled)

y_pred_brf = evaluationTest(brf,X_test, y_test)
y_pred_brf.value_counts()









































































































































































































































Cross-validation scores Precision    : [0.9287257  0.95378151 0.94802495 0.94045175 0.93852459]
Cross-validation scores Recall       : [0.94360087 0.98698482 0.98698482 0.98915401 0.99132321]
Cross-validation scores F1           : [0.93290043 0.9627263  0.96375267 0.96610169 0.96109359]
Cross-validation scores Accuracy     : [0.93492408 0.96637744 0.96637744 0.96203905 0.96420824]
Mean cross-validation score Precision: 0.9419016996235292
Mean cross-validation score Recall   : 0.9796095444685466
Mean cross-validation score F1       : 0.9573149366678987
Mean cross-validation score Accuracy : 0.9587852494577007
Precision: 0.25
Recall: 0.3225806451612903
F1 Score: 0.28169014084507044
Accuracy: 0.9161184210526315




0    568
1     40
dtype: int64

In [60]:
X_onehot = pd.get_dummies(X_top)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.20, stratify=y, random_state=42)

undersampler = RandomUnderSampler()
X_resampled, y_resampled = undersampler.fit_resample(X_train, y_train)

brf = BalancedRandomForestClassifier(n_estimators=100)
brf.fit(X_resampled, y_resampled)

evaluationCV(brf,X_resampled, y_resampled)

y_pred_brf = evaluationTest(brf,X_test, y_test)
y_pred_brf.value_counts()



































































Cross-validation scores Precision    : [0.77777778 0.73913043 0.74074074 0.67741935 0.82608696]
Cross-validation scores Recall       : [0.8  0.8  0.8  0.76 0.8 ]
Cross-validation scores F1           : [0.76       0.76       0.80769231 0.76363636 0.7826087 ]
Cross-validation scores Accuracy     : [0.76 0.8  0.78 0.74 0.76]
Mean cross-validation score Precision: 0.7522310529323153
Mean cross-validation score Recall   : 0.792
Mean cross-validation score F1       : 0.774787473396169
Mean cross-validation score Accuracy : 0.768
Precision: 0.15846994535519127
Recall: 0.9354838709677419
F1 Score: 0.27102803738317754
Accuracy: 0.743421052631579




0    425
1    183
dtype: int64

# 2. Experiments

demographics_windowed

merged_table

--------------------------------
static_demo_comorb

labs_windowed

vitals_windowed

In [61]:
age_ranges = [0, 9, 19, 29, 39, 49, 59, 69, 79, 89, 400]
age_labels = ['0-9', '10-19', '20-29', '30-39', '40-49', '50-59', '60-69', '70-79', '80-89', '90+']
demographics_windowed['age_group'] = pd.cut(merged_table['admission_age'], bins=age_ranges, labels=age_labels, right=False)

In [62]:
static_demo_comorb = demographics_windowed[['icustay_id','gender', 'ethnicity_grouped', 'label_cor_art', 'diabetes_mellitus', 'anemia_flag', 'age_group', 'ckd']]

In [63]:
# Get the unique icustay_id values from each DataFrame
icustay_id_df1 = set(static_demo_comorb['icustay_id'])
icustay_id_df2 = set(labs_windowed['icustay_id'])
icustay_id_df3 = set(vitals_windowed['icustay_id'])

# Check for missing icustay_id values
missing_from_df1 = icustay_id_df2.union(icustay_id_df3) - icustay_id_df1
missing_from_df2 = icustay_id_df1.union(icustay_id_df3) - icustay_id_df2
missing_from_df3 = icustay_id_df1.union(icustay_id_df2) - icustay_id_df3

# Print the missing icustay_id values
if missing_from_df1:
    print(f"Icustay_id missing from static_demo_comorb: {missing_from_df1}")
else:
    print("No icustay_id missing from static_demo_comorb")

if missing_from_df2:
    print(f"Icustay_id missing from labs_windowed: {missing_from_df2}")
else:
    print("No icustay_id missing from labs_windowed")

if missing_from_df3:
    print(f"Icustay_id missing from vitals_windowed: {missing_from_df3}")
else:
    print("No icustay_id missing from vitals_windowed")



No icustay_id missing from static_demo_comorb
No icustay_id missing from labs_windowed
No icustay_id missing from vitals_windowed


In [64]:
# Calculate the difference between max and min charttime for labs_windowed
labs_diff = labs_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min()).to_frame()
labs_diff.min()

charttime   2 days 16:00:00
dtype: timedelta64[ns]

In [65]:
# Calculate the difference between max and min charttime for vitals_windowed
vitals_diff = vitals_windowed.groupby('icustay_id')['charttime'].apply(lambda x: x.max() - x.min()).to_frame()
vitals_diff.min()

charttime   2 days 23:00:00
dtype: timedelta64[ns]

In [66]:
WINDOW_LENGTH_NEW = 96

In [67]:
filtered_icustay_ids = labs_diff[(labs_diff == pd.Timedelta(hours=WINDOW_LENGTH_NEW)) & (vitals_diff == pd.Timedelta(hours=WINDOW_LENGTH_NEW))].index.tolist()
print("Total : ",len(filtered_icustay_ids))
print(static_demo_comorb['ckd'].value_counts())

Total :  8405
0    7868
1     537
Name: ckd, dtype: int64


## 2.1 Random Forest - Comorbidity & Demographics

In [68]:
def RandomForestForMulti(X_train, X_test, y_train, y_test, resampling=None):        
    if resampling == None:
        print("No resampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
    elif resampling.lower() == 'under':
        sampler = RandomUnderSampler()
        X_train, y_train = sampler.fit_resample(X_train, y_train)
        print("under sampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
    elif resampling.lower() == 'over':
        sampler = SMOTE()
        X_train, y_train = sampler.fit_resample(X_train, y_train)
        print("over sampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
    
    #rf_static_demo_comorb = RandomForestClassifier(n_estimators=300, random_state=42)
        
    rf_static_demo_comorb_best = RandomForestClassifier(n_estimators=200, 
                                                        max_depth=None,
                                                        min_samples_leaf=2,
                                                        min_samples_split=2,
                                                        max_features='sqrt',
                                                        random_state=42)
    
    rf_static_demo_comorb_best.fit(X_train, y_train)
    
    evaluationCV(rf_static_demo_comorb_best,X_train, y_train)
    
    y_pred = evaluationTest(rf_static_demo_comorb_best,X_test, y_test)
    
    cm  = confusion_matrix(y_test, y_pred)    
    print(cm)
    
    f1 = f1_score(y_test, y_pred)
    weight = np.log(f1/(1-f1))
    
    proba = rf_static_demo_comorb_best.predict_proba(X_test)
    
    return rf_static_demo_comorb_best, weight, proba    

In [69]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2']
}

X=static_demo_comorb.drop(['ckd','icustay_id'],axis=1)
y=static_demo_comorb['ckd']

X_onehot=pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.2,stratify = y, random_state=42)
rf_default, _, _ = RandomForestForMulti(X_train, X_test, y_train, y_test)

No resampling: Train: 6724 Test: 1681


  _warn_prf(average, modifier, msg_start, len(result))


Cross-validation scores Precision    : [0. 1. 1. 0. 1.]
Cross-validation scores Recall       : [0.         0.01162791 0.02325581 0.         0.02325581]
Cross-validation scores F1           : [0.         0.02298851 0.04545455 0.         0.04545455]
Cross-validation scores Accuracy     : [0.93457249 0.93680297 0.93754647 0.93605948 0.9375    ]
Mean cross-validation score Precision: 0.6
Mean cross-validation score Recall   : 0.011627906976744186
Mean cross-validation score F1       : 0.022779519331243465
Mean cross-validation score Accuracy : 0.9364962825278811
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Accuracy: 0.9363474122546104
[[1574    0]
 [ 107    0]]


  _warn_prf(average, modifier, msg_start, len(result))
  weight = np.log(f1/(1-f1))


In [70]:
X=static_demo_comorb.drop(['ckd','icustay_id'],axis=1)
y=static_demo_comorb['ckd']

X_onehot=pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.2,stratify = y, random_state=42)
rf_under_sampled, _, _ = RandomForestForMulti(X_train, X_test, y_train, y_test, resampling='under')

under sampling: Train: 860 Test: 1681
Cross-validation scores Precision    : [0.62318841 0.62121212 0.59782609 0.65277778 0.59550562]
Cross-validation scores Recall       : [0.5        0.47674419 0.63953488 0.54651163 0.61627907]
Cross-validation scores F1           : [0.55483871 0.53947368 0.61797753 0.59493671 0.60571429]
Cross-validation scores Accuracy     : [0.59883721 0.59302326 0.60465116 0.62790698 0.59883721]
Mean cross-validation score Precision: 0.61810200194421
Mean cross-validation score Recall   : 0.555813953488372
Mean cross-validation score F1       : 0.5825881833105756
Mean cross-validation score Accuracy : 0.6046511627906976
Precision: 0.12877263581488935
Recall: 0.5981308411214953
F1 Score: 0.2119205298013245
Accuracy: 0.7168352171326592
[[1141  433]
 [  43   64]]


In [71]:
X=static_demo_comorb.drop(['ckd','icustay_id'],axis=1)
y=static_demo_comorb['ckd']

X_onehot=pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.2,stratify = y, random_state=42)
rf_over_sampled, _, _ = RandomForestForMulti(X_train, X_test, y_train, y_test, resampling='over')

over sampling: Train: 12588 Test: 1681
Cross-validation scores Precision    : [0.73049645 0.73369565 0.7173913  0.71241259 0.70955882]
Cross-validation scores Recall       : [0.65448769 0.64336775 0.62907069 0.64785374 0.61318507]
Cross-validation scores F1           : [0.69040637 0.68556919 0.67033432 0.67860117 0.6578611 ]
Cross-validation scores Accuracy     : [0.70651311 0.70492454 0.69062748 0.69328566 0.68096941]
Mean cross-validation score Precision: 0.7207109642728895
Mean cross-validation score Recall   : 0.6375929870907211
Mean cross-validation score F1       : 0.6765544290561557
Mean cross-validation score Accuracy : 0.695264039322125
Precision: 0.12060301507537688
Recall: 0.4485981308411215
F1 Score: 0.1900990099009901
Accuracy: 0.7566924449732302
[[1224  350]
 [  59   48]]


## 2.2 Time series - Data preparation

In [72]:
def print_unique_shape(grouped_data,feature_cols):
    previous_shape = []
    for _, group in grouped_data:
        group_values = group[feature_cols].values.T
        if group_values.shape not in previous_shape:        
            print(group_values.shape)
            previous_shape.append(group_values.shape)

In [73]:
def check_shape_in_grouped_df(data_grouped, feature_cols, icustay_id):
    for _, group in data_grouped:
        if(group['icustay_id'].values[0]==icustay_id):
            group_values = group[feature_cols].values.T
            return group_values.shape

In [74]:
def check_missing_and_extras(data_windowed,feature_cols, threshold):
    data_windowed_new = data_windowed.copy()
    df_counts = data_windowed_new.groupby('icustay_id').count()
    previous_shape = []
    
    icustay_ids_less_records = df_counts[df_counts['charttime'] < threshold].index
    icustay_ids_more_records = df_counts[df_counts['charttime'] > threshold].index
    icustay_ids_correct_records = df_counts[df_counts['charttime'] == threshold].index
     
    print("len(icustay_ids_fewer_records)",len(icustay_ids_less_records))
    print("len(icustay_ids_more_records)",len(icustay_ids_more_records))  
    print("len(icustay_ids_correct_records)",len(icustay_ids_correct_records))
        
    for icustay_id in icustay_ids_more_records:
        df_grouped = data_windowed_new[data_windowed_new['icustay_id'] == icustay_id]
        if df_grouped.shape not in previous_shape:        
            print(f"There are records with more than {threshold} readings : {df_grouped.shape}")
            previous_shape.append(df_grouped.shape)
        
        # Check if the time span is more than 4 days
        if (df_grouped['charttime'].max() - df_grouped['charttime'].min()).days > 4:
            print(f"icustay_id: {icustay_id} has a time span of more than 4 days.")
        
        # Check for duplicate records
        if df_grouped.duplicated().sum() > 0:
            print(f"icustay_id: {icustay_id} has {df_grouped.duplicated().sum()} duplicate records.")
            
    for icustay_id in icustay_ids_less_records:
        print(f"icustay_id: {icustay_id} has a time span less than {threshold} records")


In [75]:
def backward_forward_fill(data_windowed, time_interval, threshold, feature_cols):    
    data_windowed_new = data_windowed.copy()
    data_windowed_new['charttime'] = pd.to_datetime(data_windowed_new['charttime']) 
    data_windowed_new.sort_values(['icustay_id', 'charttime'])
    df_filled = data_windowed_new.groupby('icustay_id').apply(lambda group: group.bfill().ffill())
    return df_filled    

In [76]:
def create_threshold_records(df, time_interval, threshold, feature_cols):
    df_new = pd.DataFrame()
    for id, group in df.groupby('icustay_id'):
        if len(group) > threshold:
            group = group.head(threshold)
        elif len(group) < threshold:
            missing_rows_count = threshold - len(group)
            last_timestamp = group['charttime'].max()
            missing_rows_df = pd.DataFrame({
                'icustay_id': [id]*missing_rows_count,
                'charttime': pd.date_range(start=last_timestamp + pd.Timedelta(hours=time_interval), 
                                           periods=missing_rows_count, 
                                           freq=f'{time_interval}H'),
                'ckd': [group['ckd'].iloc[0]]*missing_rows_count
            })
            for col in feature_cols:
                missing_rows_df[col] = np.nan
            group = pd.concat([group, missing_rows_df])
        df_new = pd.concat([df_new, group])
    df_new.sort_values(['icustay_id', 'charttime'], inplace=True)
    df_new.reset_index(drop=True, inplace=True)
    return df_new

### 2.2.1 Labs

In [77]:
feature_labs= ['aniongap', 'albumin', 'bands',
       'bicarbonate', 'bilirubin', 'creatinine', 'chloride', 'glucose',
       'hematocrit', 'hemoglobin', 'lactate', 'platelet', 'potassium', 'ptt',
       'inr', 'pt', 'sodium', 'bun', 'wbc', 'bacteria']

In [78]:
labs_windowed_12 = create_threshold_records(labs_windowed, 8, 12, feature_labs)
labs_windowed_12 = backward_forward_fill(labs_windowed_12, 8, 12, feature_labs)

labs_windowed_12['charttime'] = pd.to_datetime(labs_windowed_12['charttime']) 
labs_windowed_12.sort_values(['icustay_id', 'charttime'])
labs_grouped = labs_windowed_12[['icustay_id'] + feature_labs + ['ckd']].groupby(['icustay_id'])

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_filled = data_windowed_new.groupby('icustay_id').apply(lambda group: group.bfill().ffill())


In [79]:
check_missing_and_extras(labs_windowed_12,feature_labs, 12)

len(icustay_ids_fewer_records) 0
len(icustay_ids_more_records) 0
len(icustay_ids_correct_records) 8405


In [80]:
print_unique_shape(labs_grouped,feature_labs)

(20, 12)


  for _, group in grouped_data:


### 2.2.2 Vitals

In [81]:
feature_vitals = ['heartrate', 'sysbp', 'diasbp', 'meanbp', 'resprate', 
                  'tempc', 'spo2', 'glucose', 'rbc', 'specificgravity', 'appetite_median']

vitals_windowed_96 = create_threshold_records(vitals_windowed, 1, 96, feature_vitals)
vitals_windowed_96 = backward_forward_fill(vitals_windowed_96, 1, 96, feature_vitals)

vitals_windowed_96['charttime'] = pd.to_datetime(vitals_windowed_96['charttime']) 
vitals_windowed_96.sort_values(['icustay_id', 'charttime'])
vitals_grouped = vitals_windowed_96[['icustay_id'] + feature_vitals + ['ckd']].groupby(['icustay_id'])

To preserve the previous behavior, use

	>>> .groupby(..., group_keys=False)


	>>> .groupby(..., group_keys=True)
  df_filled = data_windowed_new.groupby('icustay_id').apply(lambda group: group.bfill().ffill())


In [82]:
check_missing_and_extras(vitals_windowed_96,feature_vitals, 96)

len(icustay_ids_fewer_records) 0
len(icustay_ids_more_records) 0
len(icustay_ids_correct_records) 8405


In [83]:
print_unique_shape(vitals_grouped,feature_vitals)

(11, 96)


  for _, group in grouped_data:


## 2.3 Rocket - Time series model

In [84]:
print("labs_grouped['icustay_id'].nunique(): ",len(labs_grouped['icustay_id'].nunique()))
print("vitals_grouped['icustay_id'].nunique(): ",len(vitals_grouped['icustay_id'].nunique()))

labs_grouped['icustay_id'].nunique():  8405
vitals_grouped['icustay_id'].nunique():  8405


In [129]:
def cross_val_with_sampling(clf, X,y, resampling = None):
    from sklearn.model_selection import StratifiedKFold
    
    #X = X.values
    #y = y.values
    
    kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    for train_index, test_index in kfold.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        
        if resampling == None:
            print("No resampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
        elif (resampling.lower() == 'under'):
            sampler = RandomUnderSampler(random_state=42)
            X_train, y_train = sampler.fit_resample(X_train, y_train)
            print("Under sampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
        elif (resamplg.lower() == 'over'):
            sampler = SMOTE(random_state=42)
            X_train, y_train = sampler.fit_resample(X_train, y_train)
            print("Over sampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
        
        clf.fit(X_train, y_train)
        y_pred_validation = evaluationTest(clf, X_test, y_test)
    
    return clf, y_pred_validation


def RocketMultiCV(grouped_data,feature_columns, num_kernels=100, resampling=None, filtered_test_ids = None):  
    import warnings
    import logging
    warnings.filterwarnings('ignore')   
    logging.getLogger().setLevel(logging.ERROR)
    
    if filtered_test_ids is None:
        X = []
        y = []
    else:
        X_train = []
        y_train = []
        X_test = []
        y_test = []
        
    for icustay_id, group in grouped_data:
        group_values = group[feature_columns].values.T
        num_timestamps = group_values.shape[1]
        
        if filtered_test_ids is None:
            X.append(group_values)
            y.append(group['ckd'].iloc[0])
        else:
            if filtered_test_ids.isin([icustay_id]).any():               
                X_test.append(group_values)
                y_test.append(group['ckd'].iloc[0])
            else:
                X_train.append(group_values)
                y_train.append(group['ckd'].iloc[0])
    
    if filtered_test_ids is None:    
        X = np.array(X)
        y = np.array(y)
        n_samples, n_features, n_channels = X.shape
        X_2d = X.reshape((n_samples, n_features*n_channels))          
        X_train_2d, X_test_2d, y_train, y_test = train_test_split(X_2d, y, test_size=0.2, stratify=y)
    else:
        X_train = np.array(X_train)
        n_samples, n_features, n_channels = X_train.shape
        X_train_2d = X_train.reshape((n_samples, n_features*n_channels))        
        y_train = np.array(y_train)
        
        X_test = np.array(X_test)
        n_samples, n_features, n_channels = X_test.shape
        X_test_2d = X_test.reshape((n_samples, n_features*n_channels)) 
        y_test = np.array(y_test)
    
    if resampling == None:
        print("No resampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
    elif (resampling.lower() == 'under'):
        sampler = RandomUnderSampler(random_state=42)
        X_train_2d, y_train = sampler.fit_resample(X_train_2d, y_train)
        print("Under sampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
    elif (resampling.lower() == 'over'):
        sampler = SMOTE(random_state=42)
        X_train_2d, y_train = sampler.fit_resample(X_train_2d, y_train)
        print("Over sampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
    
    X_train = X_train_2d.reshape((X_train_2d.shape[0], n_features, n_channels))
    X_test = X_test_2d.reshape((X_test_2d.shape[0], n_features, n_channels))

    print("X_train shape: ",X_train.shape,"\ny_train shape: ",y_train.shape)
    
    rocket = Rocket(num_kernels, random_state=42)
    rocket.fit(X_train)
    
    X_train_transformed = rocket.transform(X_train)
    X_test_transformed = rocket.transform(X_test)

    # Reshape transformed data back into 2D for logistic regression
    X_train_transformed_2d = X_train_transformed.values.reshape((X_train_transformed.shape[0], -1))
    X_test_transformed_2d = X_test_transformed.values.reshape((X_test_transformed.shape[0], -1))
    
    clf = LogisticRegression(random_state=42, max_iter=1000)
    clf.fit(X_train_transformed_2d, y_train)
    
    #evaluationCV(clf,X_train_transformed_2d, y_train)
    
    rocketCv, y_val = cross_val_with_sampling(clf, X_train_transformed_2d,y_train, resampling = resampling)
    print("----------------------TEST-------------------------------------")
    evaluationTest(rocketCv, X_test_transformed_2d, y_test)
    print("----------------------TEST-------------------------------------")
        
    y_pred = evaluationTest(clf,X_test_transformed_2d, y_test)
    
    cm  = confusion_matrix(y_test, y_pred)
    print(cm)
    
    f1 = f1_score(y_test, y_pred)
    weight = np.log(f1/(1-f1))
    
    proba = clf.predict_proba(X_test_transformed_2d)
    
    return clf, weight, proba

In [132]:
clf_lab_default, _, _ = RocketMultiCV(labs_grouped,feature_labs, num_kernels = 50)

No resampling: Train: 6724 Test: 1681
X_train shape:  (6724, 20, 12) 
y_train shape:  (6724,)
No resampling: Train: 5379 Test: 1345
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Accuracy: 0.9360594795539033
No resampling: Train: 5379 Test: 1345
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Accuracy: 0.9345724907063196
No resampling: Train: 5379 Test: 1345
Precision: 1.0
Recall: 0.011627906976744186
F1 Score: 0.022988505747126436
Accuracy: 0.9368029739776952
No resampling: Train: 5379 Test: 1345
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Accuracy: 0.9360594795539033
No resampling: Train: 5380 Test: 1344
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Accuracy: 0.9352678571428571
----------------------TEST-------------------------------------
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Accuracy: 0.9363474122546104
----------------------TEST-------------------------------------
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Accuracy: 0.9363474122546104
[[1574    0]
 [ 107    0]]


In [85]:
def RocketMulti(grouped_data,feature_columns, num_kernels=100, resampling=None, filtered_test_ids = None):  
    import warnings
    import logging
    warnings.filterwarnings('ignore')   
    logging.getLogger().setLevel(logging.ERROR)
    
    if filtered_test_ids is None:
        X = []
        y = []
    else:
        X_train = []
        y_train = []
        X_test = []
        y_test = []
        
    for icustay_id, group in grouped_data:
        group_values = group[feature_columns].values.T
        num_timestamps = group_values.shape[1]
        
        if filtered_test_ids is None:
            X.append(group_values)
            y.append(group['ckd'].iloc[0])
        else:
            if filtered_test_ids.isin([icustay_id]).any():               
                X_test.append(group_values)
                y_test.append(group['ckd'].iloc[0])
            else:
                X_train.append(group_values)
                y_train.append(group['ckd'].iloc[0])
    
    if filtered_test_ids is None:    
        X = np.array(X)
        y = np.array(y)
        n_samples, n_features, n_channels = X.shape
        X_2d = X.reshape((n_samples, n_features*n_channels))          
        X_train_2d, X_test_2d, y_train, y_test = train_test_split(X_2d, y, test_size=0.2, stratify=y)
    else:
        X_train = np.array(X_train)
        n_samples, n_features, n_channels = X_train.shape
        X_train_2d = X_train.reshape((n_samples, n_features*n_channels))        
        y_train = np.array(y_train)
        
        X_test = np.array(X_test)
        n_samples, n_features, n_channels = X_test.shape
        X_test_2d = X_test.reshape((n_samples, n_features*n_channels)) 
        y_test = np.array(y_test)
    
    if resampling == None:
        print("No resampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
    elif (resampling.lower() == 'under'):
        sampler = RandomUnderSampler(random_state=42)
        X_train_2d, y_train = sampler.fit_resample(X_train_2d, y_train)
        print("Under sampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
    elif (resampling.lower() == 'over'):
        sampler = SMOTE(random_state=42)
        X_train_2d, y_train = sampler.fit_resample(X_train_2d, y_train)
        print("Over sampling: Train:", y_train.shape[0] , "Test:", y_test.shape[0])
    
    X_train = X_train_2d.reshape((X_train_2d.shape[0], n_features, n_channels))
    X_test = X_test_2d.reshape((X_test_2d.shape[0], n_features, n_channels))

    print("X_train shape: ",X_train.shape,"\ny_train shape: ",y_train.shape)
    
    rocket = Rocket(num_kernels, random_state=42)
    rocket.fit(X_train)
    
    X_train_transformed = rocket.transform(X_train)
    X_test_transformed = rocket.transform(X_test)

    # Reshape transformed data back into 2D for logistic regression
    X_train_transformed_2d = X_train_transformed.values.reshape((X_train_transformed.shape[0], -1))
    X_test_transformed_2d = X_test_transformed.values.reshape((X_test_transformed.shape[0], -1))
    
    clf = LogisticRegression(random_state=42, max_iter=1000)
    clf.fit(X_train_transformed_2d, y_train)
    
    evaluationCV(clf,X_train_transformed_2d, y_train)
        
    y_pred = evaluationTest(clf,X_test_transformed_2d, y_test)
    
    cm  = confusion_matrix(y_test, y_pred)
    print(cm)
    
    f1 = f1_score(y_test, y_pred)
    weight = np.log(f1/(1-f1))
    
    proba = clf.predict_proba(X_test_transformed_2d)
    
    return clf, weight, proba

### 2.3.1 Rocket for Labs

In [86]:
clf_lab_default, _, _ = RocketMulti(labs_grouped,feature_labs)

No resampling: Train: 6724 Test: 1681
X_train shape:  (6724, 20, 12) 
y_train shape:  (6724,)
Cross-validation scores Precision    : [0.25       0.6        0.         0.33333333 0.25      ]
Cross-validation scores Recall       : [0.01162791 0.03488372 0.         0.01162791 0.01162791]
Cross-validation scores F1           : [0.02222222 0.06593407 0.         0.02247191 0.02222222]
Cross-validation scores Accuracy     : [0.93457249 0.93680297 0.9330855  0.93531599 0.93452381]
Mean cross-validation score Precision: 0.2866666666666667
Mean cross-validation score Recall   : 0.013953488372093023
Mean cross-validation score F1       : 0.026570084098173984
Mean cross-validation score Accuracy : 0.9348601522393343
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Accuracy: 0.934562760261749
[[1571    3]
 [ 107    0]]


#### 2.3.1.1 Undersampling -labs

In [87]:
clf_lab_under_sampled_default, _, _ = RocketMulti(labs_grouped,feature_labs, resampling = "Under")

Under sampling: Train: 860 Test: 1681
X_train shape:  (860, 20, 12) 
y_train shape:  (860,)
Cross-validation scores Precision    : [0.6043956  0.63013699 0.60714286 0.56179775 0.58762887]
Cross-validation scores Recall       : [0.63953488 0.53488372 0.59302326 0.58139535 0.6627907 ]
Cross-validation scores F1           : [0.62146893 0.57861635 0.6        0.57142857 0.62295082]
Cross-validation scores Accuracy     : [0.61046512 0.61046512 0.60465116 0.56395349 0.59883721]
Mean cross-validation score Precision: 0.5982204133256404
Mean cross-validation score Recall   : 0.6023255813953489
Mean cross-validation score F1       : 0.5988929339711266
Mean cross-validation score Accuracy : 0.5976744186046512
Precision: 0.10941828254847645
Recall: 0.7383177570093458
F1 Score: 0.1905910735826297
Accuracy: 0.6008328375966686
[[931 643]
 [ 28  79]]


In [88]:
clf_lab_under_sampled_10, _, _ = RocketMulti(labs_grouped,feature_labs, num_kernels = 10, resampling = "Under")

Under sampling: Train: 860 Test: 1681
X_train shape:  (860, 20, 12) 
y_train shape:  (860,)
Cross-validation scores Precision    : [0.56565657 0.58333333 0.55752212 0.6        0.67045455]
Cross-validation scores Recall       : [0.65116279 0.65116279 0.73255814 0.62790698 0.68604651]
Cross-validation scores F1           : [0.60540541 0.61538462 0.63316583 0.61363636 0.67816092]
Cross-validation scores Accuracy     : [0.5755814  0.59302326 0.5755814  0.60465116 0.6744186 ]
Mean cross-validation score Precision: 0.5953933136676499
Mean cross-validation score Recall   : 0.6697674418604651
Mean cross-validation score F1       : 0.6291506266224686
Mean cross-validation score Accuracy : 0.6046511627906976
Precision: 0.08647990255785627
Recall: 0.6635514018691588
F1 Score: 0.15301724137931033
Accuracy: 0.5324211778703153
[[824 750]
 [ 36  71]]


In [89]:
clf_lab_under_sampled_25, _, _ = RocketMulti(labs_grouped,feature_labs, num_kernels = 25, resampling = "Under")

Under sampling: Train: 860 Test: 1681
X_train shape:  (860, 20, 12) 
y_train shape:  (860,)
Cross-validation scores Precision    : [0.60576923 0.63366337 0.66666667 0.60240964 0.62886598]
Cross-validation scores Recall       : [0.73255814 0.74418605 0.76744186 0.58139535 0.70930233]
Cross-validation scores F1           : [0.66315789 0.68449198 0.71351351 0.59171598 0.66666667]
Cross-validation scores Accuracy     : [0.62790698 0.65697674 0.69186047 0.59883721 0.64534884]
Mean cross-validation score Precision: 0.6274749763416383
Mean cross-validation score Recall   : 0.7069767441860465
Mean cross-validation score F1       : 0.6639092059716016
Mean cross-validation score Accuracy : 0.6441860465116278
Precision: 0.11142454160789844
Recall: 0.7383177570093458
F1 Score: 0.19362745098039214
Accuracy: 0.6085663295657346
[[944 630]
 [ 28  79]]


In [90]:
clf_lab_under_sampled_50, _, _ = RocketMulti(labs_grouped,feature_labs, num_kernels = 50, resampling = "Under")

Under sampling: Train: 860 Test: 1681
X_train shape:  (860, 20, 12) 
y_train shape:  (860,)
Cross-validation scores Precision    : [0.59375    0.69047619 0.62135922 0.60416667 0.7       ]
Cross-validation scores Recall       : [0.6627907  0.6744186  0.74418605 0.6744186  0.65116279]
Cross-validation scores F1           : [0.62637363 0.68235294 0.67724868 0.63736264 0.6746988 ]
Cross-validation scores Accuracy     : [0.60465116 0.68604651 0.64534884 0.61627907 0.68604651]
Mean cross-validation score Precision: 0.6419504160887655
Mean cross-validation score Recall   : 0.6813953488372093
Mean cross-validation score F1       : 0.6596073354684269
Mean cross-validation score Accuracy : 0.6476744186046511
Precision: 0.1087866108786611
Recall: 0.7289719626168224
F1 Score: 0.18932038834951456
Accuracy: 0.60261748958953
[[935 639]
 [ 29  78]]


In [91]:
clf_lab_under_sampled_75, _, _ = RocketMulti(labs_grouped,feature_labs, num_kernels = 75, resampling = "Under")

Under sampling: Train: 860 Test: 1681
X_train shape:  (860, 20, 12) 
y_train shape:  (860,)
Cross-validation scores Precision    : [0.62962963 0.55       0.65909091 0.60638298 0.68686869]
Cross-validation scores Recall       : [0.59302326 0.63953488 0.6744186  0.6627907  0.79069767]
Cross-validation scores F1           : [0.61077844 0.59139785 0.66666667 0.63333333 0.73513514]
Cross-validation scores Accuracy     : [0.62209302 0.55813953 0.6627907  0.61627907 0.71511628]
Mean cross-validation score Precision: 0.626394440862526
Mean cross-validation score Recall   : 0.6720930232558139
Mean cross-validation score F1       : 0.6474622855422545
Mean cross-validation score Accuracy : 0.6348837209302325
Precision: 0.09259259259259259
Recall: 0.6074766355140186
F1 Score: 0.16069221260815822
Accuracy: 0.5960737656157049
[[937 637]
 [ 42  65]]


#### 2.3.1.2 Oversampling - labs

In [92]:
clf_lab_over_sampled_default, _, _ = RocketMulti(labs_grouped,feature_labs, resampling = "Over")

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 20, 12) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.7640358  0.74965706 0.76491733 0.75982533 0.74824684]
Cross-validation scores Recall       : [0.74583002 0.86814932 0.84511517 0.82988871 0.84749801]
Cross-validation scores F1           : [0.75482315 0.80456386 0.80301887 0.79331307 0.79478585]
Cross-validation scores Accuracy     : [0.75774424 0.78911835 0.79269261 0.78386969 0.7810886 ]
Mean cross-validation score Precision: 0.7573364726864333
Mean cross-validation score Recall   : 0.8272962491997207
Mean cross-validation score F1       : 0.790100958690799
Mean cross-validation score Accuracy : 0.7809026972425472
Precision: 0.10309278350515463
Recall: 0.4672897196261682
F1 Score: 0.16891891891891891
Accuracy: 0.7073170731707317
[[1139  435]
 [  57   50]]


In [93]:
clf_lab_over_sampled_5, _, _ = RocketMulti(labs_grouped,feature_labs, num_kernels = 5, resampling = "Over")

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 20, 12) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.60505415 0.62127976 0.61466571 0.61505065 0.60938628]
Cross-validation scores Recall       : [0.66560763 0.66322478 0.67911041 0.67567568 0.67037331]
Cross-validation scores F1           : [0.63388805 0.64156742 0.64528302 0.64393939 0.63842663]
Cross-validation scores Accuracy     : [0.61556791 0.62946783 0.62668785 0.62653953 0.62018276]
Mean cross-validation score Precision: 0.6130873108943022
Mean cross-validation score Recall   : 0.6707983599167078
Mean cross-validation score F1       : 0.6406209019495432
Mean cross-validation score Accuracy : 0.6236891757179063
Precision: 0.09699453551912568
Recall: 0.6635514018691588
F1 Score: 0.16924910607866506
Accuracy: 0.5853658536585366
[[913 661]
 [ 36  71]]


In [94]:
clf_lab_over_sampled_10, _, _ = RocketMulti(labs_grouped,feature_labs, num_kernels = 10, resampling = "Over")

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 20, 12) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.63022284 0.64049587 0.62843729 0.62032463 0.6449623 ]
Cross-validation scores Recall       : [0.71882446 0.73868149 0.74424146 0.69872814 0.74741859]
Cross-validation scores F1           : [0.6716141  0.68609369 0.68145455 0.65719626 0.6924209 ]
Cross-validation scores Accuracy     : [0.64853058 0.66203336 0.65210485 0.63567739 0.66785856]
Mean cross-validation score Precision: 0.6328885863699018
Mean cross-validation score Recall   : 0.7295788289340596
Mean cross-validation score F1       : 0.6777558994811339
Mean cross-validation score Accuracy : 0.6532409480504768
Precision: 0.08677685950413223
Recall: 0.5887850467289719
F1 Score: 0.15126050420168066
Accuracy: 0.5794170136823319
[[911 663]
 [ 44  63]]


In [95]:
clf_lab_over_sampled_25, _, _ = RocketMulti(labs_grouped,feature_labs, num_kernels = 25, resampling = "Over")

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 20, 12) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.70704671 0.69978708 0.71974063 0.72036906 0.71324067]
Cross-validation scores Recall       : [0.70929309 0.78316124 0.79348689 0.80683625 0.77442415]
Cross-validation scores F1           : [0.70816812 0.73913043 0.75481677 0.76115486 0.74257426]
Cross-validation scores Accuracy     : [0.70770453 0.72359015 0.74225576 0.74692094 0.7314263 ]
Mean cross-validation score Precision: 0.7120368320581403
Mean cross-validation score Recall   : 0.773440323470693
Mean cross-validation score F1       : 0.7411688884193465
Mean cross-validation score Accuracy : 0.7303795351261936
Precision: 0.10017574692442882
Recall: 0.5327102803738317
F1 Score: 0.16863905325443784
Accuracy: 0.6656751933372992
[[1062  512]
 [  50   57]]


In [96]:
clf_lab_over_sampled_50, _, _ = RocketMulti(labs_grouped,feature_labs, num_kernels = 50, resampling = "Over")

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 20, 12) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.71796875 0.73180873 0.74893314 0.73449477 0.75161522]
Cross-validation scores Recall       : [0.7299444  0.83876092 0.83637808 0.83783784 0.83161239]
Cross-validation scores F1           : [0.72390705 0.78164323 0.7902439  0.78277014 0.78959276]
Cross-validation scores Accuracy     : [0.72160445 0.76568705 0.77799841 0.76758045 0.77830751]
Mean cross-validation score Precision: 0.7369641235899569
Mean cross-validation score Recall   : 0.8149067256295215
Mean cross-validation score F1       : 0.7736314169397398
Mean cross-validation score Accuracy : 0.7622355748976853
Precision: 0.10986964618249534
Recall: 0.5514018691588785
F1 Score: 0.18322981366459629
Accuracy: 0.6870910172516359
[[1096  478]
 [  48   59]]


In [97]:
clf_lab_over_sampled_75, _, _ = RocketMulti(labs_grouped,feature_labs, num_kernels = 75, resampling = "Over")

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 20, 12) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.72376543 0.75035765 0.74184397 0.75841874 0.73426573]
Cross-validation scores Recall       : [0.74503574 0.83320095 0.83081811 0.82352941 0.83399523]
Cross-validation scores F1           : [0.73424658 0.78961234 0.78381416 0.78963415 0.78095946]
Cross-validation scores Accuracy     : [0.73034154 0.77799841 0.77084988 0.7806913  0.76599126]
Mean cross-validation score Precision: 0.7417303065272061
Mean cross-validation score Recall   : 0.8133158902957529
Mean cross-validation score F1       : 0.7756533387052607
Mean cross-validation score Accuracy : 0.7651744783604927
Precision: 0.11180124223602485
Recall: 0.5046728971962616
F1 Score: 0.18305084745762712
Accuracy: 0.7132659131469363
[[1145  429]
 [  53   54]]


### 2.3.2 Rocket for Vitals

In [98]:
clf_vitals_default, _, _ = RocketMulti(vitals_grouped,feature_vitals)

No resampling: Train: 6724 Test: 1681
X_train shape:  (6724, 11, 96) 
y_train shape:  (6724,)
Cross-validation scores Precision    : [0. 0. 0. 0. 0.]
Cross-validation scores Recall       : [0. 0. 0. 0. 0.]
Cross-validation scores F1           : [0. 0. 0. 0. 0.]
Cross-validation scores Accuracy     : [0.93531599 0.93531599 0.93457249 0.93605948 0.9360119 ]
Mean cross-validation score Precision: 0.0
Mean cross-validation score Recall   : 0.0
Mean cross-validation score F1       : 0.0
Mean cross-validation score Accuracy : 0.9354551690564701
Precision: 0.0
Recall: 0.0
F1 Score: 0.0
Accuracy: 0.9363474122546104
[[1574    0]
 [ 107    0]]


#### 2.3.2.1 Undersampling - Vitals

In [99]:
clf_vitals_under_sampled_default, _, _ = RocketMulti(vitals_grouped,feature_vitals, resampling="Under")

Under sampling: Train: 860 Test: 1681
X_train shape:  (860, 11, 96) 
y_train shape:  (860,)
Cross-validation scores Precision    : [0.63529412 0.61797753 0.60465116 0.54255319 0.6091954 ]
Cross-validation scores Recall       : [0.62790698 0.63953488 0.60465116 0.59302326 0.61627907]
Cross-validation scores F1           : [0.63157895 0.62857143 0.60465116 0.56666667 0.61271676]
Cross-validation scores Accuracy     : [0.63372093 0.62209302 0.60465116 0.54651163 0.61046512]
Mean cross-validation score Precision: 0.6019342804631712
Mean cross-validation score Recall   : 0.6162790697674418
Mean cross-validation score F1       : 0.6088369936805987
Mean cross-validation score Accuracy : 0.6034883720930232
Precision: 0.09289617486338798
Recall: 0.6355140186915887
F1 Score: 0.16209773539928488
Accuracy: 0.5817965496728138
[[910 664]
 [ 39  68]]


In [100]:
clf_vitals_under_sampled_5, _, _ = RocketMulti(vitals_grouped,feature_vitals, num_kernels = 5, resampling = "Under")

Under sampling: Train: 860 Test: 1681
X_train shape:  (860, 11, 96) 
y_train shape:  (860,)
Cross-validation scores Precision    : [0.57723577 0.58208955 0.65789474 0.55833333 0.58088235]
Cross-validation scores Recall       : [0.8255814  0.90697674 0.87209302 0.77906977 0.91860465]
Cross-validation scores F1           : [0.67942584 0.70909091 0.75       0.65048544 0.71171171]
Cross-validation scores Accuracy     : [0.61046512 0.62790698 0.70930233 0.58139535 0.62790698]
Mean cross-validation score Precision: 0.591287149542629
Mean cross-validation score Recall   : 0.8604651162790699
Mean cross-validation score F1       : 0.7001427790032798
Mean cross-validation score Accuracy : 0.6313953488372093
Precision: 0.09021842355175688
Recall: 0.8878504672897196
F1 Score: 0.16379310344827586
Accuracy: 0.4229625223081499
[[616 958]
 [ 12  95]]


In [101]:
clf_vitals_under_sampled_10, _, _ = RocketMulti(vitals_grouped,feature_vitals, num_kernels = 10, resampling = "Under")

Under sampling: Train: 860 Test: 1681
X_train shape:  (860, 11, 96) 
y_train shape:  (860,)
Cross-validation scores Precision    : [0.60714286 0.61904762 0.59649123 0.5530303  0.6       ]
Cross-validation scores Recall       : [0.79069767 0.75581395 0.79069767 0.84883721 0.80232558]
Cross-validation scores F1           : [0.68686869 0.68062827 0.68       0.66972477 0.68656716]
Cross-validation scores Accuracy     : [0.63953488 0.64534884 0.62790698 0.58139535 0.63372093]
Mean cross-validation score Precision: 0.595142401458191
Mean cross-validation score Recall   : 0.7976744186046513
Mean cross-validation score F1       : 0.6807577787882604
Mean cross-validation score Accuracy : 0.6255813953488373
Precision: 0.09621621621621622
Recall: 0.8317757009345794
F1 Score: 0.17248062015503876
Accuracy: 0.49196906603212376
[[738 836]
 [ 18  89]]


In [102]:
clf_vitals_under_sampled_25, _, _ = RocketMulti(vitals_grouped,feature_vitals, num_kernels = 25, resampling = "Under")

Under sampling: Train: 860 Test: 1681
X_train shape:  (860, 11, 96) 
y_train shape:  (860,)
Cross-validation scores Precision    : [0.60176991 0.62765957 0.55454545 0.65656566 0.56190476]
Cross-validation scores Recall       : [0.79069767 0.68604651 0.70930233 0.75581395 0.68604651]
Cross-validation scores F1           : [0.68341709 0.65555556 0.62244898 0.7027027  0.61780105]
Cross-validation scores Accuracy     : [0.63372093 0.63953488 0.56976744 0.68023256 0.5755814 ]
Mean cross-validation score Precision: 0.6004890717976765
Mean cross-validation score Recall   : 0.7255813953488371
Mean cross-validation score F1       : 0.65638507407953
Mean cross-validation score Accuracy : 0.6197674418604652
Precision: 0.09694989106753812
Recall: 0.8317757009345794
F1 Score: 0.17365853658536584
Accuracy: 0.496133254015467
[[745 829]
 [ 18  89]]


In [103]:
clf_vitals_under_sampled_50, _, _ = RocketMulti(vitals_grouped,feature_vitals, num_kernels = 50, resampling = "Under")

Under sampling: Train: 860 Test: 1681
X_train shape:  (860, 11, 96) 
y_train shape:  (860,)
Cross-validation scores Precision    : [0.64444444 0.60909091 0.6344086  0.64285714 0.62921348]
Cross-validation scores Recall       : [0.6744186  0.77906977 0.68604651 0.73255814 0.65116279]
Cross-validation scores F1           : [0.65909091 0.68367347 0.65921788 0.68478261 0.64      ]
Cross-validation scores Accuracy     : [0.65116279 0.63953488 0.64534884 0.6627907  0.63372093]
Mean cross-validation score Precision: 0.6320029163378204
Mean cross-validation score Recall   : 0.7046511627906977
Mean cross-validation score F1       : 0.6653529728538577
Mean cross-validation score Accuracy : 0.6465116279069768
Precision: 0.0859277708592777
Recall: 0.6448598130841121
F1 Score: 0.15164835164835164
Accuracy: 0.5407495538370017
[[840 734]
 [ 38  69]]


In [104]:
clf_vitals_under_sampled_75, _, _ = RocketMulti(vitals_grouped,feature_vitals, num_kernels = 75, resampling = "Under")

Under sampling: Train: 860 Test: 1681
X_train shape:  (860, 11, 96) 
y_train shape:  (860,)
Cross-validation scores Precision    : [0.59302326 0.61061947 0.64444444 0.58490566 0.66666667]
Cross-validation scores Recall       : [0.59302326 0.80232558 0.6744186  0.72093023 0.72093023]
Cross-validation scores F1           : [0.59302326 0.69346734 0.65909091 0.64583333 0.69273743]
Cross-validation scores Accuracy     : [0.59302326 0.64534884 0.65116279 0.60465116 0.68023256]
Mean cross-validation score Precision: 0.6199318992657943
Mean cross-validation score Recall   : 0.7023255813953488
Mean cross-validation score F1       : 0.6568304530178422
Mean cross-validation score Accuracy : 0.6348837209302325
Precision: 0.09044585987261146
Recall: 0.6635514018691588
F1 Score: 0.1591928251121076
Accuracy: 0.553837001784652
[[860 714]
 [ 36  71]]


#### 2.3.2.2 oversamplimg for vitals

In [105]:
clf_vitals_over_sampled_default, _, _ = RocketMulti(vitals_grouped,feature_vitals, resampling = "Over")

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 11, 96) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.73348694 0.7377839  0.73831124 0.72682236 0.71684588]
Cross-validation scores Recall       : [0.75853852 0.85146942 0.84034948 0.8163752  0.79428118]
Cross-validation scores F1           : [0.74580242 0.79056047 0.78603269 0.76900037 0.7535795 ]
Cross-validation scores Accuracy     : [0.74146148 0.77442415 0.77124702 0.75486691 0.74016687]
Mean cross-validation score Precision: 0.7306500631250846
Mean cross-validation score Recall   : 0.8122027601586541
Mean cross-validation score F1       : 0.7689950918770181
Mean cross-validation score Accuracy : 0.7564332830635713
Precision: 0.11567164179104478
Recall: 0.5794392523364486
F1 Score: 0.192846034214619
Accuracy: 0.6912552052349792
[[1100  474]
 [  45   62]]


In [106]:
clf_vitals_over_sampled_5, _, _ = RocketMulti(vitals_grouped,feature_vitals, num_kernels = 5, resampling = "Over")

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 11, 96) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.58651805 0.58885942 0.60298178 0.57997936 0.5904    ]
Cross-validation scores Recall       : [0.8776807  0.8816521  0.86735504 0.89348172 0.87926926]
Cross-validation scores F1           : [0.70314986 0.70610687 0.71140065 0.70337922 0.70644544]
Cross-validation scores Accuracy     : [0.62946783 0.6330421  0.64813344 0.62336114 0.6344855 ]
Mean cross-validation score Precision: 0.5897477202686126
Mean cross-validation score Recall   : 0.8798877651655299
Mean cross-validation score F1       : 0.7060964079381133
Mean cross-validation score Accuracy : 0.6336980021161897
Precision: 0.09194312796208531
Recall: 0.9065420560747663
F1 Score: 0.16695352839931155
Accuracy: 0.42415229030339086
[[616 958]
 [ 10  97]]


In [107]:
clf_vitals_over_sampled_10, _, _ = RocketMulti(vitals_grouped,feature_vitals, num_kernels = 10, resampling = "Over")

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 11, 96) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.63897973 0.63563004 0.62168675 0.62262997 0.62967431]
Cross-validation scores Recall       : [0.77601271 0.81334392 0.81969817 0.80857824 0.82988871]
Cross-validation scores F1           : [0.70086083 0.71358885 0.70709147 0.70352453 0.71604938]
Cross-validation scores Accuracy     : [0.66878475 0.67355044 0.6604448  0.659118   0.67103695]
Mean cross-validation score Precision: 0.6297201583124179
Mean cross-validation score Recall   : 0.8095043508677111
Mean cross-validation score F1       : 0.708223013645404
Mean cross-validation score Accuracy : 0.6665869860958192
Precision: 0.08833522083805209
Recall: 0.7289719626168224
F1 Score: 0.15757575757575756
Accuracy: 0.5038667459845331
[[769 805]
 [ 29  78]]


In [108]:
clf_vitals_over_sampled_25, _, _ = RocketMulti(vitals_grouped,feature_vitals, num_kernels = 25, resampling = "Over")

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 11, 96) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.68736617 0.67487047 0.68940871 0.66709594 0.69138277]
Cross-validation scores Recall       : [0.76489277 0.82764098 0.84273233 0.82352941 0.82208102]
Cross-validation scores F1           : [0.72406015 0.74348912 0.75839886 0.73710423 0.75108853]
Cross-validation scores Accuracy     : [0.70849881 0.71445592 0.73153296 0.7063965  0.72745332]
Mean cross-validation score Precision: 0.6820248098327812
Mean cross-validation score Recall   : 0.8161753025276829
Mean cross-validation score F1       : 0.7428281785958962
Mean cross-validation score Accuracy : 0.7176675019715024
Precision: 0.09636650868878358
Recall: 0.5700934579439252
F1 Score: 0.16486486486486485
Accuracy: 0.6323616894705533
[[1002  572]
 [  46   61]]


In [109]:
clf_vitals_over_sampled_50, _, _ = RocketMulti(vitals_grouped,feature_vitals, num_kernels = 50, resampling = "Over")

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 11, 96) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.66126856 0.68596141 0.66189857 0.66283006 0.65931213]
Cross-validation scores Recall       : [0.77839555 0.81890389 0.80857824 0.7782194  0.80698967]
Cross-validation scores F1           : [0.71506749 0.74656046 0.72792277 0.71590494 0.72571429]
Cross-validation scores Accuracy     : [0.6898332  0.72200159 0.69777601 0.69129917 0.69487485]
Mean cross-validation score Precision: 0.6662541464000383
Mean cross-validation score Recall   : 0.7982173501820281
Mean cross-validation score F1       : 0.7262339906355637
Mean cross-validation score Accuracy : 0.6991569637821037
Precision: 0.11079545454545454
Recall: 0.7289719626168224
F1 Score: 0.19235511713933415
Accuracy: 0.6103509815585961
[[948 626]
 [ 29  78]]


In [110]:
clf_vitals_over_sampled_75, _, _ = RocketMulti(vitals_grouped,feature_vitals, num_kernels = 75, resampling = "Over")

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 11, 96) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.6751269  0.68181818 0.68745571 0.68989071 0.68450704]
Cross-validation scores Recall       : [0.73947577 0.78633836 0.77045274 0.80286169 0.7720413 ]
Cross-validation scores F1           : [0.70583776 0.7303578  0.72659176 0.7421014  0.7256439 ]
Cross-validation scores Accuracy     : [0.6918189  0.70969023 0.71008737 0.72109654 0.7079857 ]
Mean cross-validation score Precision: 0.6837597086362304
Mean cross-validation score Recall   : 0.7742339732621468
Mean cross-validation score F1       : 0.726106522146669
Mean cross-validation score Accuracy : 0.7081357491851281
Precision: 0.10623946037099494
Recall: 0.5887850467289719
F1 Score: 0.17999999999999997
Accuracy: 0.6585365853658537
[[1044  530]
 [  44   63]]


## 2.4 Multimodal

In [111]:
X=static_demo_comorb.drop(['ckd'],axis=1)
y=static_demo_comorb['ckd']

X_onehot=pd.get_dummies(X)
X_train, X_test, y_train, y_test = train_test_split(X_onehot, y, test_size=0.2,stratify = y, random_state=42)

mm_test_ids = X_test['icustay_id']

X_train =X_train.drop(['icustay_id'],axis=1)
X_test  =X_test.drop(['icustay_id'],axis=1)

In [112]:
#### Best model for static - Under sampled Random forest model
rf_mm, weight_static, prob_static = RandomForestForMulti(X_train, X_test, y_train, y_test, resampling='under')

under sampling: Train: 860 Test: 1681
Cross-validation scores Precision    : [0.58333333 0.62068966 0.5875     0.63235294 0.72463768]
Cross-validation scores Recall       : [0.56976744 0.41860465 0.54651163 0.5        0.58139535]
Cross-validation scores F1           : [0.57647059 0.5        0.56626506 0.55844156 0.64516129]
Cross-validation scores Accuracy     : [0.58139535 0.58139535 0.58139535 0.60465116 0.68023256]
Mean cross-validation score Precision: 0.6297027221683276
Mean cross-validation score Recall   : 0.5232558139534884
Mean cross-validation score F1       : 0.5692676994480794
Mean cross-validation score Accuracy : 0.605813953488372
Precision: 0.12280701754385964
Recall: 0.5233644859813084
F1 Score: 0.19893428063943158
Accuracy: 0.7317073170731707
[[1174  400]
 [  51   56]]


In [113]:
#### Best model for labs - Under sampled Rocket model with 50 kernels
clf_lab_mm, weight_lab, prob_lab = RocketMulti(labs_grouped,feature_labs, 50, resampling = "Under", filtered_test_ids = mm_test_ids)

Under sampling: Train: 860 Test: 1681
X_train shape:  (860, 20, 12) 
y_train shape:  (860,)
Cross-validation scores Precision    : [0.63095238 0.61616162 0.64044944 0.59090909 0.60416667]
Cross-validation scores Recall       : [0.61627907 0.70930233 0.6627907  0.60465116 0.6744186 ]
Cross-validation scores F1           : [0.62352941 0.65945946 0.65142857 0.59770115 0.63736264]
Cross-validation scores Accuracy     : [0.62790698 0.63372093 0.64534884 0.59302326 0.61627907]
Mean cross-validation score Precision: 0.6165278385784003
Mean cross-validation score Recall   : 0.6534883720930232
Mean cross-validation score F1       : 0.6338962458881323
Mean cross-validation score Accuracy : 0.6232558139534884
Precision: 0.10735294117647058
Recall: 0.6822429906542056
F1 Score: 0.18551461245235068
Accuracy: 0.6186793575252826
[[967 607]
 [ 34  73]]


In [114]:
#### Best model for vitals - Under sampled Rocket model
clf_vital_mm, weight_vital, prob_vital = RocketMulti(vitals_grouped,feature_vitals, resampling = "Over", filtered_test_ids = mm_test_ids)

Over sampling: Train: 12588 Test: 1681
X_train shape:  (12588, 11, 96) 
y_train shape:  (12588,)
Cross-validation scores Precision    : [0.73007519 0.74135546 0.72364672 0.73706294 0.71722718]
Cross-validation scores Recall       : [0.77124702 0.85146942 0.80698967 0.83783784 0.83002383]
Cross-validation scores F1           : [0.75009656 0.79260628 0.76304919 0.78422619 0.76951399]
Cross-validation scores Accuracy     : [0.74305004 0.77720413 0.74940429 0.76956694 0.75129122]
Mean cross-validation score Precision: 0.7298734982323919
Mean cross-validation score Recall   : 0.8195135564476311
Mean cross-validation score F1       : 0.7718984442633767
Mean cross-validation score Accuracy : 0.758103324715209
Precision: 0.11089866156787763
Recall: 0.5420560747663551
F1 Score: 0.18412698412698414
Accuracy: 0.6942296252230815
[[1109  465]
 [  49   58]]


In [115]:
def soft_voting(clf_static, clf_lab, clf_vital, weight_static, weight_lab, weight_vital,
               prob_static, prob_lab, prob_vital, y_test):
    
    weights = np.array([weight_static, weight_lab, weight_vital])
    shift_positive = abs(weights.min()) + 2
    weight_static += shift_positive
    weight_lab += shift_positive
    weight_vital += shift_positive
    
    weighted_prob = ((weight_static * prob_static) +  (weight_lab * prob_lab) + (weight_vital * prob_vital)) / np.sum([weight_static, weight_lab, weight_vital])

    y_pred = np.argmax(weighted_prob, axis=1)
    
    metricsReport(y_test, y_pred)

In [116]:
print("weight_static",weight_static)
print("weight_lab",weight_lab)
print("weight_vital",weight_vital) 

weight_static -1.392968468207584
weight_lab -1.4794228352123338
weight_vital -1.4886330743488003


In [117]:
soft_voting(rf_mm, clf_lab_mm, clf_vital_mm, weight_static, weight_lab, weight_vital,
               prob_static, prob_lab, prob_vital, y_test)

Precision: 0.07976653696498054
Recall: 0.38317757009345793
F1 Score: 0.13204508856682767
Accuracy: 0.6793575252825699


In [118]:
def RockyOldWrong(grouped_data_labs, feature_labs):
    X = []
    y = []
    for _, group in grouped_data_labs:
        group_values = group[feature_labs].values.T            
        X.append(group_values)
        y.append(group['ckd'].iloc[0])        
        
    X = np.array(X)
    y = np.array(y)  
        
    n_samples, n_features, n_channels = X.shape
    X_2d = X.reshape((n_samples, n_features*n_channels))
    rusLab = SMOTE(random_state=42)
    X_resampled, y_resampled = rusLab.fit_resample(X_2d, y)
    X_resampled = X_resampled.reshape((X_resampled.shape[0], n_features, n_channels))    
    X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, test_size=0.2)
    
    rocket = Rocket(num_kernels=100)
    rocket.fit(X_train)
    X_train_transformed = rocket.transform(X_train)
    X_test_transformed = rocket.transform(X_test)
    clf = LogisticRegression()
    clf.fit(X_train_transformed, y_train)
    y_pred = clf.predict(X_test_transformed)
    
    metricsReport(y_test, y_pred)
    print(classification_report(y_test, y_pred))
    print(confusion_matrix(y_test, y_pred))
    
RockyOldWrong(labs_grouped,feature_labs)

Precision: 0.7545830869308102
Recall: 0.8010043942247332
F1 Score: 0.7771010962241169
Accuracy: 0.7674714104193139
              precision    recall  f1-score   support

           0       0.78      0.73      0.76      1555
           1       0.75      0.80      0.78      1593

    accuracy                           0.77      3148
   macro avg       0.77      0.77      0.77      3148
weighted avg       0.77      0.77      0.77      3148

[[1140  415]
 [ 317 1276]]
