In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.model_selection import GroupKFold


In [3]:
def missing_values_table(df):
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * df.isnull().sum() / len(df)
    mis_val_table = pd.concat([mis_val, mis_val_percent], axis=1)
    mis_val_table_ren_columns = mis_val_table.rename(
    columns = {0 : 'Missing Values', 1 : '% of Total Values'})
    mis_val_table_ren_columns = mis_val_table_ren_columns[
        mis_val_table_ren_columns.iloc[:,1] != 0].sort_values(
    '% of Total Values', ascending=False).round(1)
    print ("Your selected dataframe has " + str(df.shape[1]) + " columns.\n"      
        "There are " + str(mis_val_table_ren_columns.shape[0]) +
            " columns that have missing values.")
    return mis_val_table_ren_columns

load lab data and preprocessed text with granular labels
select alle positiven, davon die, die granular label 0 bis step 4. Davon alle die, die bis granular label 16 (12h später) 1 haben. Diese Ids sind die positiven die interessant sind. Mit allen negativen zusammen, train/test split. Vom Train/test split trainIDs to csv, und testIDs to csv.

In [4]:
lab = pd.read_csv("patients_no_text_sepsis_labels.csv")
text = pd.read_csv("noteevents_preprocessed_text_granular_label.csv")

In [5]:
missing_values_table(lab)

Your selected dataframe has 57 columns.
There are 3 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
o:output_4hourly,1102706,55.4
o:output_total,1102706,55.4
o:PaO2_FiO2,889233,44.6


In [6]:
missing_values_table(text)

Your selected dataframe has 16 columns.
There are 5 columns that have missing values.


Unnamed: 0,Missing Values,% of Total Values
iserror,1310775,99.9
storetime,583274,44.5
charttime,220971,16.8
hadm_id,94169,7.2
clean_text,84,0.0


In [7]:
text.dropna(subset=['clean_text'], inplace=True)

In [12]:
lab.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'a:action', 'm:charttime', 'm:icustayid',
       'm:presumed_onset', 'o:Arterial_BE', 'o:Arterial_lactate',
       'o:Arterial_pH', 'o:BUN', 'o:Calcium', 'o:Chloride', 'o:Creatinine',
       'o:DiaBP', 'o:FiO2_1', 'o:GCS', 'o:Glucose', 'o:HCO3', 'o:HR', 'o:Hb',
       'o:INR', 'o:Magnesium', 'o:MeanBP', 'o:PT', 'o:PTT', 'o:PaO2_FiO2',
       'o:Platelets_count', 'o:Potassium', 'o:RR', 'o:SGOT', 'o:SGPT',
       'o:SIRS', 'o:SOFA', 'o:Shock_Index', 'o:Sodium', 'o:SpO2', 'o:SysBP',
       'o:Temp_C', 'o:Total_bili', 'o:WBC_count', 'o:Weight_kg', 'o:age',
       'o:cumulated_balance', 'o:gender', 'o:input_4hourly', 'o:input_total',
       'o:max_dose_vaso', 'o:mechvent', 'o:output_4hourly', 'o:output_total',
       'o:paCO2', 'o:paO2', 'o:re_admission', 'r:reward', 'step', 'traj',
       'sepsis_label'],
      dtype='object')

In [13]:
text.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'subject_id', 'hadm_id', 'category',
       'description', 'iserror', 'text', 'charttime', 'chartdate', 'storetime',
       'm:icustayid', 'sepsis_label', 'm:presumed_onset', 'clean_text',
       'granular_label'],
      dtype='object')

create stepwise label for lab data. If charttime < presumed onset == 0, else 1 (for patients that are flagged as septic).

In [8]:
lab["granular_label"] = np.where(((lab["m:charttime"] >= lab["m:presumed_onset"]) & (lab["m:presumed_onset"] > 0.0)), 1, 0)

In [9]:
lab_id = np.unique(lab["m:icustayid"])
text_id = np.unique(text["m:icustayid"])
full_IDs = list(set(lab_id) & set(text_id))
len(full_IDs)

41774

In [10]:
lab = lab.loc[lab["m:icustayid"].isin(full_IDs)]
text = text.loc[text["m:icustayid"].isin(full_IDs)]

select all positive patients, then set flag if still negative after 4 hours.

In [11]:
lab_neg = lab.loc[lab["sepsis_label"]==0]
lab_pos = lab.loc[lab["sepsis_label"]==1]
lab_pos["0_4hrs"] = np.where(((lab_pos["step"] <= 4) & (lab_pos["granular_label"] == 0)), 1, 0)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lab_pos["0_4hrs"] = np.where(((lab_pos["step"] <= 4) & (lab_pos["granular_label"] == 0)), 1, 0)


select positive patients which are negative after 4 hours

In [12]:
lab_pos_0_4hrs_Ids = np.unique(lab_pos["m:icustayid"].loc[lab_pos["0_4hrs"]==1])
lab_pos_0_4hrs = lab_pos.loc[lab_pos["m:icustayid"].isin(lab_pos_0_4hrs_Ids)]

for those patients, set flag if patients get positive within next 12 hours

In [13]:
lab_pos_0_4hrs["1_16hrs"] = np.where(((lab_pos_0_4hrs["step"] <= 16) & (lab_pos_0_4hrs["granular_label"] == 1)), 1, 0)
lab_pos_04_116_Ids = np.unique(lab_pos_0_4hrs["m:icustayid"].loc[lab_pos_0_4hrs["1_16hrs"]==1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lab_pos_0_4hrs["1_16hrs"] = np.where(((lab_pos_0_4hrs["step"] <= 16) & (lab_pos_0_4hrs["granular_label"] == 1)), 1, 0)


In [14]:
print("{} patients that are negative after 4 hours but develop sepsis within the next 12 hours".format(len(lab_pos_04_116_Ids)))

7369 patients that are negative after 4 hours but develop sepsis within the next 12 hours


select those patients from lab_pos, combine with all negative patients and do train/test split.

In [15]:
lab_neg_IDs = np.unique(lab_neg["m:icustayid"])
IDs_negative_sep_develop = list(lab_pos_04_116_Ids) + list(lab_neg_IDs)
lab_full_of_interest = lab.loc[lab["m:icustayid"].isin(IDs_negative_sep_develop)]

In [16]:
len(IDs_negative_sep_develop)

31170

In [17]:
lab_full_of_interest["min_len_16"] = np.where(lab_full_of_interest["step"] >= 16, 1, 0)
min_len_16_IDs = np.unique(lab_full_of_interest["m:icustayid"].loc[lab_full_of_interest["min_len_16"]==1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lab_full_of_interest["min_len_16"] = np.where(lab_full_of_interest["step"] >= 16, 1, 0)


In [18]:
lab_full_min_16 = lab_full_of_interest.loc[lab_full_of_interest["m:icustayid"].isin(min_len_16_IDs)]

check that each patient has atleast 16 hours of data

In [19]:
Counter(lab_full_min_16["step"])

Counter({0: 30502,
         1: 30502,
         2: 30502,
         3: 30502,
         4: 30502,
         5: 30502,
         6: 30502,
         7: 30502,
         8: 30502,
         9: 30502,
         10: 30502,
         11: 30502,
         12: 30502,
         13: 30502,
         14: 30502,
         15: 30502,
         16: 30502,
         17: 30185,
         18: 29819,
         19: 29399,
         20: 28902,
         21: 28346,
         22: 27766,
         23: 27185,
         24: 26493,
         25: 25756,
         26: 25002,
         27: 24152,
         28: 23446,
         29: 22652,
         30: 21999,
         31: 21386,
         32: 20807,
         33: 20261,
         34: 19765,
         35: 19295,
         36: 18858,
         37: 18445,
         38: 18061,
         39: 17680,
         40: 17333,
         41: 16924,
         42: 16504,
         43: 16057,
         44: 15615,
         45: 15176,
         46: 14675,
         47: 14187,
         48: 13715,
         49: 13238,
         5

In [20]:
lab_full_min_16.drop(['min_len_16'], axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  lab_full_min_16.drop(['min_len_16'], axis=1, inplace=True)


In [21]:
lab_full_min_16.columns

Index(['Unnamed: 0.1', 'Unnamed: 0', 'a:action', 'm:charttime', 'm:icustayid',
       'm:presumed_onset', 'o:Arterial_BE', 'o:Arterial_lactate',
       'o:Arterial_pH', 'o:BUN', 'o:Calcium', 'o:Chloride', 'o:Creatinine',
       'o:DiaBP', 'o:FiO2_1', 'o:GCS', 'o:Glucose', 'o:HCO3', 'o:HR', 'o:Hb',
       'o:INR', 'o:Magnesium', 'o:MeanBP', 'o:PT', 'o:PTT', 'o:PaO2_FiO2',
       'o:Platelets_count', 'o:Potassium', 'o:RR', 'o:SGOT', 'o:SGPT',
       'o:SIRS', 'o:SOFA', 'o:Shock_Index', 'o:Sodium', 'o:SpO2', 'o:SysBP',
       'o:Temp_C', 'o:Total_bili', 'o:WBC_count', 'o:Weight_kg', 'o:age',
       'o:cumulated_balance', 'o:gender', 'o:input_4hourly', 'o:input_total',
       'o:max_dose_vaso', 'o:mechvent', 'o:output_4hourly', 'o:output_total',
       'o:paCO2', 'o:paO2', 'o:re_admission', 'r:reward', 'step', 'traj',
       'sepsis_label', 'granular_label'],
      dtype='object')

split data

GroupKFold is a variation of k-fold which ensures that the same group is not represented in both testing and training sets. For example if the data is obtained from different subjects with several samples per-subject and if the model is flexible enough to learn from highly person specific features it could fail to generalize to new subjects. GroupKFold makes it possible to detect this kind of overfitting situations.

patients are groups and i make sure that one patient is not in both train and test.

In [22]:
gkf = GroupKFold(n_splits=4)
groups = lab_full_min_16["m:icustayid"].to_numpy()
y = lab_full_min_16["sepsis_label"].to_numpy()
X = lab_full_min_16.drop(['sepsis_label', 'm:icustayid', 'granular_label'], axis=1).to_numpy()


In [23]:
for i, (train_index, test_index) in enumerate(gkf.split(X, groups=groups)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}, group={groups[train_index]}")
    print(f"  Test:  index={test_index}, group={groups[test_index]}")
    X_train = lab_full_min_16.iloc[train_index]
    y_train = y[train_index]
    X_test_ = lab_full_min_16.iloc[test_index]
    y_test_ = y[test_index]


Fold 0:
  Train: index=[      0       1       2 ... 1404570 1404571 1404572], group=[200011. 200011. 200011. ... 299999. 299999. 299999.]
  Test:  index=[    468     469     470 ... 1404327 1404328 1404329], group=[200135. 200135. 200135. ... 299972. 299972. 299972.]
Fold 1:
  Train: index=[      0       1       2 ... 1404570 1404571 1404572], group=[200011. 200011. 200011. ... 299999. 299999. 299999.]
  Test:  index=[     96      97      98 ... 1404451 1404452 1404453], group=[200066. 200066. 200066. ... 299979. 299979. 299979.]
Fold 2:
  Train: index=[      0       1       2 ... 1404570 1404571 1404572], group=[200011. 200011. 200011. ... 299999. 299999. 299999.]
  Test:  index=[    197     198     199 ... 1404477 1404478 1404479], group=[200068. 200068. 200068. ... 299987. 299987. 299987.]
Fold 3:
  Train: index=[     96      97      98 ... 1404477 1404478 1404479], group=[200066. 200066. 200066. ... 299987. 299987. 299987.]
  Test:  index=[      0       1       2 ... 1404570 140457

In [24]:

print(len(X_test_) / len(X_train), "ratio of test/train data")
print("Class balance in train data rows: {}".format(Counter(y_train)))
print("Class balance in test data rows: {}".format(Counter(y_test_)))

0.33334314263419595 ratio of test/train data
Class balance in train data rows: Counter({0: 817024, 1: 236398})
Class balance in test data rows: Counter({0: 272461, 1: 78690})


In [25]:
train_IDs = np.unique(X_train["m:icustayid"])
test_IDs = np.unique(X_test_["m:icustayid"])
print(len(train_IDs), len(test_IDs))
np.intersect1d(train_IDs,test_IDs)

22876 7626


array([], dtype=float64)

split the test data again to get a validation set

In [26]:
gkf_val = GroupKFold(n_splits=3)
groups_ = X_test_["m:icustayid"].to_numpy()
y_ = X_test_["sepsis_label"].to_numpy()
X_ = X_test_.drop(['sepsis_label', 'm:icustayid', 'granular_label'], axis=1).to_numpy()

In [27]:
for i, (train_index, test_index) in enumerate(gkf_val.split(X_, groups=groups_)):
    print(f"Fold {i}:")
    print(f"  Train: index={train_index}, group={groups[train_index]}")
    print(f"  Test:  index={test_index}, group={groups[test_index]}")
    X_test = X_test_.iloc[train_index]
    y_test = y_[train_index]
    X_val = X_test_.iloc[test_index]
    y_val = y_[test_index]

Fold 0:
  Train: index=[    42     43     44 ... 351123 351124 351125], group=[200063. 200063. 200063. ... 203330. 203330. 203330.]
  Test:  index=[     0      1      2 ... 351148 351149 351150], group=[200011. 200011. 200011. ... 203330. 203330. 203330.]
Fold 1:
  Train: index=[     0      1      2 ... 351148 351149 351150], group=[200011. 200011. 200011. ... 203330. 203330. 203330.]
  Test:  index=[    96     97     98 ... 350511 350512 350513], group=[200066. 200066. 200066. ... 203277. 203277. 203277.]
Fold 2:
  Train: index=[     0      1      2 ... 351148 351149 351150], group=[200011. 200011. 200011. ... 203330. 203330. 203330.]
  Test:  index=[    42     43     44 ... 351123 351124 351125], group=[200063. 200063. 200063. ... 203330. 203330. 203330.]


In [28]:

print(len(X_val) / len(X_test), "ratio of val/test data")
print("Class balance in test data rows: {}".format(Counter(y_test)))
print("Class balance in val data rows: {}".format(Counter(y_val)))

0.4999978641697387 ratio of val/test data
Class balance in test data rows: Counter({0: 181939, 1: 52162})
Class balance in val data rows: Counter({0: 90522, 1: 26528})


In [29]:
test_IDs = np.unique(X_test["m:icustayid"])
val_IDs = np.unique(X_val["m:icustayid"])
print(np.intersect1d(train_IDs,test_IDs))
print(np.intersect1d(train_IDs,val_IDs))
print(np.intersect1d(test_IDs,val_IDs))

[]
[]
[]


In [39]:
list(test_IDs)

[200011.0,
 200062.0,
 200079.0,
 200112.0,
 200224.0,
 200232.0,
 200281.0,
 200297.0,
 200298.0,
 200360.0,
 200361.0,
 200364.0,
 200379.0,
 200386.0,
 200387.0,
 200392.0,
 200430.0,
 200450.0,
 200509.0,
 200535.0,
 200557.0,
 200566.0,
 200575.0,
 200637.0,
 200661.0,
 200677.0,
 200684.0,
 200686.0,
 200748.0,
 200754.0,
 200777.0,
 200799.0,
 200824.0,
 200845.0,
 200851.0,
 200873.0,
 200893.0,
 200902.0,
 200931.0,
 200932.0,
 200934.0,
 200947.0,
 200954.0,
 200961.0,
 200983.0,
 200993.0,
 201001.0,
 201018.0,
 201020.0,
 201024.0,
 201028.0,
 201043.0,
 201095.0,
 201096.0,
 201160.0,
 201164.0,
 201173.0,
 201184.0,
 201206.0,
 201211.0,
 201220.0,
 201261.0,
 201264.0,
 201271.0,
 201324.0,
 201326.0,
 201371.0,
 201400.0,
 201401.0,
 201444.0,
 201449.0,
 201466.0,
 201483.0,
 201543.0,
 201569.0,
 201584.0,
 201585.0,
 201587.0,
 201746.0,
 201756.0,
 201866.0,
 201876.0,
 201877.0,
 201881.0,
 201893.0,
 201939.0,
 201951.0,
 201970.0,
 201982.0,
 202010.0,
 202019.0,

In [53]:
t = pd.DataFrame(train_IDs)
te = pd.DataFrame(test_IDs)
tes = pd.DataFrame(val_IDs)
ID = pd.concat([t, te, tes],axis=1)
IDs = ID.set_axis(["train_ids", "test_ids", "val_ids"], axis=1)
IDs

Unnamed: 0,train_ids,test_ids,val_ids
0,200006.0,200011.0,200007.0
1,200010.0,200062.0,200009.0
2,200016.0,200079.0,200028.0
3,200021.0,200112.0,200036.0
4,200025.0,200224.0,200063.0
...,...,...,...
22871,299971.0,,
22872,299972.0,,
22873,299979.0,,
22874,299987.0,,


In [54]:
IDs.to_csv("IDs.csv")