In [None]:
import os
import pandas as pd
from sklearn.model_selection import train_test_split

### Directory Setup
- Store the feature sets for all ICU admissions in `data_dir`.
- Save the model outputs in `output_dir`.

In [3]:
# Snapshot identifier: used below both as a sub-directory of `data_dir` and as the CSV file-name prefix.
date = "20250420"
# Root directory of the input data; derived CSVs are written to `data_dir/<date>/`.
data_dir = '../data/'
# Destination for model outputs (not referenced by the cells visible in this notebook section).
output_dir = '../output/'

#### 1. Create DataFrame
- Load the CSV file for the configured `date` and cast `icu_stay_id` to `int64`. The random sampling of the laboratory measurements, which produces a DataFrame with one row per (`icu_stay_id`, `time_window_index`) pair, is performed in step 2.

In [4]:
# Read the snapshot CSV of the configured date and store the stay identifier as a 64-bit integer.
snapshot_csv_path = os.path.join(data_dir, date, f'{date}.csv')
df_dic_prediction_all = pd.read_csv(snapshot_csv_path).astype({"icu_stay_id": "int64"})

In [5]:
# Sanity check: number of distinct ICU stays present in the loaded table
df_dic_prediction_all['icu_stay_id'].nunique(dropna=False)

7532

#### 2. Random Sampling of Laboratory Measurements
Unlike the other features, the laboratory measurements are not aggregated within each time window, so a single time window can contain several records.
Group the data by `icu_stay_id` and `time_window_index`, and randomly select one record from each group.

In [15]:
def random_select_from_time_window(group: pd.DataFrame) -> pd.DataFrame:
    """Pick one random row for every (`icu_stay_id`, `time_window_index`) pair.

    Every group is sampled with the same fixed seed (42), so the selection is
    reproducible across runs. The grouping columns are kept in the result.

    Parameters
    ----------
    group : pd.DataFrame
        Table containing at least `icu_stay_id` and `time_window_index`.

    Returns
    -------
    pd.DataFrame
        One row per group, ordered by the sorted group keys, with a fresh
        0..n-1 index. An empty input yields an empty frame with the same columns.
    """
    # Iterate over the groups instead of calling `groupby(...).apply(...)`:
    # `apply` operating on the grouping columns is deprecated in pandas and
    # emits a warning, while plain iteration hands over the full sub-frame
    # (grouping columns included) on every pandas version.
    sampled_rows = [
        window_rows.sample(n=1, random_state=42)
        for _, window_rows in group.groupby(['icu_stay_id', 'time_window_index'])
    ]
    if not sampled_rows:
        # `pd.concat` rejects an empty list; keep the schema and return no rows.
        return group.iloc[0:0].reset_index(drop=True)
    return pd.concat(sampled_rows).reset_index(drop=True)

In [16]:
# Keep one randomly chosen record per (icu_stay_id, time_window_index) pair ...
df_dic_sampled = random_select_from_time_window(df_dic_prediction_all)
# ... then substitute None wherever a cell is missing.
df_dic_sampled = df_dic_sampled.where(df_dic_sampled.notna(), None)

  return group.groupby(['icu_stay_id', 'time_window_index']).apply(lambda x: x.sample(n=1, random_state=42)).reset_index(drop=True)


#### 3. Retrieve Variables used for models from the Feature Set

In [None]:
# Feature-set table: the `Feature` column names a variable, and each
# '<name> model' column marks with 1 the variables that model uses.
features = pd.read_csv(os.path.join(data_dir, 'featureset.csv'))

# Identifier / bookkeeping columns carried along with every model's feature list.
ids_and_variables_for_statistics = [
    'icu_stay_id', 'hospital_id', 'in_time', 'out_time', 'start_time', 'end_time',
    'label_dic_diagnosis', 'isth_dic_score', 'sofa_score', 'sic_score', 'body_weight',
]

# Variables flagged for the 'minimum model', 'compact model' and 'full model'
minimum_model_features = features.loc[features['minimum model'] == 1, 'Feature'].tolist()
compact_model_features = features.loc[features['compact model'] == 1, 'Feature'].tolist()
full_model_features = features.loc[features['full model'] == 1, 'Feature'].tolist()

# Full column lists: bookkeeping columns first, model features after.
minimum_model_columns_all = [*ids_and_variables_for_statistics, *minimum_model_features]
compact_model_columns_all = [*ids_and_variables_for_statistics, *compact_model_features]
full_model_columns_all = [*ids_and_variables_for_statistics, *full_model_features]


#### 4. Stratification => Fix `train_ids` and `test_ids`
- Stratification is performed so that the proportion of data with `label=1` within a specific `time_window_index` is approximately equal between the train and test datasets.

In [None]:
def extract_df(df: pd.DataFrame, dic_flag: int, tw: int):
    """Return the rows at window `tw` of stays whose last window is `tw` and whose label there is `dic_flag`.

    A row is kept when all three conditions hold:
    the largest `time_window_index` of its `icu_stay_id` equals `tw`,
    its own `time_window_index` equals `tw`,
    and its `label_dic_diagnosis` equals `dic_flag`.
    The result carries a fresh 0..n-1 index.
    """
    # Largest time window reached by each stay, broadcast back onto every row
    last_window_of_stay = df.groupby('icu_stay_id')['time_window_index'].transform('max')
    keep = (
        (last_window_of_stay == tw)
        & (df['label_dic_diagnosis'] == dic_flag)
        & (df['time_window_index'] == tw)
    )
    return df[keep].reset_index(drop=True)

def split_on_patients(df: pd.DataFrame) -> tuple:
    """Split the distinct `icu_stay_id` values of `df` 80/20 into train and test ids.

    Parameters
    ----------
    df : pd.DataFrame
        Table with an `icu_stay_id` column; duplicated ids are counted once.

    Returns
    -------
    tuple
        `(train_ids, test_ids)`, two arrays of ids (not DataFrames). Both
        support `.tolist()`. The split is reproducible (`random_state=710`).
        With fewer than two distinct ids everything goes to the train side
        and the test side is empty.
    """
    unique_ids = df['icu_stay_id'].unique()
    if len(unique_ids) < 2:
        # `train_test_split` raises ValueError when there are no samples or when
        # the train side would end up empty (a single id with test_size=0.2).
        # Sparse (time window, label) strata can be that small, so handle them here.
        return unique_ids, unique_ids[:0]
    # Split the unique icu_stay_id into train and test sets
    train_ids, test_ids = train_test_split(unique_ids, test_size=0.2, random_state=710)
    return train_ids, test_ids

def stratification(df: pd.DataFrame) -> tuple[list, list]:
    """Assign every ICU stay to train or test, stratified by last time window and label.

    For each window index `i` from 0 to the largest `time_window_index` in `df`,
    the stays selected by `extract_df(df, 1, i)` and `extract_df(df, 0, i)` are
    split separately with `split_on_patients`, and the resulting ids are collected.

    Parameters
    ----------
    df : pd.DataFrame
        Table with `icu_stay_id`, `time_window_index` and `label_dic_diagnosis`.

    Returns
    -------
    tuple[list, list]
        `(train_ids, test_ids)` as plain Python lists of stay ids (not DataFrames).
        Both lists are empty when `df` has no rows.
    """
    train_ids: list = []
    test_ids: list = []
    if df.empty:
        # No rows means no windows to iterate; max() would be NaN here.
        return train_ids, test_ids

    # int(): `range` rejects a float, which is what max() yields for a float-typed column.
    last_window = int(df['time_window_index'].max())
    for i in range(last_window + 1):
        dic = extract_df(df, 1, i)
        non_dic = extract_df(df, 0, i)
        dic_train_ids, dic_test_ids = split_on_patients(dic)
        nondic_train_ids, nondic_test_ids = split_on_patients(non_dic)
        # Same ordering as before (DIC ids first, then non-DIC), appended in place.
        train_ids.extend(dic_train_ids.tolist())
        train_ids.extend(nondic_train_ids.tolist())
        test_ids.extend(dic_test_ids.tolist())
        test_ids.extend(nondic_test_ids.tolist())

    return train_ids, test_ids

In [209]:
# Compute the train/test stay-id lists once; the cells below filter by these ids.
train_ids,test_ids = stratification(df_dic_sampled)

In [210]:
# Build the two partitions of the sampled table so the stratification can be checked
in_train = df_dic_sampled['icu_stay_id'].isin(train_ids)
in_test = df_dic_sampled['icu_stay_id'].isin(test_ids)
train_df = df_dic_sampled[in_train]
test_df = df_dic_sampled[in_test]

def check_stratification(train_df, test_df, n_time_windows=7):
    """Print, per time window, the share of DIC-labelled rows in train and in test.

    One line is printed for each window index in `range(n_time_windows)`:
    `tw=<i> instances dic/all train: <dic>/<all>=<ratio> | test: <dic>/<all>=<ratio>`.

    Parameters
    ----------
    train_df, test_df : pd.DataFrame
        Tables with `time_window_index` and `label_dic_diagnosis` columns.
    n_time_windows : int, default 7
        Number of leading time windows to report (the former hard-coded value).

    A window without any rows is reported as `0/0=nan` instead of raising
    `ZeroDivisionError`.
    """
    def _dic_share(frame, window):
        # "<dic rows>/<all rows>=<ratio>" for one frame and one time window
        in_window = frame[frame['time_window_index'] == window]
        n_all = len(in_window)
        n_dic = int((in_window['label_dic_diagnosis'] == 1).sum())
        ratio = n_dic / n_all if n_all else float('nan')
        return f"{n_dic}/{n_all}={ratio:.4f}"

    for i in range(n_time_windows):
        print(f"tw={i} instances dic/all train: {_dic_share(train_df, i)} | test: {_dic_share(test_df, i)}")

# Print the DIC share per time window for both partitions; the ratios should be close to each other.
check_stratification(train_df,test_df)


tw=0 instances dic/all train: 300/6020=0.0498 | test: 76/1512=0.0503
tw=1 instances dic/all train: 120/4440=0.0270 | test: 30/1116=0.0269
tw=2 instances dic/all train: 68/3620=0.0188 | test: 17/911=0.0187
tw=3 instances dic/all train: 54/3012=0.0179 | test: 14/759=0.0184
tw=4 instances dic/all train: 34/2532=0.0134 | test: 9/638=0.0141
tw=5 instances dic/all train: 21/2154=0.0097 | test: 6/542=0.0111
tw=6 instances dic/all train: 13/1849=0.0070 | test: 4/464=0.0086


#### 5. Column name conversion for output

In [None]:
# Maps raw feature column names to the display names used in the output files.
# Columns that do not appear as a key here keep their original name when renamed.
column_mapping = {
    'female': 'Female',
    'height': 'Height',
    'age': 'Age',
    'time_window_index': 'Time window index',
    'infected_nervous_system': 'Nervous system infection',
    'infected_cardiovascular': 'Cardiovascular infection',
    'infected_respiratory': 'Respiratory infection',
    'infected_abdomen': 'Abdominal infection',
    'infected_urinary_tract': 'Urinary tract infection',
    'infected_soft_tissue': 'Soft tissue infection',
    'infected_other': 'Other infection',
    'charlson_comorbidity_index': 'Charlson comorbidity index',
    'congestive_heart_failure': 'Congestive heart failure',
    'chronic_pulmonary_disease': 'Chronic pulmonary disease',
    'mild_liver_disease': 'Mild liver disease',
    'severe_liver_disease': 'Severe liver disease',
    'diabetes_without_cc': 'Diabetes without chronic complication',
    'diabetes_with_cc': 'Diabetes with chronic complication',
    'renal_disease': 'Renal disease',
    'malignant_cancer': 'Malignancy',
    'metastatic_solid_tumor': 'Metastatic solid tumor',
    'ph': 'pH',
    'base_excess': 'Base excess',
    'lactate': 'Lactate',
    'glucose': 'Glucose',
    'wbc': 'WBC',
    'hemoglobin': 'Hemoglobin',
    'platelet': 'Platelet',
    'creatinine': 'Creatinine',
    'total_bilirubin': 'Total bilirubin',
    'crp': 'CRP',
    'albumin': 'Albumin',
    'aptt': 'APTT',
    'ptinr': 'PT-INR',
    'd_dimer': 'D-dimer',
    'fibrinogen': 'Fibrinogen',
    'fdp': 'FDP',
    'pfratio': 'PaO2/FiO2 ratio',
    'bt50': 'BT (median)',
    'bt90': 'BT (90 percentile)',
    'hr10': 'HR (10 percentile)',
    'hr50': 'HR (median)',
    'hr90': 'HR (90 percentile)',
    'hr_sd': 'HR (sd)',
    'rr10': 'RR (10 percentile)',
    'rr50': 'RR (median)',
    'rr90': 'RR (90 percentile)',
    'rr_sd': 'RR (sd)',
    'sbp10': 'SBP (10 percentile)',
    'sbp50': 'SBP (median)',
    'sbp90': 'SBP (90 percentile)',
    'sbp_sd': 'SBP (sd)',
    'mbp10': 'MBP (10 percentile)',
    'mbp50': 'MBP (median)',
    'mbp90': 'MBP (90 percentile)',
    'mbp_sd': 'MBP (sd)',
    'dbp10': 'DBP (10 percentile)',
    'dbp50': 'DBP (median)',
    'dbp90': 'DBP (90 percentile)',
    'dbp_sd': 'DBP (sd)',
    'spo2_10': 'SpO2 (10 percentile)',
    'spo2_50': 'SpO2 (median)',
    'spo2_90': 'SpO2 (90 percentile)',
    'spo2_sd': 'SpO2 (sd)',
    'urine_output_rate': 'Urine output (rate)',
    'infusion_rate': 'Infusion (rate)',
    'gcs_e': 'GCS (E)',
    'gcs_v': 'GCS (V)',
    'gcs_m': 'GCS (M)',
    'adrenaline': 'Adrenaline (rate)',
    'noradrenaline': 'Noradrenaline (rate)',
    'vasopressin': 'Vasopressin (rate)'
}

def update_column_names(df, mapping=None):
    """Return a copy of `df` whose columns are renamed to their display names.

    Parameters
    ----------
    df : pd.DataFrame
        Table to rename. It is NOT modified: earlier cells that still read the
        raw column names (for example `time_window_index`) keep working when
        they are re-run after this function was called.
    mapping : dict, optional
        Raw name -> display name. Defaults to the notebook-level `column_mapping`.
        Columns without an entry keep their name.

    Returns
    -------
    pd.DataFrame
        New frame with the same data and the renamed columns.
    """
    if mapping is None:
        mapping = column_mapping
    return df.rename(columns=mapping)

# Derivation table for model 1: the sampled data with display column names.
derivation_model1 = update_column_names(df_dic_sampled)

In [None]:
# Persist the derivation table under data_dir/<date>/. `index=False` is not passed,
# so the DataFrame index is written as the first (unnamed) CSV column.
derivation_model1.to_csv(os.path.join(data_dir, date, f'{date}_derivation_model1.csv'))

#### 5-1 Check the Condition of the Features

In [None]:
# Downcast every int64 column to int32, leaving the stay identifier at 64 bits.
int64_columns_1 = [name for name, dtype in derivation_model1.dtypes.items() if dtype == 'int64']
int64_columns_1.remove("icu_stay_id")
derivation_model1[int64_columns_1] = derivation_model1[int64_columns_1].astype('int32')


In [None]:
# Downcast every float64 column to float32.
float64_columns_1 = [name for name, dtype in derivation_model1.dtypes.items() if dtype == 'float64']
derivation_model1[float64_columns_1] = derivation_model1[float64_columns_1].astype('float32')

In [None]:
# Resolve fragmentation by copying: the column-wise dtype reassignments above can
# leave the frame's internal blocks fragmented, and `.copy()` rebuilds the frame.
derivation_model1 = derivation_model1.copy()

In [None]:
# Partition the derivation table by the fixed stay ids and expose the label column as `y`.
label_rename = {'label_dic_diagnosis': 'y'}
stay_ids_model1 = derivation_model1['icu_stay_id']
train_df_model1 = derivation_model1[stay_ids_model1.isin(train_ids)].rename(columns=label_rename)
test_df_model1 = derivation_model1[stay_ids_model1.isin(test_ids)].rename(columns=label_rename)

In [None]:
# Largest `y` observed within each ICU stay, broadcast onto every row of that stay
train_max_y_per_stay = train_df_model1.groupby('icu_stay_id')['y'].transform('max')
test_max_y_per_stay = test_df_model1.groupby('icu_stay_id')['y'].transform('max')

# Stays that took the value y=1 (DIC onset) at least once
dic_train = train_df_model1[train_max_y_per_stay == 1]
dic_test = test_df_model1[test_max_y_per_stay == 1]
# Stays that never took the value y=1 (no DIC onset)
nondic_train = train_df_model1[train_max_y_per_stay == 0]
nondic_test = test_df_model1[test_max_y_per_stay == 0]

# Recombine train and test rows of each kind
dic_df_model1 = pd.concat([dic_train, dic_test])
nondic_df_model1 = pd.concat([nondic_train, nondic_test])

In [None]:
# Report how many distinct stays are DIC / non-DIC in each partition
n_train_dic_ids = len(train_df_model1.loc[train_df_model1['y'] == 1, 'icu_stay_id'].unique())
n_test_dic_ids = len(test_df_model1.loc[test_df_model1['y'] == 1, 'icu_stay_id'].unique())
train_stay_max_y = train_df_model1.groupby('icu_stay_id')['y'].max()
test_stay_max_y = test_df_model1.groupby('icu_stay_id')['y'].max()
n_train_nondic_ids = len(train_stay_max_y[train_stay_max_y == 0].index)
n_test_nondic_ids = len(test_stay_max_y[test_stay_max_y == 0].index)

print("---model1 DIC---")
print("Training(ids): ", n_train_dic_ids, "  Test(ids): ", n_test_dic_ids)
print("---model1 non DIC---")
print("Training(ids): ", n_train_nondic_ids, "  Test(ids): ", n_test_nondic_ids)


---model1 DIC---
Training(ids):  610   Test(ids):  156
---model1 non DIC---
Training(ids):  5410   Test(ids):  1356


In [None]:
# Save the four model-1 tables as CSV under data_dir/<date>/ (train, test, DIC stays, non-DIC stays).
# `index=False` is not passed, so each file starts with the DataFrame index as an unnamed column.
train_df_model1.to_csv(os.path.join(data_dir, date,f'{date}_train_df_model1.csv'))
test_df_model1.to_csv(os.path.join(data_dir, date, f'{date}_test_df_model1.csv'))
dic_df_model1.to_csv(os.path.join(data_dir, date,f'{date}_dic_df_model1.csv'))
nondic_df_model1.to_csv(os.path.join(data_dir, date, f'{date}_nondic_df_model1.csv'))