In [1]:
import numpy as np
import pandas as pd
import tqdm
import argparse
import os
from ai_clinician.modeling.normalization import DataNormalization
from ai_clinician.preprocessing.utils import load_csv
from ai_clinician.preprocessing.columns import *
from ai_clinician.modeling.columns import *
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

tqdm.tqdm.pandas()

def save_data_files(dir, MIMICraw, MIMICzs, metadata):
    MIMICraw.to_csv(os.path.join(dir, "MIMICraw.csv"), index=False)
    MIMICzs.to_csv(os.path.join(dir, "MIMICzs.csv"), index=False)
    metadata.to_csv(os.path.join(dir, "metadata.csv"), index=False)


'/home/lkapral/RRT_mimic_iv'

In [3]:
def create_args():
    parser = argparse.ArgumentParser(description=(
        'Generates a train/test split of the MIMIC-IV dataset, and generates files labeled '
        '{train|test}/MIMICraw.npy and {train|test}/MIMICzs.npy.'
    ))
    parser.add_argument('input', type=str,
                        help='Data directory (should contain mimic_dataset.csv and aki_cohort.csv)')
    parser.add_argument('output', type=str,
                        help='Directory in which to output')
    parser.add_argument('--train-size', dest='train_size', type=float, default=0.7,
                        help='Proportion of data to use in training (default 0.7)')
    parser.add_argument('--outcome', dest='outcome_col', type=str, default='died_in_hosp',
                        help='Name of column to use for outcomes (probably "died_in_hosp" [default] or "morta_90")')
    
    # Simulate input arguments as if they were passed from the command line
    simulated_input = [
        '/home/lkapral/RRT_mimic_iv/data/mimic',    # Replace with your actual input directory
        '/home/lkapral/RRT_mimic_iv/data/model',   # Replace with your actual output directory
        '--train-size', '0.7',
        '--outcome', 'died_in_hosp'
    ]
    return parser.parse_args(simulated_input)

# Create args object
args = create_args()

in_dir = args.input
out_dir = args.output
if not os.path.exists(out_dir):
    os.mkdir(out_dir)

# Find sepsis cohort in the mimic dataset
mdp_data = load_csv(os.path.join(in_dir, "mimic_dataset.csv"))
aki_cohort = load_csv(os.path.join(in_dir, "aki_cohort.csv"))

print(list(mdp_data.columns))


MIMICtable = mdp_data[mdp_data[C_ICUSTAYID].isin(aki_cohort[C_ICUSTAYID])].reset_index(drop=True)
assert args.outcome_col in MIMICtable.columns, "Outcome column '{}' not found in MIMICtable".format(args.outcome_col)



# Define RRT-related columns
rrt_cols = [
    'Ultrafiltrate_Output',
    'Blood_Flow',
    'Hourly_Patient_Fluid_Removal',
    'Dialysate_Rate',
    'Hemodialysis_Output',  # Ensure the column name matches your DataFrame
    'Citrate',
    'Prefilter_Replacement_Rate',
    'Postfilter_Replacement_Rate'
]





['bloc', 'icustayid', 'timestep', 'gender', 'age', 'elixhauser', 're_admission', 'died_in_hosp', 'died_within_48h_of_out_time', 'morta_90', 'delay_end_of_record_and_discharge_or_death', 'Height_cm', 'Weight_kg', 'GCS', 'RASS', 'HR', 'SysBP', 'MeanBP', 'DiaBP', 'RR', 'SpO2', 'Temp_C', 'Temp_F', 'CVP', 'PAPsys', 'PAPmean', 'PAPdia', 'CI', 'SVR', 'Interface', 'FiO2_100', 'FiO2_1', 'O2flow', 'PEEP', 'TidalVolume', 'MinuteVentil', 'PAWmean', 'PAWpeak', 'PAWplateau', 'Respiratory_Rate', 'Ultrafiltrate_Output', 'Blood_Flow', 'Hourly_Patient_Fluid_Removal', 'Dialysate_Rate', 'APACHEII_Renal_Failure', 'Hemodialysis_Output', 'Citrate', 'Prefilter_Replacement_Rate', 'Postfilter_Replacement_Rate', 'Potassium', 'Sodium', 'Chloride', 'Glucose', 'BUN', 'Creatinine', 'Magnesium', 'Calcium', 'Ionised_Ca', 'CO2_mEqL', 'SGOT', 'SGPT', 'Total_bili', 'Direct_bili', 'Total_protein', 'Albumin', 'Troponin', 'CRP', 'Hb', 'Ht', 'RBC_count', 'WBC_count', 'Platelets_count', 'PTT', 'PT', 'ACT', 'INR', 'Arterial_pH

In [4]:
# Create 'action' column
rrt_actions = (~MIMICtable[rrt_cols].isna() & (MIMICtable[rrt_cols] != 0)).any(axis=1)
MIMICtable['action'] = rrt_actions.astype(int)

# Actions array
actions = MIMICtable['action'].values

np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [5]:
pd.set_option('display.max_columns', None)

In [6]:
icu_stays = pd.read_csv('/home/lkapral/RRT_mimic_iv/data/icustays.csv')
icd_diagnoses = pd.read_csv('/home/lkapral/RRT_mimic_iv/data/d_icd_diagnoses.csv')
diagnose_icd = pd.read_csv('/home/lkapral/RRT_mimic_iv/data/diagnoses_icd.csv')
exclude_idc = pd.read_csv('/home/lkapral/RRT_mimic_iv/data/exclusion.csv')

exclude_list = exclude_idc['icd_code'].to_list()

merged_df = MIMICtable.merge(icu_stays[['stay_id', 'subject_id']], left_on='icustayid', right_on='stay_id', how='left')

# Filter rows where icd_code is in exclude_list
excluded_rows = diagnose_icd[diagnose_icd['icd_code'].isin(exclude_list)]
# Get the unique hadm_id values from these rows
excluded_subject_ids = excluded_rows['subject_id'].unique().tolist()

print('Number of Patients with kidney issues. ', len(excluded_subject_ids))

print('Number of patients before exclusion:', len(merged_df['icustayid'].unique()))

print('Number of patients with RRT before exclusion:' , len(merged_df[merged_df['action']>0]['icustayid'].unique()))

merged_df = merged_df[~merged_df['subject_id'].isin(excluded_subject_ids)]

print('Number of patients after exclusion:', len(merged_df['icustayid'].unique()))

print('Number of patients with RRT after exclusion:' , len(merged_df[merged_df['action']>0]['icustayid'].unique()))

merged_df.drop(columns=['stay_id', 'subject_id'], inplace=True)

MIMICtable = merged_df

Number of Patients with kidney issues.  5055
Number of patients before exclusion: 59851
Number of patients with RRT before exclusion: 4002
Number of patients after exclusion: 54859
Number of patients with RRT after exclusion: 2055


In [7]:
MIMICtable

Unnamed: 0,bloc,icustayid,timestep,gender,age,elixhauser,re_admission,died_in_hosp,died_within_48h_of_out_time,morta_90,delay_end_of_record_and_discharge_or_death,Height_cm,Weight_kg,GCS,RASS,HR,SysBP,MeanBP,DiaBP,RR,SpO2,Temp_C,Temp_F,CVP,PAPsys,PAPmean,PAPdia,CI,SVR,Interface,FiO2_100,FiO2_1,O2flow,PEEP,TidalVolume,MinuteVentil,PAWmean,PAWpeak,PAWplateau,Respiratory_Rate,Ultrafiltrate_Output,Blood_Flow,Hourly_Patient_Fluid_Removal,Dialysate_Rate,APACHEII_Renal_Failure,Hemodialysis_Output,Citrate,Prefilter_Replacement_Rate,Postfilter_Replacement_Rate,Potassium,Sodium,Chloride,Glucose,BUN,Creatinine,Magnesium,Calcium,Ionised_Ca,CO2_mEqL,SGOT,SGPT,Total_bili,Direct_bili,Total_protein,Albumin,Troponin,CRP,Hb,Ht,RBC_count,WBC_count,Platelets_count,PTT,PT,ACT,INR,Arterial_pH,paO2,paCO2,Arterial_BE,Arterial_lactate,HCO3,ETCO2,SvO2,Anion_Gap,Ammonia,Fibrinogen,Absolute_Neutrophil_Count,Phosphorous,SaO2,Triglyceride,ScvO2,LDH,CK_MB,BNP,Iron,Thyroid_Stimulating_Hormone,Creatinine_Urine,Potassium_Urine,Sodium_Urine,Urea_Nitrogen_Urine,Creatinine_Clearance,T3,Gamma_Glutamyltransferase,Myoglobin,Heparin_LMW,Osmolality_Urine,Insulin,mechvent,extubated,input_total,input_step,output_total,output_step,cumulated_balance,median_dose_vaso,max_dose_vaso,Shock_Index,PaO2_FiO2,SOFA,SIRS,action
0,1,30000153,6461579640,0,61,1,False,0,,0,260.167,,70.0,15.0,2.2,60.4000,119.0000,79.0000,59.0000,14.2000,99.4000,36.9555,98.0600,12.8000,,,,,,0.0,24.0,0.24,4.00000,5.0,500.00,6.660,6.40,11.00,10.6000,22.0000,,,,,,,,,,4.00000,138.0,101.0,110.800,41.0000,3.900000,2.30000,10.0,1.10000,21.0,319.0000,363.0000,13.4,5.40000,,3.6,0.160,,9.67296,28.9000,3.22,18.00000,91.0,28.9000,13.9000,,1.30,7.360,100.0,38.0,-4.0,1.50000,21.0,,,16.0000,,167.0,,3.80000,,,,194.0,10.0,,,,,,,,,,,,,,,0,,5136.69,0.0,2180.0,0.0,2956.69,0.0,0.0,0.507563,416.6670,9,1,0
1,1,30000484,5239603260,0,92,4,True,0,False,1,180.000,163.0,68.5,15.0,0.0,77.6667,124.6000,76.8000,52.9000,12.5000,94.0000,37.0000,98.6000,13.7143,,,,,,2.0,32.0,0.32,3.00000,10.0,1053.00,11.700,12.00,16.00,22.0000,19.7333,,,,,,,,,,4.00000,140.0,105.0,104.000,17.0000,0.600000,2.30000,8.6,1.13000,34.0,44.0000,37.0000,0.4,0.10216,,3.0,0.090,,11.30000,34.2000,3.60,14.20000,34.0,94.1846,14.8000,,1.30,7.366,136.8,36.8,-3.0,1.02000,22.0,,,13.0000,,178.0,,2.90000,,,,819.0,2.0,,,,,,,,,,,,,,,0,,0.00,0.0,0.0,0.0,0.00,0.0,0.0,0.623328,427.5000,3,1,0
2,2,30000484,5239617660,0,92,4,True,0,False,1,180.000,163.0,68.5,14.0,-1.0,99.6364,99.3000,74.6000,62.2500,18.5455,100.0000,35.5556,96.0000,10.7143,,,,,,2.0,36.0,0.36,4.00000,5.0,412.25,8.100,7.00,10.00,16.6667,14.0000,,,,,,,,,,4.00000,145.0,100.0,82.000,13.0000,0.400000,1.80000,8.5,1.07000,28.0,17.0000,136.0000,1.6,0.93424,,2.5,0.180,56.5,10.20000,32.5000,2.67,10.40000,92.0,70.8667,14.8000,,1.30,7.530,19.0,57.0,-3.0,1.20000,25.0,,,9.0000,,520.0,,2.60000,,,,226.0,1.0,,,,,,,,,,,,,,,0,,250.00,250.0,360.0,360.0,-110.00,0.0,0.0,1.003390,52.7778,8,2,0
3,3,30000484,5239632060,0,92,4,True,0,False,1,180.000,163.0,68.5,15.0,0.0,86.0000,99.7143,63.1429,44.8571,15.0000,99.7143,35.5556,96.0000,7.7500,,,,,,2.0,36.0,0.36,4.00000,5.0,450.00,7.925,9.25,25.25,20.0000,16.0000,,,,,,,,,,4.30000,140.0,106.0,188.000,44.0000,1.800000,2.00000,7.3,1.10000,43.0,26.0000,379.0000,1.1,0.58754,,2.5,0.180,56.5,8.80000,42.5000,2.94,26.90000,128.0,35.4000,14.8000,,1.30,7.500,21.0,59.0,13.0,2.00000,21.0,,,17.0000,,127.5,,3.30000,,,,396.0,2.0,,,,,,,,,,,,,,,0,,750.00,500.0,460.0,100.0,290.00,0.0,0.0,0.862464,58.3333,8,2,0
4,4,30000484,5239646460,0,92,4,True,0,False,1,180.000,163.0,68.5,15.0,0.0,85.3333,89.6667,58.0000,42.1667,12.8333,99.6667,35.7037,96.2667,8.0000,,,,,,2.0,36.0,0.36,4.00000,5.0,635.00,8.500,6.00,11.00,24.8750,12.0000,,,,,,,,,,5.40000,136.0,104.0,94.000,47.0000,1.200000,2.30000,7.8,1.15000,29.0,50.0000,32.3333,0.3,0.03282,,2.5,0.205,56.5,8.10000,24.6000,2.80,24.20000,357.0,36.1000,15.5000,,1.35,7.470,21.0,59.0,12.0,1.93333,27.0,,,10.0000,,842.0,,1.90000,,,,419.0,75.0,,,,,,,,,,,,,,,0,,1250.00,500.0,570.0,110.0,680.00,0.0,0.0,0.951672,58.3333,7,2,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1231867,20,39999858,6226980480,0,62,4,False,0,,0,248.083,,123.8,15.0,0.0,72.5714,116.5710,72.1429,49.9286,32.5714,91.0000,36.8889,98.4000,,,,,,,2.0,40.0,0.40,7.14286,4.0,350.00,6.400,5.00,14.00,27.6000,18.0000,,,,,,,,,,3.80000,136.0,100.0,222.000,17.0000,0.600000,1.90000,8.9,1.20000,28.0,40.0000,45.0000,0.8,0.37952,,3.3,0.630,,12.00000,36.7000,4.02,8.00000,185.0,33.2000,13.8000,,1.30,7.470,178.3,66.0,5.6,0.90000,21.0,38.0,,10.0000,,,,3.80000,,,,433.0,4.0,,,,,,,,,,,,0.21,,,0,,580.00,250.0,7525.0,900.0,-6945.00,0.0,0.0,0.622551,445.7500,0,1,0
1231868,21,39999858,6226994880,0,62,4,False,0,,0,248.083,,123.8,15.0,0.0,66.2000,112.2500,69.5000,48.1250,27.6000,92.0000,36.8222,98.2800,,,,,,,2.0,40.0,0.40,6.00000,4.0,350.00,6.400,5.00,14.00,20.0000,18.0000,,,,,,,,,,3.80000,136.0,100.0,191.000,17.0000,0.600000,1.90000,8.9,1.20000,28.0,40.0000,45.0000,0.8,0.37952,,3.3,0.630,,12.00000,36.7000,4.02,8.00000,185.0,33.2000,13.8000,,1.30,7.470,178.3,66.0,5.6,0.90000,26.0,38.0,,10.0000,,,,3.80000,,,,433.0,3.0,,,,,,,,,,,,,,,0,,580.00,0.0,7875.0,350.0,-7295.00,0.0,0.0,0.589755,445.7500,1,1,0
1231869,22,39999858,6227009280,0,62,4,False,0,,0,248.083,,123.8,15.0,0.0,55.8571,110.7140,69.2857,48.5714,20.7143,94.2857,36.7461,98.1429,,,,,,,2.0,40.0,0.40,6.00000,4.0,350.00,6.400,5.00,14.00,21.5294,18.0000,,,,,,,,,,3.80000,136.0,100.0,168.857,17.0000,0.600000,1.90000,8.9,1.20000,28.0,40.0000,45.0000,0.8,0.37952,,3.3,0.630,,12.00000,36.7000,4.02,8.00000,185.0,33.2000,13.8000,,1.30,7.370,178.3,66.0,5.6,0.90000,26.0,38.0,,10.0000,,,,3.80000,,,,433.0,3.0,,,,,,,,,,,,,,,0,,580.00,0.0,7875.0,0.0,-7295.00,0.0,0.0,0.504517,445.7500,1,1,0
1231870,23,39999858,6227023680,0,62,4,False,0,,0,248.083,,123.8,15.0,0.0,62.1667,117.4000,79.0000,59.8000,24.1667,93.3333,36.7223,98.1000,,,,,,,2.0,44.0,0.44,6.00000,4.0,350.00,6.400,5.00,14.00,19.0000,18.0000,,,,,,,,,,3.73333,134.0,97.0,148.000,17.6667,0.633333,1.86667,8.9,1.04000,28.0,38.3333,45.0000,0.8,0.37952,,3.3,0.010,,12.70000,37.2667,4.02,8.16667,195.0,31.4000,13.9667,,1.30,7.460,178.3,66.0,5.6,0.90000,26.0,43.0,,10.3333,,,,3.63333,,,,420.0,5.0,,,,,,,,,,,,,,,0,,580.00,0.0,7875.0,0.0,-7295.00,0.0,0.0,0.529529,405.2270,0,1,0


In [8]:


import pandas as pd

# Assuming MIMICtable is your original DataFrame

# 1. Store the original data types
original_dtypes = MIMICtable.dtypes.to_dict()

# 2. Create a 'day' column by dividing 'bloc' by 3 (ensure integer division if needed)
MIMICtable['day'] = MIMICtable['bloc'] // 3

# 3. Define the columns for different aggregation functions
sum_cols = [
    'input_total', 'input_step', 'output_total', 'output_step',
    'cumulated_balance', 'median_dose_vaso', 'max_dose_vaso'
]

max_cols = ['mechvent', 'extubated', 'action']

first_cols = ['gender', 'age', 'elixhauser', 're_admission', 'Height_cm', 'Weight_kg']

# 4. Identify columns to average (ensure they are numeric)
excluded_cols = set(sum_cols + max_cols + first_cols + ['icustayid', 'timestep', 'bloc', 'day'])
mean_cols = [col for col in MIMICtable.columns if col not in excluded_cols]

# 5. Create the aggregation dictionary
agg_dict = {col: 'sum' for col in sum_cols}
agg_dict.update({col: 'max' for col in max_cols})
agg_dict.update({col: 'first' for col in first_cols})
agg_dict.update({col: 'mean' for col in mean_cols})

# 6. Perform the groupby aggregation
MIMICtable_agg = MIMICtable.groupby(['icustayid', 'day']).agg(agg_dict).reset_index()

# 7. Restore the original data types
for col in MIMICtable_agg.columns:
    if col in original_dtypes:
        original_dtype = original_dtypes[col]
        try:
            # Handle integer division results if necessary
            if pd.api.types.is_integer_dtype(original_dtype):
                MIMICtable_agg[col] = MIMICtable_agg[col].round().astype(original_dtype)
            else:
                MIMICtable_agg[col] = MIMICtable_agg[col].astype(original_dtype)
        except (ValueError, TypeError):
            # If casting fails, you can choose to keep the aggregated type or handle it differently
            print(f"Warning: Could not convert column '{col}' to {original_dtype}. Keeping the aggregated type.")

# Optional: Verify that data types are preserved
print(MIMICtable_agg.dtypes)
MIMICtable_agg['bloc'] = MIMICtable_agg['day'] +1
MIMICtable_agg.drop(columns=['day'], inplace=True)

MIMICtable = MIMICtable_agg


icustayid         int64
day               int64
input_total     float64
input_step      float64
output_total    float64
                 ...   
Insulin         float64
Shock_Index     float64
PaO2_FiO2       float64
SOFA              int64
SIRS              int64
Length: 121, dtype: object


  MIMICtable_agg['bloc'] = MIMICtable_agg['day'] +1


In [9]:
MIMICtable

Unnamed: 0,icustayid,input_total,input_step,output_total,output_step,cumulated_balance,median_dose_vaso,max_dose_vaso,mechvent,extubated,action,gender,age,elixhauser,re_admission,Height_cm,Weight_kg,died_in_hosp,died_within_48h_of_out_time,morta_90,delay_end_of_record_and_discharge_or_death,GCS,RASS,HR,SysBP,MeanBP,DiaBP,RR,SpO2,Temp_C,Temp_F,CVP,PAPsys,PAPmean,PAPdia,CI,SVR,Interface,FiO2_100,FiO2_1,O2flow,PEEP,TidalVolume,MinuteVentil,PAWmean,PAWpeak,PAWplateau,Respiratory_Rate,Ultrafiltrate_Output,Blood_Flow,Hourly_Patient_Fluid_Removal,Dialysate_Rate,APACHEII_Renal_Failure,Hemodialysis_Output,Citrate,Prefilter_Replacement_Rate,Postfilter_Replacement_Rate,Potassium,Sodium,Chloride,Glucose,BUN,Creatinine,Magnesium,Calcium,Ionised_Ca,CO2_mEqL,SGOT,SGPT,Total_bili,Direct_bili,Total_protein,Albumin,Troponin,CRP,Hb,Ht,RBC_count,WBC_count,Platelets_count,PTT,PT,ACT,INR,Arterial_pH,paO2,paCO2,Arterial_BE,Arterial_lactate,HCO3,ETCO2,SvO2,Anion_Gap,Ammonia,Fibrinogen,Absolute_Neutrophil_Count,Phosphorous,SaO2,Triglyceride,ScvO2,LDH,CK_MB,BNP,Iron,Thyroid_Stimulating_Hormone,Creatinine_Urine,Potassium_Urine,Sodium_Urine,Urea_Nitrogen_Urine,Creatinine_Clearance,T3,Gamma_Glutamyltransferase,Myoglobin,Heparin_LMW,Osmolality_Urine,Insulin,Shock_Index,PaO2_FiO2,SOFA,SIRS,bloc
0,30000153,5136.69,0.00,2180.0,0.0,2956.69,0.000,0.000,0,,0,0,61,1,False,,70.0,0,,0,260.167,15.000000,2.200000,60.400000,119.000000,79.000000,59.000000,14.200000,99.400000,36.955500,98.060000,12.800000,,,,,,0.0,24.000000,0.240000,4.000000,5.000000,500.000000,6.660000,6.400000,11.000000,10.600000,22.000000,,,,,,,,,,4.000000,138.000000,101.000000,110.800000,41.000000,3.900000,2.30000,10.000000,1.100000,21.000000,319.000000,363.000000,13.400000,5.400000,,3.60,0.160000,,9.672960,28.900000,3.220000,18.000000,91.000000,28.900000,13.900000,,1.30000,7.360000,100.000000,38.000000,-4.000000,1.500000,21.000000,,,16.000000,,167.000000,,3.800000,,,,194.000000,10.000000,,,,,,,,,,,,,,,0.507563,416.667000,9,1,1
1,30000484,250.00,250.00,360.0,360.0,-110.00,0.000,0.000,0,,0,0,92,4,True,163.0,68.5,0,0.0,1,180.000,14.500000,-0.500000,88.651550,111.950000,75.700000,57.575000,15.522750,97.000000,36.277800,97.300000,12.214300,,,,,,2.0,34.000000,0.340000,3.500000,7.500000,732.625000,9.900000,9.500000,13.000000,19.333350,16.866650,,,,,,,,,,4.000000,142.500000,102.500000,93.000000,15.000000,0.500000,2.05000,8.550000,1.100000,31.000000,30.500000,86.500000,1.000000,0.518200,,2.75,0.135000,56.5,10.750000,33.350000,3.135000,12.300000,63.000000,82.525650,14.800000,,1.30000,7.448000,77.900000,46.900000,-3.000000,1.110000,23.500000,,,11.000000,,349.000000,,2.750000,,,,522.500000,1.500000,,,,,,,,,,,,,,,0.813359,240.138900,6,2,1
2,30000484,3754.85,1504.85,1710.0,320.0,2044.85,0.000,0.000,0,,0,0,92,4,True,163.0,68.5,0,0.0,1,180.000,15.000000,0.000000,86.027767,99.210333,60.130967,40.591267,14.277767,99.793667,35.688300,96.238900,7.916667,,,,,,2.0,36.000000,0.360000,4.000000,5.000000,495.000000,7.275000,8.666667,24.333333,25.041667,13.333333,,,,,,,,,,5.033333,137.333333,104.666667,125.333333,46.000000,1.400000,2.20000,7.633333,1.133333,33.666667,42.000000,147.888867,0.566667,0.217727,,2.50,0.205000,56.5,8.333333,30.566667,2.748890,25.100000,280.666667,35.866667,15.500000,,1.35000,7.480000,21.000000,59.000000,10.000000,1.844443,25.000000,,,12.333333,,438.500000,,2.366667,,,,411.333333,26.000000,,,,,,,,,,,,,,,0.871841,58.333300,7,2,2
3,30000484,6488.55,678.00,3440.0,620.0,3048.55,0.125,0.150,0,,0,0,92,4,True,163.0,68.5,0,0.0,1,180.000,15.000000,0.000000,92.033333,116.289000,67.250000,46.063900,15.327767,99.800000,36.390700,97.503333,6.666667,,,,,,2.0,36.000000,0.360000,4.000000,5.000000,412.166667,7.337500,8.000000,20.916667,20.416667,15.166667,,,,,,,,,,5.233333,136.000000,104.000000,94.000000,47.000000,1.200000,2.30000,7.800000,1.216667,29.000000,50.000000,32.333300,0.300000,0.032820,,2.50,0.230000,56.5,8.100000,24.600000,3.082223,24.200000,357.000000,36.100000,16.200000,,1.40000,7.456667,21.000000,59.000000,2.333333,1.533333,27.000000,,,10.000000,,676.666667,,1.900000,,,,419.000000,11.000000,,,,,,,,,,,,,,,0.794110,58.333300,6,2,3
4,30000484,7583.55,95.00,5230.0,610.0,2353.55,0.048,0.048,0,,0,0,92,4,True,163.0,68.5,0,0.0,1,180.000,14.714300,-0.285714,96.530567,126.625000,82.905567,61.045833,20.269433,100.000000,36.608300,97.895000,7.533333,,,,,,2.0,36.000000,0.360000,4.000000,6.000000,437.733333,8.647777,7.666667,12.622233,16.000000,19.000000,,,,,,,,,,4.811110,137.333333,104.666667,108.074000,43.666667,1.255557,2.30000,8.077777,1.113333,31.000000,38.333333,20.666667,0.355556,0.071342,5.2,2.70,0.213333,56.5,8.700000,25.822233,3.313333,21.533333,343.666667,39.155567,16.088900,,1.40000,7.463333,136.000000,37.000000,4.333333,1.766667,25.666667,,,10.000000,,842.000000,,2.011110,,,,308.444333,7.666667,,,,,,,,,,,,,,,0.762922,377.778000,2,3,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
418142,39999858,330.00,290.00,12550.0,1900.0,-12220.00,0.000,0.000,0,,0,0,62,4,False,,123.8,0,,0,248.083,15.000000,0.000000,69.842867,115.781000,76.980967,57.580967,22.314267,95.095233,36.994733,98.590500,,,,,,,2.0,54.261900,0.542619,40.000000,8.098033,359.533333,7.354510,10.768633,19.000000,20.000000,20.000000,,,,,,,,,,3.700000,138.000000,101.000000,158.676333,19.000000,0.600000,1.90000,9.100000,1.200000,28.000000,41.000000,47.000000,0.500000,0.171500,,3.30,0.216667,,12.400000,37.600000,4.150000,7.100000,170.000000,28.700000,12.900000,,1.20000,7.426667,190.100000,50.000000,3.866667,2.233333,25.333333,42.333333,,12.000000,,,,4.000000,,,,373.000000,4.333333,,,,,,,,,,,,0.21,,,0.603307,356.835333,1,1,5
418143,39999858,870.00,0.00,17025.0,675.0,-16155.00,0.000,0.000,0,,0,0,62,4,False,,123.8,0,,0,248.083,14.689467,-0.310541,61.084433,115.248667,75.798733,56.073800,23.205133,92.512833,36.805400,98.249667,,,,,,,2.0,39.102567,0.391026,32.424233,8.000000,328.600000,8.840000,12.600000,19.000000,17.676467,25.000000,,,,,,,,,,3.711110,137.333333,100.666667,203.889000,18.777767,0.600000,1.90000,9.077777,1.168890,28.000000,40.888900,46.777767,0.533333,0.194613,,3.30,0.630000,,12.266667,37.500000,4.135557,7.200000,171.666667,29.200000,13.000000,,1.21111,7.456667,163.866667,55.333333,2.400000,0.900000,25.333333,44.000000,,11.777767,,,,3.977777,,,,379.666667,3.000000,,,,,,,,,,,,0.21,,,0.533346,420.389000,1,1,6
418144,39999858,1200.00,290.00,20075.0,1600.0,-18875.00,0.000,0.000,0,,0,0,62,4,False,,123.8,0,,0,248.083,15.000000,0.000000,72.248667,118.121333,75.365100,53.986800,27.571433,92.904767,36.866833,98.360333,,,,,,,2.0,40.000000,0.400000,19.285720,7.000000,372.333333,6.641177,9.066667,16.254900,23.552933,18.000000,,,,,,,,,,3.800000,136.000000,100.000000,175.767000,17.000000,0.600000,1.90000,8.900000,1.161667,28.666667,40.000000,45.000000,0.800000,0.379520,,3.30,0.423333,,12.000000,36.700000,4.020000,8.000000,185.000000,33.200000,13.800000,,1.30000,7.470000,149.533333,57.000000,3.733333,1.100000,24.333333,38.000000,,10.000000,,,,3.800000,,,,433.000000,4.000000,,,,,,,,,,,,0.21,,,0.612765,373.833333,1,1,7
418145,39999858,1740.00,0.00,23625.0,350.0,-21885.00,0.000,0.000,0,,0,0,62,4,False,,123.8,0,,0,248.083,15.000000,0.000000,61.407933,113.454667,72.595233,52.165467,24.160333,93.206333,36.763533,98.174300,,,,,,,2.0,41.333333,0.413333,6.000000,4.000000,350.000000,6.400000,5.000000,14.000000,20.176467,18.000000,,,,,,,,,,3.777777,135.333333,99.000000,169.285667,17.222233,0.611111,1.88889,8.900000,1.146667,28.000000,39.444433,45.000000,0.800000,0.379520,,3.30,0.423333,,12.233333,36.888900,4.020000,8.055557,188.333333,32.600000,13.855567,,1.30000,7.433333,178.300000,66.000000,5.600000,0.900000,26.000000,39.666667,,10.111100,,,,3.744443,,,,428.666667,3.666667,,,,,,,,,,,,,,,0.541267,432.242333,1,1,8


In [10]:
patient_day_counts = MIMICtable.groupby('icustayid')['bloc'].nunique().reset_index()
patient_day_counts.rename(columns={'bloc': 'num_blocs'}, inplace=True)

patients_with_2_days = patient_day_counts[patient_day_counts['num_blocs'] >= 2]['icustayid']
# Step 5: Filter the aggregated data
MIMICtable = MIMICtable[MIMICtable['icustayid'].isin(patients_with_2_days)].reset_index(drop=True)


In [11]:
MIMICtable.to_parquet(os.path.join(in_dir, "MIMIC_action.parquet"))

In [12]:
comorb = pd.read_csv('/home/lkapral/RRT_mimic_iv/data/comorbidities.csv')

In [13]:
feature_name_mapping = {
    'output_step': '12-hour total output, mL',
    'SOFA': 'SOFA score',
    'cumulated_balance': 'Cumulative balance, mL',
    'Creatinine': 'Creatinine, mg/dL',
    'Platelets_count': 'Platelet count, ×10^3/µL',
    'Chloride': 'Chloride, mEq/L',
    'BUN': 'BUN, mg/dL',
    'Anion_Gap': 'Anion gap, mEq/L',
    'Calcium': 'Calcium, mg/dL',
    'input_total': 'Total input, mL',
    'WBC_count': 'WBC count, ×10^3/µL',
    'Total_bili': 'Total bilirubin, mg/dL',
    'Phosphorous': 'Phosphorus, mg/dL',
    'O2flow': 'O2 flow, L/min',
    'output_total': 'Total output, mL',
    'Weight_kg': 'Weight, kg',
    'RASS': 'RASS score',
    'Sodium': 'Sodium, mEq/L',
    'Temp_C': 'Temperature, °C',
    'age': 'Age, years',
    'max_dose_vaso': 'Maximum vasopressor dose, µg/kg/min',
    'PAWmean': 'Mean airway pressure, cmH2O',
    'GCS': 'GCS score',
    'SGOT': 'AST (SGOT), U/L',
    'PT': 'PT, s',
    'PTT': 'PTT, s',
    'RBC_count': 'RBC count, ×10^6/µL',
    'LDH': 'LDH, U/L',
    'Ht': 'Hematocrit, %',
    'RR': 'Respiratory rate, breaths/min',
    'HCO3': 'Bicarbonate, mEq/L',
    'SpO2': 'SpO2, %',
    'Ionised_Ca': 'Ionized calcium, mmol/L',
    'Hb': 'Hemoglobin, g/dL',
    'FiO2_1': 'FiO2, %',
    'SGPT': 'ALT (SGPT), U/L',
    'Shock_Index': 'Shock index',
    'Glucose': 'Glucose, mg/dL',
    'HR': 'Heart rate, beats/min',
    'MinuteVentil': 'Minute ventilation, L/min',
    'MeanBP': 'Mean blood pressure, mmHg',
    'INR': 'INR',
    'Potassium': 'Potassium, mEq/L',
    'Fibrinogen': 'Fibrinogen, mg/dL',
    'Arterial_pH': 'Arterial pH',
    'PaO2_FiO2': 'PaO2/FiO2 ratio',
    'TidalVolume': 'Tidal volume, mL',
    'paO2': 'PaO2, mmHg',
    'Albumin': 'Albumin, g/dL',
    'DiaBP': 'Diastolic blood pressure, mmHg',
    'input_step': '12-hour total input, mL',
    'Magnesium': 'Magnesium, mg/dL',
    'SysBP': 'Systolic blood pressure, mmHg',
    'PAWpeak': 'Peak airway pressure, cmH2O',
    'extubated': 'Extubated (yes/no)',
    'Arterial_BE': 'Arterial base excess, mEq/L',
    'PAWplateau': 'Plateau airway pressure, cmH2O',
    'Height_cm': 'Height, cm',
    'CVP': 'cCntral venous pressure, mmHg',
    'paCO2': 'PaCO2, mmHg',
    'Arterial_lactate': 'Arterial lactate, mmol/L',
    'PEEP': 'PEEP, cmH2O',
    'CK_MB': 'CK-MB, ng/mL',
    'ETCO2': 'End-tidal CO2, mmHg',
    'Troponin': 'Troponin, ng/mL',
    'mechvent': 'Mechanical ventilation (yes/no)',
    'Absolute_Neutrophil_Count': 'Absolute neutrophil count, ×10^3/µL',
    'SIRS': 'SIRS criteria',
    'SaO2': 'SaO2, %',
    'Triglyceride': 'Triglycerides, mg/dL',
    'SvO2': 'SvO2, %',
    'PAPsys': 'Pulmonary artery systolic pressure, mmHg',
    'PAPdia': 'Pulmonary artery diastolic pressure, mmHg',
    're_admission': 're-admission (yes/no)',
    'PAPmean': 'Mean pulmonary artery pressure, mmHg',
    'Creatinine_Urine': 'Urine creatinine, mg/dL',
    'gender': 'gender (M/F)',
    'BNP': 'BNP, pg/mL',
    'CRP': 'CRP, mg/L',
    'Urea_Nitrogen_Urine': 'Urine urea nitrogen, mg/dL',
    'Sodium_Urine': 'Urine sodium, mEq/L',
    'Potassium_Urine': 'Urine potassium, mEq/L',
    'Iron': 'Iron, µg/dL',
    'Ammonia': 'Ammonia, µg/dL',
    'Thyroid_Stimulating_Hormone': 'TSH, mIU/L',
    'Total_protein': 'Total protein, g/dL',
    'CI': 'Cardiac index, L/min/m²',
    'ACT': 'ACT, s',
    'T3': 'T3, ng/dL',
    'Gamma_Glutamyltransferase': 'GGT, U/L',
    'Heparin_LMW': 'Low molecular weight heparin (yes/no)',
    'APACHEII_Renal_Failure': 'APACHE II renal failure score',
    'Osmolality_Urine': 'Urine osmolality, mOsm/kg'
}

In [14]:
comorb.columns

Index(['subject_id', 'hadm_id', 'icustayid', 'congestive_heart_failure',
       'cardiac_arrhythmias', 'valvular_disease', 'pulmonary_circulation',
       'peripheral_vascular', 'hypertension', 'paralysis',
       'other_neurological', 'chronic_pulmonary', 'diabetes_uncomplicated',
       'diabetes_complicated', 'hypothyroidism', 'renal_failure',
       'liver_disease', 'peptic_ulcer', 'aids', 'lymphoma',
       'metastatic_cancer', 'solid_tumor', 'rheumatoid_arthritis',
       'coagulopathy', 'obesity', 'weight_loss', 'fluid_electrolyte',
       'blood_loss_anemia', 'deficiency_anemias', 'alcohol_abuse',
       'drug_abuse', 'psychoses', 'depression'],
      dtype='object')

In [15]:
only_AI =  pd.read_csv('/home/lkapral/RRT_mimic_iv/data/model/mimic_clclinician_no_rrt_ai_rrt.csv')

only_clin = pd.read_csv('/home/lkapral/RRT_mimic_iv/data/model/mimic_clinician_rrt_ai_no_rrt.csv')

no_RRT = pd.read_csv('/home/lkapral/RRT_mimic_iv/data/model/mimic_clclinician_no_rrt_ai_no_rrt.csv')

RRT = pd.read_csv('/home/lkapral/RRT_mimic_iv/data/model/mimic_clclinician_rrt_ai_rrt.csv')

In [16]:
import pandas as pd

# Merge the DataFrames on 'icustayid' using a left join
merged_df = pd.merge(MIMICtable, comorb, on='icustayid', how='left')

# Define the cancer-related columns
cancer_columns = ['metastatic_cancer', 'solid_tumor', 'lymphoma']

# Identify which cancer columns are present in the merged DataFrame
existing_cancer_columns = [col for col in cancer_columns if col in merged_df.columns]

if existing_cancer_columns:
    # Create the 'Cancer' column: 1 if any cancer column is 1, else 0
    merged_df['Cancer'] = merged_df[existing_cancer_columns].any(axis=1).astype(int)
    
    # Drop the original cancer-related columns
    merged_df = merged_df.drop(existing_cancer_columns, axis=1)
else:
    # If none of the cancer columns are present, set 'Cancer' to 0
    merged_df['Cancer'] = 0

# Now, merged_df contains the 'Cancer' column with values 0 or 1



In [17]:
diabetes_columns = ['diabetes_uncomplicated', 'diabetes_complicated']
existing_diabetes_columns = [col for col in diabetes_columns if col in merged_df.columns]

if existing_diabetes_columns:
    # Sum the diabetes-related columns
    merged_df['diabetes'] = merged_df[existing_diabetes_columns].sum(axis=1)
    
    # Convert to binary: 1 if sum > 0, else 0
    merged_df['diabetes'] = (merged_df['diabetes'] > 0).astype(int)
    
    # Drop the original diabetes-related columns
    merged_df = merged_df.drop(existing_diabetes_columns, axis=1)
else:
    merged_df['diabetes'] = 0


In [18]:
# Define groups based on ICUstayid
all_ids = merged_df['icustayid'].unique()
RRT_ids = RRT['icustayid'].unique()
no_RRT_ids = no_RRT['icustayid'].unique()
only_clin_ids = only_clin['icustayid'].unique()
only_AI_ids = only_AI['icustayid'].unique()

# Create a dictionary for easy access with the desired order
groups = {
    'All': all_ids,
    'Both RRT': RRT_ids,
    'Neither RRT': no_RRT_ids,
    'Clinician-Only RRT': only_clin_ids,
    'AI-Only RRT': only_AI_ids
}


In [19]:
len(RRT_ids)

422

In [20]:
def calculate_unique_icus(df):
    return df['icustayid'].nunique()

def calculate_unique_admissions(df):
    return df['subject_id'].nunique()  # Assuming 'subject_id' represents unique admissions

def calculate_age_stats(df):
    df_unique = df.drop_duplicates(subset='icustayid')
    mean_age = df_unique['age'].mean()
    std_age = df_unique['age'].std()
    return f"{mean_age:.1f} ({std_age:.1f})" if not np.isnan(mean_age) else "N/A"

def calculate_female_gender_percentage(df):
    # Assuming 0 = Male, 1 = Female
    
    df_unique = df.drop_duplicates(subset='icustayid')
    total = len(df_unique)
    female_count = df_unique['gender'].sum()  # Since 'gender' is 1 for female
    female_pct = (female_count / total * 100) if total > 0 else 0
    return f"{int(female_count)} ({female_pct:.1f}%)"

def calculate_comorbidities(df, comorb_cols):
    # Drop duplicates based on 'icustayid' to ensure each ICU stay is counted once
    df_unique = df.drop_duplicates(subset='icustayid')
    stats = {}
    for col in comorb_cols:
        count = df_unique[col].sum()
        total = len(df_unique)
        # Format: "count (percentage%)"
        stats[col.replace('_', ' ').capitalize()] = f"{int(count)} ({count/total*100:.1f}%)"
    return stats

def calculate_primary_diagnosis(df):
    # 'Renal Replacement Therapy (RRT)' is represented by 'action' column
    # Drop duplicates to ensure each ICU stay is counted once
    df_unique = df.drop_duplicates(subset='icustayid')
    rrt_count = df_unique['action'].sum()
    total = len(df_unique)
    rrt_percentage = (rrt_count / total * 100) if total > 0 else 0
    return {'Renal replacement therapy (RRT)': f"{int(rrt_count)} ({rrt_percentage:.1f}%)"}

def calculate_mean_sofa(df):
    # Drop duplicates to ensure each ICU stay is counted once
    df_unique = df.drop_duplicates(subset='icustayid')
    mean_sofa = df_unique['SOFA'].mean()
    std_sofa = df_unique['SOFA'].std()
    return f"{mean_sofa:.1f} ({std_sofa:.1f})" if not np.isnan(mean_sofa) else "N/A"

def calculate_procedures(df):
    # Drop duplicates to ensure each ICU stay is counted once
    df_unique = df.drop_duplicates(subset='icustayid')
    mech_vent = df_unique['mechvent'].sum()
    vaso = df_unique['median_dose_vaso'].mean()  # Adjust based on actual column indicating vasopressor use
    std_vaso = df_unique['SOFA'].std()
    total = len(df_unique)
    return {
        'Mechanical ventilation': f"{int(mech_vent)} ({(mech_vent/total*100):.1f}%)",
        'Vasopressors': f"{vaso:.1f} ({std_vaso:.1f})" if not np.isnan(vaso) else "N/A",
    }

def calculate_length_of_stay(df):
    # 'BLOC' represents sequential steps (1,2,3,4,5)
    # The last 'BLOC' value per 'icustayid' is the length of stay
    # Calculate the mean of the last 'BLOC' values across the group
    if 'bloc' in df.columns:
        last_bloc = df.sort_values(['icustayid', 'bloc']).groupby('icustayid')['bloc'].max()
        mean_length = last_bloc.mean()/2
        std_length = last_bloc.std()/2
        return f"{mean_length:.1f} ({std_length:.1f})"
    else:
        # Fallback to median and IQR if 'BLOC' is not available
        median = df['delay_end_of_record_and_discharge_or_death'].median()
        q1 = df['delay_end_of_record_and_discharge_or_death'].quantile(0.25)
        q3 = df['delay_end_of_record_and_discharge_or_death'].quantile(0.75)
        return f"{median:.1f} ({q1:.1f}–{q3:.1f})" if not np.isnan(median) else "N/A"

def calculate_mortality(df):
    # Drop duplicates to ensure each ICU stay is counted once
    df_unique = df.drop_duplicates(subset='icustayid')
    print(df_unique)
    icu_mort = df_unique['died_in_hosp'].mean() * 100  # Percentage
    hosp_mort = df_unique['died_within_48h_of_out_time'].mean() * 100  # Percentage
    mort_90 = df_unique['morta_90'].mean() * 100  # Percentage
    return {
        'Hospital mortality': f"{hosp_mort:.1f}%",
        '90-day mortality': f"{mort_90:.1f}%"
    }

def calculate_vitals_labs(df, vitals_labs_cols):
    # Drop duplicates to ensure each ICU stay is counted once
    df_unique = df.drop_duplicates(subset='icustayid')
    stats = {}
    for col in vitals_labs_cols:
        if col in df_unique.columns:
            mean_value = df_unique[col].mean()
            std_value = df_unique[col].std()
            stats[col] = f"{mean_value:.1f} ({std_value:.1f})" if not np.isnan(mean_value) else "N/A"
        else:
            stats[col] = "N/A"  # If the column does not exist
    return stats


In [21]:

# List of comorbidity columns based on 'comorb' excluding the ones to remove
comorb_cols = [
    'congestive_heart_failure',
    'hypertension',
    'chronic_pulmonary',
    'diabetes',
    'renal_failure',
    'liver_disease',
    'Cancer', 
    'coagulopathy',
    'obesity',
    'fluid_electrolyte',
    'alcohol_abuse',
    'aids'
]

# Vital signs and lab parameters to include
vitals_labs_cols = [
    'HR',
    'SysBP',
    'MeanBP',
    'RR',
    'Temp_C',
    'output_step',
    'RASS',
    'GCS',
    'Anion_Gap',
    'Chloride',
    'cumulated_balance',
    'Ht',
    'Total_bili',
    'Phosphorous',
    'Creatinine',
    'BUN',
    'Hb',
    'WBC_count',
    'Platelets_count',
    'HCO3'
]

# Initialize a dictionary to hold summary data with features as keys
summary_data = {}

# Initialize a list of all features to include in the summary
features = [
    'Unique ICUs (n)',
    'Unique ICU admissions (n)',
    'Age, years (std)',
    'Female gender (n (%))'
] + [col.replace('_', ' ').capitalize() for col in comorb_cols] + [
    'Initial SOFA (std)',
    'Mechanical ventilation',
    'Vasopressors',
    'Length of stay, days',
    'Hospital mortality',
    '90-day mortality'
] + [feature_name_mapping.get(col, col.replace('_', ' ').capitalize()) for col in vitals_labs_cols]  # Adding vital signs and lab parameters with mapping

# Initialize the summary_data dictionary with empty dicts for each feature
for feature in features:
    summary_data[feature] = {}

# Iterate through each group and calculate statistics
for group_name, ids in groups.items():
    # Select all records for the group without dropping duplicates
    df_group = merged_df[merged_df['icustayid'].isin(ids)]
    
    # Calculate statistics
    unique_icus = calculate_unique_icus(df_group)
    unique_admissions = calculate_unique_admissions(df_group)
    age_stats = calculate_age_stats(df_group)
    female_gender_pct = calculate_female_gender_percentage(df_group)
    comorb_stats = calculate_comorbidities(df_group, comorb_cols)
    sofa_stats = calculate_mean_sofa(df_group)
    procedures = calculate_procedures(df_group)
    length_of_stay = calculate_length_of_stay(df_group)
    mortality = calculate_mortality(df_group)
    vitals_labs_stats = calculate_vitals_labs(df_group, vitals_labs_cols)
    
    # Populate the summary_data dictionary
    summary_data['Unique ICUs (n)'][group_name] = unique_icus
    summary_data['Unique ICU admissions (n)'][group_name] = unique_admissions
    summary_data['Age, years (std)'][group_name] = age_stats
    summary_data['Female gender (n (%))'][group_name] = female_gender_pct
    
    # Comorbidities
    for comorbidity, stat in comorb_stats.items():
        # Map comorbidity names to feature names (assuming comorbidity names are already formatted)
        feature_name = comorbidity  # Already capitalized and spaced
        summary_data[feature_name][group_name] = stat
    
    # SOFA Score
    summary_data['Initial SOFA (std)'][group_name] = sofa_stats
    
    # Procedures
    for proc, stat in procedures.items():
        summary_data[proc][group_name] = stat
    
    # Length of Stay
    summary_data['Length of stay, days'][group_name] = length_of_stay
    
    # Mortality
    for mort_type, stat in mortality.items():
        summary_data[mort_type][group_name] = stat
    
    # Vital signs and lab parameters
    for param, stat in vitals_labs_stats.items():
        # Map the original param name to the journal-friendly name using feature_name_mapping
        mapped_param = feature_name_mapping.get(param, param.replace('_', ' ').capitalize())
        summary_data[mapped_param][group_name] = stat

# Convert summary_data to a DataFrame
summary_df = pd.DataFrame(summary_data)

# Optional: Reorder columns if necessary
# For example, ensure groups are ordered as desired

# Save the summary table as a CSV
output_path = '/home/lkapral/RRT_mimic_iv/data/model/MIMIC_summary_table.csv'
summary_df.to_csv(output_path, index=True)  # index=True to include feature names as the first column

print(f"Summary table successfully saved to {output_path}")


        icustayid  input_total  input_step  output_total  output_step  \
0        30000484     250.0000    250.0000         360.0        360.0   
6        30000831       0.0000      0.0000        1700.0       1025.0   
13       30001396    4059.5100     59.5110           0.0          0.0   
16       30001446    3005.1800   1581.5600         415.0        275.0   
20       30001471      70.8471     54.4856         400.0        200.0   
...           ...          ...         ...           ...          ...   
417512   39999230   18078.0100   2931.4770         760.0        410.0   
417530   39999384    5025.0000    512.5000         590.0        385.0   
417533   39999562    2676.0600    491.6800           0.0          0.0   
417544   39999810      46.4874     23.2437         525.0        325.0   
417554   39999858       0.0000      0.0000         300.0        300.0   

        cumulated_balance  median_dose_vaso  max_dose_vaso  mechvent  \
0                -110.000            0.0000        

In [22]:
# Convert summary_data to DataFrame with features as rows and groups as columns
summary_df = pd.DataFrame(summary_data).transpose()

# Ensure that all group columns exist and follow the desired order
desired_order = ['All', 'Both RRT', 'Neither RRT', 'Clinician-Only RRT', 'AI-Only RRT']
for group in desired_order:
    if group not in summary_df.columns:
        summary_df[group] = np.nan  # Assign NaN if group is missing

# Reorder the columns to match the desired order
summary_df = summary_df[desired_order]

# Reorder the rows to match the desired feature order
summary_df = summary_df.loc[features]


# Save to CSV
summary_df.to_csv('/home/lkapral/RRT_mimic_iv/data/model/MIMIC_summary_table.csv', index=True)


In [23]:
summary_df.to_excel('/home/lkapral/RRT_mimic_iv/data/model/MIMIC_summary_table.xlsx', index=True)


In [24]:
summary_df

Unnamed: 0,All,Both RRT,Neither RRT,Clinician-Only RRT,AI-Only RRT
Unique ICUs (n),54275,422,15651,171,39
Unique ICU admissions (n),41283,406,14141,170,39
"Age, years (std)",65.5 (16.7),61.7 (15.1),65.8 (16.7),63.6 (16.9),60.5 (12.8)
Female gender (n (%)),24373 (44.9%),159 (37.7%),7102 (45.4%),79 (46.2%),15 (38.5%)
Congestive heart failure,15806 (29.1%),191 (45.3%),4505 (28.8%),90 (52.6%),18 (46.2%)
Hypertension,35269 (65.0%),285 (67.5%),10183 (65.1%),141 (82.5%),25 (64.1%)
Chronic pulmonary,14024 (25.8%),94 (22.3%),4030 (25.7%),42 (24.6%),11 (28.2%)
Diabetes,16408 (30.2%),189 (44.8%),4673 (29.9%),96 (56.1%),16 (41.0%)
Renal failure,10580 (19.5%),223 (52.8%),2878 (18.4%),120 (70.2%),18 (46.2%)
Liver disease,7487 (13.8%),184 (43.6%),1933 (12.4%),38 (22.2%),25 (64.1%)
