In [1]:
import numpy as np
import pandas as pd
import tqdm
import argparse
import os
from ai_clinician.modeling.normalization import DataNormalization
from ai_clinician.preprocessing.utils import load_csv
from ai_clinician.preprocessing.columns import *
from ai_clinician.modeling.columns import *
from scipy.stats import zscore
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

# Register tqdm's pandas integration (presumably to enable `progress_apply`-style calls).
# NOTE(review): no cell visible here calls a `progress_*` method — confirm this is still needed.
tqdm.tqdm.pandas()

def save_data_files(dir, MIMICraw, MIMICzs, metadata):
    """Write the three tables of one data split into `dir` as CSV files.

    Args:
        dir: Directory that receives the files (it is not created here).
        MIMICraw: Table written to ``MIMICraw.csv``.
        MIMICzs: Table written to ``MIMICzs.csv``.
        metadata: Table written to ``metadata.csv``.

    Every file is written without the index column.
    """
    tables_by_filename = (
        ("MIMICraw.csv", MIMICraw),
        ("MIMICzs.csv", MIMICzs),
        ("metadata.csv", metadata),
    )
    for filename, table in tables_by_filename:
        table.to_csv(os.path.join(dir, filename), index=False)



In [2]:
# Root directory of the project data. Set the RRT_MIMIC_ROOT environment variable to
# run the notebook on another machine without editing it; the previous hard-coded
# location stays the default so existing setups keep working unchanged.
main_path = os.environ.get('RRT_MIMIC_ROOT', '/home/lkapral/RRT_mimic_iv')

In [3]:
def create_args(argv=None):
    """Build the argument parser for the split step and parse `argv`.

    Args:
        argv: Optional list of command-line style strings. When omitted, the
            notebook defaults are parsed instead: input ``<main_path>/data/mimic``,
            output ``<main_path>/data/model``, ``--train-size 0.7`` and
            ``--outcome morta_90``.

    Returns:
        argparse.Namespace with the attributes ``input``, ``output``,
        ``train_size`` (float) and ``outcome_col`` (str).
    """
    parser = argparse.ArgumentParser(description=(
        'Generates a train/test split of the MIMIC-IV dataset, and generates files labeled '
        '{train|test}/MIMICraw.csv and {train|test}/MIMICzs.csv.'
    ))
    parser.add_argument('input', type=str,
                        help='Data directory (should contain mimic_dataset.csv and aki_cohort.csv)')
    parser.add_argument('output', type=str,
                        help='Directory in which to output')
    parser.add_argument('--train-size', dest='train_size', type=float, default=0.7,
                        help='Proportion of data to use in training (default 0.7)')
    parser.add_argument('--outcome', dest='outcome_col', type=str, default='died_in_hosp',
                        help='Name of column to use for outcomes (probably "died_in_hosp" [default] or "morta_90")')

    if argv is None:
        # Inside the notebook there is no real command line, so simulate one.
        argv = [
            main_path+'/data/mimic',    # Replace with your actual input directory
            main_path+'/data/model',   # Replace with your actual output directory
            '--train-size', '0.7',
            '--outcome', 'morta_90'
        ]
    return parser.parse_args(argv)

args = create_args()

in_dir = args.input
out_dir = args.output
# makedirs creates missing parent directories too and does not fail when the
# directory already exists, unlike a check followed by os.mkdir.
os.makedirs(out_dir, exist_ok=True)
mdp_data = load_csv(os.path.join(in_dir, "mimic_dataset.csv"))
aki_cohort = load_csv(os.path.join(in_dir, "aki_cohort.csv"))

# Keep only the rows whose ICU stay id also appears in the AKI cohort file.
MIMICtable = mdp_data[mdp_data[C_ICUSTAYID].isin(aki_cohort[C_ICUSTAYID])].reset_index(drop=True)
assert args.outcome_col in MIMICtable.columns, "Outcome column '{}' not found in MIMICtable".format(args.outcome_col)

# Columns whose non-zero values are later treated as evidence of an RRT action.
rrt_cols = [
    'Ultrafiltrate_Output',
    'Blood_Flow',
    'Hourly_Patient_Fluid_Removal',
    'Dialysate_Rate',
    'Hemodialysis_Output',
    'Citrate',
    'Prefilter_Replacement_Rate',
    'Postfilter_Replacement_Rate'
]


In [4]:
# A row counts as an RRT action when at least one RRT column is both recorded
# (not NaN) and different from zero.
rrt_settings = MIMICtable[rrt_cols]
rrt_actions = (rrt_settings.notna() & rrt_settings.ne(0)).any(axis=1)
MIMICtable['action'] = rrt_actions.astype(int)

actions = MIMICtable['action'].values

# Silence numpy's divide-by-zero and invalid-value floating point warnings.
# As the last expression of the cell, the previous settings are displayed.
np.seterr(divide='ignore', invalid='ignore')

{'divide': 'warn', 'over': 'warn', 'under': 'ignore', 'invalid': 'warn'}

In [5]:
# Show every column when a DataFrame is displayed (no column truncation).
pd.options.display.max_columns = None

In [6]:
# Remove every ICU stay whose subject carries at least one ICD code listed in exclusion.csv.
icu_stays = pd.read_csv(main_path+'/data/icustays.csv')
icd_diagnoses = pd.read_csv(main_path+'/data/d_icd_diagnoses.csv')
diagnose_icd = pd.read_csv(main_path+'/data/diagnoses_icd.csv')
exclude_idc = pd.read_csv(main_path+'/data/exclusion.csv')

exclude_list = exclude_idc['icd_code'].to_list()

# Attach subject_id to every row so exclusion can be applied per subject.
# Rows without a matching stay_id get a NaN subject_id and are therefore never excluded.
merged_df = MIMICtable.merge(icu_stays[['stay_id', 'subject_id']], left_on='icustayid', right_on='stay_id', how='left')

excluded_rows = diagnose_icd[diagnose_icd['icd_code'].isin(exclude_list)]
excluded_subject_ids = excluded_rows['subject_id'].unique().tolist()

# NOTE(review): this counts all subjects in diagnoses_icd with an excluded code,
# not only those present in the cohort table.
print('Number of Patients with kidney issues. ', len(excluded_subject_ids))

# NOTE(review): the counts below are distinct icustayid values (ICU stays),
# although the messages say "patients" — confirm the wording is intended.
print('Number of patients before exclusion:', merged_df['icustayid'].nunique())

print('Number of patients with RRT before exclusion:' , merged_df[merged_df['action']>0]['icustayid'].nunique())

merged_df = merged_df[~merged_df['subject_id'].isin(excluded_subject_ids)]

print('Number of patients after exclusion:', merged_df['icustayid'].nunique())

print('Number of patients with RRT after exclusion:' , merged_df[merged_df['action']>0]['icustayid'].nunique())

# Non-inplace drop returns a fresh frame instead of mutating the filtered slice,
# so later column assignments do not act on a view/copy of the pre-filter frame.
merged_df = merged_df.drop(columns=['stay_id', 'subject_id'])

MIMICtable = merged_df

Number of Patients with kidney issues.  5055
Number of patients before exclusion: 59851
Number of patients with RRT before exclusion: 4002
Number of patients after exclusion: 54859
Number of patients with RRT after exclusion: 2055


In [7]:
# Remember every column's dtype so integer-coded columns can be restored after aggregation.
original_dtypes = MIMICtable.dtypes.to_dict()

# Collapse groups of blocs into one aggregation window (called 'day' in this cell).
# `assign` builds a new frame, so the window column is never written into a
# filtered slice of an earlier frame.
# NOTE(review): if blocs are 1-based, `bloc // 3` puts only blocs 1-2 into window 0
# while later windows hold three blocs — confirm `(bloc - 1) // 3` was not intended.
MIMICtable = MIMICtable.assign(day=MIMICtable['bloc'] // 3)

# Columns that are summed within a window.
# NOTE(review): input_total, output_total and cumulated_balance are recomputed further
# down, so their sums are discarded. Summing median_dose_vaso / max_dose_vaso over the
# blocs of a window is unusual for a median / maximum — confirm this is intended.
sum_cols = [
    'input_total', 'input_step', 'output_total', 'output_step',
    'cumulated_balance', 'median_dose_vaso', 'max_dose_vaso'
]

# Columns where the window takes the maximum over its blocs.
max_cols = ['mechvent', 'extubated', 'action']

# Columns where the first value of the window is kept.
first_cols = ['gender', 'age', 'elixhauser', 're_admission', 'Height_cm', 'Weight_kg']


# Everything that is neither a key nor handled above is averaged within the window.
excluded_cols = set(sum_cols + max_cols + first_cols + ['icustayid', 'timestep', 'bloc', 'day'])
mean_cols = [col for col in MIMICtable.columns if col not in excluded_cols]


agg_dict = {col: 'sum' for col in sum_cols}
agg_dict.update({col: 'max' for col in max_cols})
agg_dict.update({col: 'first' for col in first_cols})
agg_dict.update({col: 'mean' for col in mean_cols})

MIMICtable_agg = MIMICtable.groupby(['icustayid', 'day']).agg(agg_dict).reset_index()
MIMICtable_agg = MIMICtable_agg.sort_values(by=['icustayid', 'day'])

# Rebuild the running totals per ICU stay from the per-window step values.
MIMICtable_agg['input_total'] = MIMICtable_agg.groupby('icustayid')['input_step'].cumsum()
MIMICtable_agg['output_total'] = MIMICtable_agg.groupby('icustayid')['output_step'].cumsum()
# NOTE(review): the balance is computed as input PLUS output; that is only a net
# balance if output volumes are stored as negative numbers — confirm.
MIMICtable_agg['cumulated_balance'] = MIMICtable_agg['input_total'] + MIMICtable_agg['output_total']


# Cast aggregated columns back to their original dtypes (integers are rounded first).
# Columns that cannot be converted keep their aggregated dtype and a warning is printed.
for col in MIMICtable_agg.columns:
    if col in original_dtypes:
        original_dtype = original_dtypes[col]
        try:
            if pd.api.types.is_integer_dtype(original_dtype):
                MIMICtable_agg[col] = MIMICtable_agg[col].round().astype(original_dtype)
            else:
                MIMICtable_agg[col] = MIMICtable_agg[col].astype(original_dtype)
        except (ValueError, TypeError):
            print(f"Warning: Could not convert column '{col}' to {original_dtype}. Keeping the aggregated type.")

print(MIMICtable_agg.dtypes)

# pandas flagged the insertion of 'bloc' below (presumably its "highly fragmented"
# PerformanceWarning); copying first consolidates the frame as that warning recommends.
MIMICtable_agg = MIMICtable_agg.copy()
# Expose the window index as a 1-based 'bloc' column and drop the helper column.
MIMICtable_agg['bloc'] = MIMICtable_agg['day'] + 1
MIMICtable_agg = MIMICtable_agg.drop(columns=['day'])

MIMICtable = MIMICtable_agg

icustayid         int64
day               int64
input_total     float64
input_step      float64
output_total    float64
                 ...   
Insulin         float64
Shock_Index     float64
PaO2_FiO2       float64
SOFA              int64
SIRS              int64
Length: 121, dtype: object


  MIMICtable_agg['bloc'] = MIMICtable_agg['day'] + 1


In [8]:
# Implausible heights / weights are replaced by the column mean. The mean is taken
# over the plausible rows only, so the outliers being replaced do not inflate the
# value that replaces them.
for column, upper_limit in (('Height_cm', 250), ('Weight_kg', 400)):
    implausible = MIMICtable[column] > upper_limit
    plausible_mean = MIMICtable.loc[MIMICtable[column] <= upper_limit, column].mean()
    MIMICtable.loc[implausible, column] = plausible_mean

# Cap values that exceed their upper bound.
for column, upper_limit in (('O2flow', 100.), ('SvO2', 100.)):
    MIMICtable.loc[MIMICtable[column] > upper_limit, column] = upper_limit

# Negative values are set to zero for these columns.
for column in ('paO2', 'Hourly_Patient_Fluid_Removal'):
    MIMICtable.loc[MIMICtable[column] < 0, column] = 0

In [9]:
# Count the distinct blocs of every ICU stay ...
patient_day_counts = (
    MIMICtable.groupby('icustayid')['bloc']
    .nunique()
    .reset_index()
    .rename(columns={'bloc': 'num_blocs'})
)

# ... and keep only the stays that have at least two of them.
has_two_blocs = patient_day_counts['num_blocs'] >= 2
patients_with_2_days = patient_day_counts.loc[has_two_blocs, 'icustayid']
rows_to_keep = MIMICtable['icustayid'].isin(patients_with_2_days)
MIMICtable = MIMICtable[rows_to_keep].reset_index(drop=True)

In [10]:
# Persist the aggregated table (including the 'action' column) next to the input data.
action_table_path = os.path.join(in_dir, "MIMIC_action.parquet")
MIMICtable.to_parquet(action_table_path)

In [11]:
# Parameters
fixed_num_features = 40  # number of top-ranked features that receive a non-zero weight


feature_importance = pd.read_csv(main_path+'/data/model/combined_feature_importances.csv')
feature_importance_sorted = feature_importance.sort_values(by='Combined_Average', ascending=False)

# Weights of the top features, scaled to unit Euclidean norm.
top_features = feature_importance_sorted.head(fixed_num_features)
weights = top_features['Combined_Average'].values
normalized_weights = weights / np.linalg.norm(weights)

# Every feature outside the top group gets weight 0.
feature_to_weight = dict(zip(top_features['Feature'], normalized_weights))
all_features = feature_importance['Feature'].tolist()
for feat in all_features:
    feature_to_weight.setdefault(feat, 0.0)

# Per-feature summary statistics over the current table, in the order of the importance file.
total_rows = MIMICtable.shape[0]
feature_values = MIMICtable[all_features]

final_df = pd.DataFrame({
    'Feature': feature_importance['Feature'],
    'Mean': feature_values.mean().to_numpy(),
    'Std': feature_values.std().to_numpy(),
    # Share of missing rows, expressed as a percentage
    'Missingness (%)': (feature_values.isnull().sum() / total_rows * 100).to_numpy(),
    'Feature weight': [feature_to_weight[feat] for feat in all_features]
})

final_df['Mean (SD)'] = final_df['Mean'].round(2).astype(str) + " ± " + final_df['Std'].round(2).astype(str)
final_df = final_df.drop(['Mean', 'Std'], axis=1)

final_df['Feature weight'] = final_df['Feature weight'].round(4)
final_df['Missingness (%)'] = final_df['Missingness (%)'].round(1)

# Display names (with units) used in the exported table.
feature_name_mapping = {
    'output_step': '12-hour total output, mL',
    'SOFA': 'SOFA score',
    'cumulated_balance': 'Cumulative balance, mL',
    'Creatinine': 'Creatinine, mg/dL',
    'Platelets_count': 'Platelet count, ×10^3/µL',
    'Chloride': 'Chloride, mEq/L',
    'BUN': 'BUN, mg/dL',
    'Anion_Gap': 'Anion gap, mEq/L',
    'Calcium': 'Calcium, mg/dL',
    'input_total': 'Total input, mL',
    'WBC_count': 'WBC count, ×10^3/µL',
    'Total_bili': 'Total bilirubin, mg/dL',
    'Phosphorous': 'Phosphorus, mg/dL',
    'O2flow': 'O2 flow, L/min',
    'output_total': 'Total output, mL',
    'Weight_kg': 'Weight, kg',
    'RASS': 'RASS score',
    'Sodium': 'Sodium, mEq/L',
    'Temp_C': 'Temperature, °C',
    'age': 'Age, years',
    'max_dose_vaso': 'Maximum vasopressor dose, µg/kg/min',
    'PAWmean': 'Mean airway pressure, cmH2O',
    'GCS': 'GCS score',
    'SGOT': 'AST (SGOT), U/L',
    'PT': 'PT, s',
    'PTT': 'PTT, s',
    'RBC_count': 'RBC count, ×10^6/µL',
    'LDH': 'LDH, U/L',
    'Ht': 'Hematocrit, %',
    'RR': 'Respiratory rate, breaths/min',
    'HCO3': 'Bicarbonate, mEq/L',
    'SpO2': 'SpO2, %',
    'Ionised_Ca': 'Ionized calcium, mmol/L',
    'Hb': 'Hemoglobin, g/dL',
    'FiO2_1': 'FiO2, %',
    'SGPT': 'ALT (SGPT), U/L',
    'Shock_Index': 'Shock index',
    'Glucose': 'Glucose, mg/dL',
    'HR': 'Heart rate, beats/min',
    'MinuteVentil': 'Minute ventilation, L/min',
    'MeanBP': 'Mean blood pressure, mmHg',
    'INR': 'INR',
    'Potassium': 'Potassium, mEq/L',
    'Fibrinogen': 'Fibrinogen, mg/dL',
    'Arterial_pH': 'Arterial pH',
    'PaO2_FiO2': 'PaO2/FiO2 ratio',
    'TidalVolume': 'Tidal volume, mL',
    'paO2': 'PaO2, mmHg',
    'Albumin': 'Albumin, g/dL',
    'DiaBP': 'Diastolic blood pressure, mmHg',
    'input_step': '12-hour total input, mL',
    'Magnesium': 'Magnesium, mg/dL',
    'SysBP': 'Systolic blood pressure, mmHg',
    'PAWpeak': 'Peak airway pressure, cmH2O',
    'extubated': 'Extubated (yes/no)',
    'Arterial_BE': 'Arterial base excess, mEq/L',
    'PAWplateau': 'Plateau airway pressure, cmH2O',
    'Height_cm': 'Height, cm',
    'CVP': 'Central venous pressure, mmHg',
    'paCO2': 'PaCO2, mmHg',
    'Arterial_lactate': 'Arterial lactate, mmol/L',
    'PEEP': 'PEEP, cmH2O',
    'CK_MB': 'CK-MB, ng/mL',
    'ETCO2': 'End-tidal CO2, mmHg',
    'Troponin': 'Troponin, ng/mL',
    'mechvent': 'Mechanical ventilation (yes/no)',
    'Absolute_Neutrophil_Count': 'Absolute neutrophil count, ×10^3/µL',
    'SIRS': 'SIRS criteria',
    'SaO2': 'SaO2, %',
    'Triglyceride': 'Triglycerides, mg/dL',
    'SvO2': 'SvO2, %',
    'PAPsys': 'Pulmonary artery systolic pressure, mmHg',
    'PAPdia': 'Pulmonary artery diastolic pressure, mmHg',
    're_admission': 're-admission (yes/no)',
    'PAPmean': 'Mean pulmonary artery pressure, mmHg',
    'Creatinine_Urine': 'Urine creatinine, mg/dL',
    'gender': 'gender (M/F)',
    'BNP': 'BNP, pg/mL',
    'CRP': 'CRP, mg/L',
    'Urea_Nitrogen_Urine': 'Urine urea nitrogen, mg/dL',
    'Sodium_Urine': 'Urine sodium, mEq/L',
    'Potassium_Urine': 'Urine potassium, mEq/L',
    'Iron': 'Iron, µg/dL',
    'Ammonia': 'Ammonia, µg/dL',
    'Thyroid_Stimulating_Hormone': 'TSH, mIU/L',
    'Total_protein': 'Total protein, g/dL',
    'CI': 'Cardiac index, L/min/m²',
    'ACT': 'ACT, s',
    'T3': 'T3, ng/dL',
    'Gamma_Glutamyltransferase': 'GGT, U/L',
    'Heparin_LMW': 'Low molecular weight heparin (yes/no)',
    'APACHEII_Renal_Failure': 'APACHE II renal failure score',
    'Osmolality_Urine': 'Urine osmolality, mOsm/kg'
}
# Features without a display name keep their raw column name instead of turning
# into NaN in the exported table.
display_names = final_df['Feature'].map(feature_name_mapping)
final_df['Feature'] = display_names.fillna(final_df['Feature'])

output_path = main_path+'/data/model/MIMICtable_features_with_weights.csv'
final_df.to_csv(output_path, index=False)


In [12]:
# Write the same feature summary table as an Excel workbook (index column omitted).
output_path = main_path+'/data/model/MIMICtable_features_with_weights.xlsx'
final_df.to_excel(excel_writer=output_path, index=False)

In [13]:
# Split on ICU stay ids so that all rows of one stay land in the same partition.
icuuniqueids = MIMICtable[C_ICUSTAYID].unique()
train_ids, test_ids = train_test_split(icuuniqueids, train_size=args.train_size, random_state=42)

# Boolean row masks select the partitions independently of what the index labels
# look like; passing index labels to positional .iloc is only correct while the
# index happens to be exactly 0..n-1.
train_mask = MIMICtable[C_ICUSTAYID].isin(train_ids)
test_mask = MIMICtable[C_ICUSTAYID].isin(test_ids)
train_indexes = MIMICtable.index[train_mask]
test_indexes = MIMICtable.index[test_mask]
assert len(train_indexes) + len(test_indexes) == len(MIMICtable), "train/test rows do not partition the table"
print("Training: {} IDs ({} rows)".format(len(train_ids), len(train_indexes)))
print("Test: {} IDs ({} rows)".format(len(test_ids), len(test_indexes)))

MIMICraw = MIMICtable[ALL_FEATURE_COLUMNS]

print("Proportion of NA values:", MIMICraw.isna().sum() / len(MIMICraw))

# The normalizer is constructed from the training rows only and then applied to both partitions.
normer = DataNormalization(MIMICtable.loc[train_mask])
MIMICzs_train = normer.transform(MIMICtable.loc[train_mask])
MIMICzs_test = normer.transform(MIMICtable.loc[test_mask])

train_dir = os.path.join(out_dir, "train")
test_dir = os.path.join(out_dir, "test")
# makedirs tolerates existing directories and creates missing parents.
os.makedirs(train_dir, exist_ok=True)
os.makedirs(test_dir, exist_ok=True)

# Bloc, stay id and the chosen outcome column (renamed to the common outcome column name).
metadata = MIMICtable[[C_BLOC, C_ICUSTAYID, args.outcome_col]].rename({args.outcome_col: C_OUTCOME}, axis=1)

normer.save(os.path.join(out_dir, 'normalization.pkl'))
save_data_files(train_dir,
                MIMICraw.loc[train_mask],
                MIMICzs_train,
                metadata.loc[train_mask])
save_data_files(test_dir,
                MIMICraw.loc[test_mask],
                MIMICzs_test,
                metadata.loc[test_mask])
print("Done.")

Training: 37992 IDs (293094 rows)
Test: 16283 IDs (124469 rows)
Proportion of NA values: gender                       0.000000
mechvent                     0.000000
extubated                    0.640078
max_dose_vaso                0.000000
re_admission                 0.000000
                               ...   
Gamma_Glutamyltransferase    0.998709
input_total                  0.000000
input_step                   0.000000
output_total                 0.000000
output_step                  0.000000
Length: 101, dtype: float64
Done.
