In [1]:
import pandas as pd
import numpy as np
import sklearn
from sklearn_pandas import DataFrameMapper
from sklearn import preprocessing
from sklearn.model_selection import KFold
import os

# Functions

In [2]:
def load_dynamic_data(df_dynamic, keys, apply_masking, mask_value=666):
    """
    Binarise the non-key dynamic features and optionally overwrite masked rows.

    Parameters
    ----------
    df_dynamic : pandas.DataFrame
        Dynamic data, one row per patient and time step. When ``apply_masking``
        is true it must contain the columns "Admissiondboid", "dayToDone",
        "individualMRGerm" and "mask".
    keys : list of str
        Columns that keep their original values. Every other column is cast
        to bool and then to int, i.e. becomes a 0/1 indicator.
    apply_masking : bool
        If true, every row whose "mask" column equals 0 gets ``mask_value``
        written into all columns except "Admissiondboid", "dayToDone" and
        "individualMRGerm".
    mask_value : int, default 666
        Sentinel written into the masked rows.

    Returns
    -------
    pandas.DataFrame
        New frame; the input frame is not modified. With masking, the three
        identifier/label columns come first, followed by the other columns.
    """
    # Convert every column that is not listed in `keys` into a binary indicator
    untouched = df_dynamic[keys]
    binarised = df_dynamic.drop(columns=keys).astype(bool).astype(int)
    df_dynamic = pd.concat([untouched, binarised], axis=1)

    # Apply the mask
    if apply_masking:
        protected_columns = ["Admissiondboid", "dayToDone", "individualMRGerm"]
        df_dynamic_nomask = df_dynamic[protected_columns]
        df_dynamic_mask = df_dynamic.drop(columns=protected_columns)
        # `.loc` is required here: `.at` only addresses a single cell and cannot
        # take a boolean row selector together with a list of columns.
        df_dynamic_mask.loc[df_dynamic_mask["mask"] == 0, :] = mask_value
        # The "mask" column is kept on purpose (and is itself set to
        # `mask_value` on masked rows): my_standard_scaler selects the masked
        # rows with `X["mask"] == mask_value`.
        df_dynamic = df_dynamic_nomask.join(df_dynamic_mask)

    return df_dynamic

In [3]:
def load_static_data(df_static, keys, categorical_keys):
    """
    Build the static feature table: selected columns, de-duplicated and encoded.

    Parameters
    ----------
    df_static : pandas.DataFrame
        Raw static data. It is not modified.
    keys : list of str
        Columns to keep. Duplicate rows over these columns are dropped.
    categorical_keys : list of str
        Columns to replace by integer codes. Codes start at 1 and follow the
        order of first appearance (``pd.factorize`` codes shifted by one).

    Returns
    -------
    pandas.DataFrame
        The processed copy. If present, "SAPSIIIScore" has its missing values
        filled with the column mean, and the listed "Origin" /
        "ReasonAdmission" categories are merged into the single value "others".
    """
    # Categories that are merged into "others" to reduce dimensionality
    merged_origins = [
        "rehabilitation", "hemodynamics", "cma", "ICU", "paediatrics",
        "obstetrics", "anaesthesia", "other floor", "gynaecology",
        "psychiatry", "dermatology",
    ]
    merged_reasons = [
        "endocrine other", "coagulopatía", "obstetric pathology",
        "infection other", "other", "hydroelectrolytic alteration",
        "respiratory other", "severe trauma", "hepatic insufficiency",
        "diabetic decompensation", "neuromuscular", "severe arrhythmia",
    ]

    # Eliminate the non-interesting features and get only one sample per patient.
    # The explicit copy makes the assignments below act on an independent frame
    # rather than on something derived from the caller's data.
    df_static = df_static[keys].drop_duplicates().copy()

    # Fill the NaNs with the mean of the de-duplicated table
    if "SAPSIIIScore" in df_static:
        saps = df_static["SAPSIIIScore"]
        df_static["SAPSIIIScore"] = saps.fillna(saps.mean())

    # Reduce the dimensionality of two categorical features
    if 'Origin' in df_static:
        is_merged = df_static["Origin"].isin(merged_origins)
        df_static["Origin"] = df_static["Origin"].where(~is_merged, "others")
    if 'ReasonAdmission' in df_static:
        is_merged = df_static["ReasonAdmission"].isin(merged_reasons)
        df_static["ReasonAdmission"] = df_static["ReasonAdmission"].where(~is_merged, "others")

    # Convert the categories of categorical features into numbers
    for column in categorical_keys:
        df_static[column] = pd.factorize(df_static[column])[0] + 1

    return df_static

In [4]:
def split_patients(X, ratio=0.7, idPatient='Admissiondboid', seed=42):
    """
    Split the samples into train and test so that no patient is in both.

    Parameters
    ----------
    X : pandas.DataFrame
        Samples; may hold several rows per patient.
    ratio : float, default 0.7
        Fraction of the distinct patients assigned to the train set.
    idPatient : str, default 'Admissiondboid'
        Name of the column that identifies the patient.
    seed : int, default 42
        Random state used to sample the train patients.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        ``(X_train, X_test)``: the rows of the sampled patients and the
        remaining rows, both in their original order.
    """
    patient_ids = X[idPatient].drop_duplicates()
    patients_to_train = patient_ids.sample(frac=ratio, random_state=seed).to_numpy()
    # Filter on the configured id column (it used to be hard-coded to
    # `Admissiondboid`, which made the `idPatient` argument unusable).
    in_train = X[idPatient].isin(patients_to_train)
    return X[in_train], X[~in_train]

In [5]:
def my_standard_scaler(
    X_train, X_test,
    non_scalable_features,
    apply_masking=False, mask_value=666
):
    """
    Standardise train and test with a scaler that is fitted on train only.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Frames with the same columns. With ``apply_masking`` they need a
        "mask" column.
    non_scalable_features : list of str
        Columns that are passed through unscaled.
    apply_masking : bool, default False
        If true, rows whose "mask" column equals ``mask_value`` are left out
        of fitting and scaling and are returned unchanged.
    mask_value : int, default 666
        Value of the "mask" column that marks a masked row.

    Returns
    -------
    (pandas.DataFrame, pandas.DataFrame)
        Scaled train and test. The scaled columns come first, followed by
        ``non_scalable_features``. With masking, the masked rows are appended
        after the scaled rows, so the row order differs from the input.
    """
    if apply_masking:
        # Separate the masked rows; they bypass the scaler completely
        X_train_nonorm = X_train[X_train["mask"] == mask_value]
        X_test_nonorm = X_test[X_test["mask"] == mask_value]
        train_rows = X_train[X_train["mask"] != mask_value]
        test_rows = X_test[X_test["mask"] != mask_value]
    else:
        train_rows = X_train.copy()
        test_rows = X_test.copy()

    # Fit the scaler on train and transform train
    train_passthrough = train_rows[non_scalable_features]
    train_features = train_rows.drop(columns=non_scalable_features)
    scaler = preprocessing.StandardScaler()
    mapper = DataFrameMapper([(train_features.columns, scaler)])
    # NOTE(review): the second positional argument (4) is kept from the
    # existing call; it looks like it lands in the `y` slot -- confirm.
    train_values = mapper.fit_transform(train_features.copy(), 4)
    scaled_X_train = pd.DataFrame(
        train_values, index=train_features.index, columns=train_features.columns
    ).join(train_passthrough)

    # Transform test with the mapper fitted on train
    test_passthrough = test_rows[non_scalable_features]
    test_features = test_rows.drop(columns=non_scalable_features)
    test_values = mapper.transform(test_features.copy())
    scaled_X_test = pd.DataFrame(
        test_values, index=test_features.index, columns=test_features.columns
    ).join(test_passthrough)

    if not apply_masking:
        return scaled_X_train, scaled_X_test

    # Put the untouched masked rows back, after the scaled rows
    df_final_train = pd.concat([scaled_X_train, X_train_nonorm])
    df_final_test = pd.concat([scaled_X_test, X_test_nonorm])
    return df_final_train, df_final_test

In [6]:
def dataframe_to_tensor(df, y, eliminateColumn, columns, timeStepLength):
    """
    Stack the rows of every patient into a (patients, time steps, features) array.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per patient and time step; must contain "Admissiondboid".
        Every patient must have exactly ``timeStepLength`` rows.
    y : pandas.DataFrame
        Labels sharing the index of ``df``; must contain "Admissiondboid".
    eliminateColumn : bool
        If true, ``columns`` are dropped before the tensor is built.
    columns : list of str
        Columns to drop when ``eliminateColumn`` is true.
    timeStepLength : int
        Number of rows (time steps) expected per patient.

    Returns
    -------
    (numpy.ndarray, pandas.DataFrame)
        ``X`` of shape (n_patients, timeStepLength, n_features), with patients
        in order of first appearance in ``df``, and ``y`` reduced to the first
        row of every patient, in the same order.

    Raises
    ------
    ValueError
        If a patient does not have exactly ``timeStepLength`` rows.
    """
    y = y.reindex(df.index).drop_duplicates(subset="Admissiondboid")

    # The feature count only shrinks when the columns are really dropped
    features = df.drop(columns=columns) if eliminateColumn else df
    n_features = features.shape[1]

    # Group on the id values of `df`, so that it also works when the id column
    # itself is among the dropped columns. sort=False keeps the patients in
    # order of first appearance, matching `y`.
    patient_ids = df["Admissiondboid"].to_numpy()
    per_patient = []
    for patient_id, rows in features.groupby(patient_ids, sort=False):
        if len(rows) != timeStepLength:
            raise ValueError(
                "Patient {} has {} rows, expected {}".format(
                    patient_id, len(rows), timeStepLength
                )
            )
        per_patient.append(rows.to_numpy())

    if not per_patient:
        return np.empty((0, timeStepLength, n_features)), y

    # A single stack instead of repeatedly re-allocating with np.append
    return np.stack(per_patient), y

In [7]:
def reorder_static_data(X, y):
    """
    Align the static features with the patients of ``y`` and standardise them.

    ``X`` is right-merged onto the "Admissiondboid" values of ``y`` (after
    resetting the index of ``y``), so the result follows the rows of ``y``.
    The merged frame is used both as train and as test input of
    my_standard_scaler, i.e. it is scaled with its own statistics, while
    'Admissiondboid', 'Origin', 'ReasonAdmission' and 'PatientCategory' are
    passed through unscaled.

    Returns
    -------
    pandas.DataFrame
        The scaled frame restricted to the eight static feature columns, in a
        fixed order and without the patient id.
    """
    passthrough_columns = ['Admissiondboid',  'Origin', 'ReasonAdmission', 'PatientCategory']
    output_columns = [
        'Age', 'Gender', 'SAPSIIIScore', 'MonthOfAdmission', 'YearOfAdmission',
        'Origin', 'ReasonAdmission', 'PatientCategory'
    ]

    patient_ids = y.reset_index().Admissiondboid
    static_rows = pd.merge(X, patient_ids, how="right")

    # Only the scaled "train" output is needed; the second output is discarded
    scaled_rows, _ = my_standard_scaler(
        static_rows, static_rows,
        passthrough_columns,
        apply_masking=False
    )
    return scaled_rows[output_columns]

# Load data

In [8]:
# Dynamic table: it has the columns Admissiondboid and dayToDone, and is
# windowed on dayToDone further below.
df_completo = pd.read_csv("../../df_dynamic_20days_v10.csv")
# Static table: it provides the COVID column used for filtering and the
# static features that are handed to load_static_data.
df_MR = pd.read_csv("../../df_estaticas_v2_covid.csv", low_memory=False)

# Preprocessing

### Eliminate COVID patients

In [9]:
# Admissions whose COVID field is anything other than "zero" are removed from
# both the dynamic and the static table.
patients = np.unique(df_MR.loc[df_MR["COVID"] != "zero", "Admissiondboid"])
df_completo = df_completo.loc[~df_completo["Admissiondboid"].isin(patients)]
df_MR = df_MR.loc[~df_MR["Admissiondboid"].isin(patients)]

In [10]:
# Number of distinct (admission, label) pairs per value of individualMRGerm
(
    df_completo[["Admissiondboid", "individualMRGerm"]]
    .drop_duplicates()["individualMRGerm"]
    .value_counts()
)

0    2553
1     605
Name: individualMRGerm, dtype: int64

### Window

In [11]:
# Keep only the rows whose dayToDone is one of 0, 1, ..., n_timesteps - 1
n_timesteps = 14
df_windowed = df_completo.loc[df_completo["dayToDone"].isin(np.arange(n_timesteps))]

# Static features

In [12]:
# Categorical static features; load_static_data replaces them by integer codes
categorical_keys = ["Origin", "ReasonAdmission", "PatientCategory"]

# Static columns to keep: the patient id followed by the static features
keys = [
    "Admissiondboid",
    "Age",
    "Gender",
    *categorical_keys,
    "SAPSIIIScore",
    "MonthOfAdmission",
    "YearOfAdmission",
]

df_static = load_static_data(df_MR, keys, categorical_keys)

In [13]:
# Columns that load_dynamic_data leaves untouched; every other column of the
# windowed frame is converted into a 0/1 indicator.
keys = (
    ["Admissiondboid", "dayToDone"]
    + ["numberOfPatients", "numberOfPatientsMR"]
    + [
        "neighbor_PAP", "neighbor_CAR", "neighbor_Falta", "neighbor_QUI",
        "neighbor_ATF", "neighbor_GLI", "neighbor_PEN", "neighbor_CF3",
        "neighbor_CF4", "neighbor_OXA", "neighbor_NTI", "neighbor_LIN",
        "neighbor_SUL", "neighbor_AMG", "neighbor_CF1", "neighbor_MAC",
        "neighbor_POL", "neighbor_MON", "neighbor_GCC", "neighbor_TTC",
        "neighbor_OTR", "neighbor_LIP", "neighbor_CF2", "neighbor_ATI",
        "neighbor_IBL", "neighbor_ATP",
    ]
    + ["mask"]
)

# Rows with mask == 0 get the sentinel 666 written into their feature columns
df_dynamic = load_dynamic_data(df_windowed, keys, apply_masking=True, mask_value=666)

# APPLY THE PERMUTATION


**This notebook generates the 5 selected static features for the MLP model, as well as the 5 selected multivariate time series (MTS) features for the GRU model.**

In [14]:
# APPLY THE PERMUTATION
# Names of the static features listed in the 'Selected Feature' column
permutation_static = (
    pd.read_csv('../../0_Results_FS_PFI/MLP_selected_features.csv')['Selected Feature']
    .to_list()
)
permutation_static

['SAPSIIIScore', 'Age', 'YearOfAdmission', 'Gender', 'ReasonAdmission']

In [15]:
# Names of the dynamic features listed in the 'Selected Feature' column
permutation = (
    pd.read_csv('../../0_Results_FS_PFI/GRU_selected_features.csv')['Selected Feature']
    .to_list()
)
permutation

['isVM', 'PEN', 'neighbor_QUI', 'neighbor_CAR', 'neighbor_SUL']

In [16]:
# APPLY THE PERMUTATION
# Keep the identifiers, the first five selected dynamic features (in the order
# of `permutation`), the mask and the label -- in exactly this column order.
df_dynamic = df_dynamic[
    ['Admissiondboid', 'dayToDone']
    + [permutation[k] for k in range(5)]
    + ['mask', 'individualMRGerm']
]

In [17]:
# Number of distinct patients in each table before the intersection
print(df_dynamic["Admissiondboid"].unique().shape)
print(df_static["Admissiondboid"].unique().shape)

# Keep only the patients that are present in both tables
df_dynamic = df_dynamic.loc[df_dynamic["Admissiondboid"].isin(df_static["Admissiondboid"])]
df_static = df_static.loc[df_static["Admissiondboid"].isin(df_dynamic["Admissiondboid"])]

# Number of distinct patients in each table after the intersection
print(df_dynamic["Admissiondboid"].unique().shape)
print(df_static["Admissiondboid"].unique().shape)

(3158,)
(3808,)
(3158,)
(3158,)


In [18]:
non_scalable_features = ['Admissiondboid', "dayToDone", "individualMRGerm", "mask"]
static_non_scalable_features = ['Admissiondboid',  'Origin', 'ReasonAdmission', 'PatientCategory']
seeds = [142, 56, 78, 97]
n_kfold = 5
ratio_train_test = 0.8

# Static columns that are written to disk: the last five selected features,
# in the order of the feature-selection file.
static_columns = permutation_static[-5:]

# After "dayToDone" is dropped, the last three tensor columns are the patient
# id, the label and the mask (my_standard_scaler puts the non-scaled columns
# at the end). They are removed by position.
trailing_columns_to_remove = [-1, -2, -3]


def _one_row_per_patient(static_rows):
    """Collapse the repeated static rows to one row per patient, in order of first appearance."""
    # sort=False keeps the patients in the order in which they appear, which
    # is the order used by dataframe_to_tensor for the tensors and the labels.
    # The default (sorted by id) is only aligned with them when the patients
    # happen to appear in ascending id order.
    per_patient = static_rows.groupby(["Admissiondboid"], sort=False).mean().reset_index()
    per_patient = per_patient.drop(columns=["Admissiondboid"])
    return per_patient[static_columns]


for i, seed in enumerate(seeds):
    # One output folder per seed. It is created before the fold loop, so the
    # test files saved at the end do not rely on a name bound inside that loop.
    folder_path = "../../splits_" + str(n_timesteps) + "_days/PFI_NM/5_features/split_" + str(i)
    os.makedirs(folder_path, exist_ok=True)

    # Split train and test
    X_train, X_test = split_patients(df_dynamic, ratio=ratio_train_test, seed=seed)

    # Normalize (masked rows are left untouched)
    X_train_scaled, X_test_scaled = my_standard_scaler(
        X_train, X_test,
        non_scalable_features,
        apply_masking=True
    )

    # Reorder df for static features: one static row per dynamic row
    X_train_static = pd.merge(df_static, X_train_scaled.Admissiondboid, how="right")
    X_test_static = pd.merge(df_static, X_test_scaled.Admissiondboid, how="right")

    # Normalize static features
    X_train_static, X_test_static = my_standard_scaler(
        X_train_static, X_test_static,
        static_non_scalable_features,
        apply_masking=False
    )

    # Execute cross-validation over the patients, not over the rows
    all_patients_train = np.unique(X_train_scaled.Admissiondboid)
    kf = KFold(n_splits=n_kfold, shuffle=True, random_state=seed)

    for j, (train_index, val_index) in enumerate(kf.split(all_patients_train)):
        # Split train into train' and validation
        patients_train = all_patients_train[train_index]
        patients_val = all_patients_train[val_index]
        X_train_splitted = X_train_scaled[X_train_scaled.Admissiondboid.isin(patients_train)]
        X_val_splitted = X_train_scaled[X_train_scaled.Admissiondboid.isin(patients_val)]

        # Convert to time tensors
        X_train_tensor, y_train = dataframe_to_tensor(
            X_train_splitted, X_train_splitted[["Admissiondboid", "individualMRGerm"]],
            eliminateColumn=True, columns=["dayToDone"],
            timeStepLength=n_timesteps
        )
        X_val_tensor, y_val = dataframe_to_tensor(
            X_val_splitted, X_val_splitted[["Admissiondboid", "individualMRGerm"]],
            eliminateColumn=True, columns=["dayToDone"],
            timeStepLength=n_timesteps
        )

        # Reorder static features and retain a single sample per patient
        X_train_static_splitted = _one_row_per_patient(
            pd.merge(X_train_static, y_train.Admissiondboid, how="right")
        )
        X_val_static_splitted = _one_row_per_patient(
            pd.merge(X_train_static, y_val.Admissiondboid, how="right")
        )

        # Eliminate admissiondboid, label, and mask from tensors
        X_train_tensor = np.delete(X_train_tensor, trailing_columns_to_remove, axis=2)
        X_val_tensor = np.delete(X_val_tensor, trailing_columns_to_remove, axis=2)

        # Save the data
        np.save(folder_path + "/X_train_tensor_" + str(j), X_train_tensor)
        X_train_static_splitted.to_csv(folder_path + "/X_train_static_" + str(j) + ".csv")
        y_train.to_csv(folder_path + "/y_train_" + str(j) + ".csv", index=False)

        np.save(folder_path + "/X_val_tensor_" + str(j), X_val_tensor)
        X_val_static_splitted.to_csv(folder_path + "/X_val_static_" + str(j) + ".csv")
        y_val.to_csv(folder_path + "/y_val_" + str(j) + ".csv", index=False)

    # Convert test to tensor
    X_test_tensor, y_test = dataframe_to_tensor(
        X_test_scaled, X_test_scaled[["Admissiondboid", "individualMRGerm"]],
        eliminateColumn=True, columns=["dayToDone"],
        timeStepLength=n_timesteps
    )

    # Eliminate admissiondboid, label, and mask from test tensor
    X_test_tensor = np.delete(X_test_tensor, trailing_columns_to_remove, axis=2)

    # Retain a single sample per patient for static test features
    X_test_static = _one_row_per_patient(X_test_static)

    # Save test data
    np.save(folder_path + "/X_test_tensor", X_test_tensor)
    X_test_static.to_csv(folder_path + "/X_test_static.csv")
    y_test.to_csv(folder_path + "/y_test.csv", index=False)