# Building up on Question 2.1.1 

In [None]:
# Core Libraries
import os
import glob
import itertools

# Data Handling
import numpy as np
import pandas as pd
import pyarrow as pa
import pyarrow.parquet as pq

# Progress Bar
from tqdm import tqdm

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import export_graphviz
from IPython.display import Image
from sklearn.metrics import ConfusionMatrixDisplay, roc_curve, auc

# Statistics
from scipy.stats import skew, kurtosis, randint

# Modeling
from sklearn.preprocessing import MinMaxScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    average_precision_score,
    recall_score,
    confusion_matrix,
    roc_auc_score,
)
from sklearn import linear_model
from sklearn.linear_model import LogisticRegression


In [2]:

# Per-patient descriptor columns. RecordID is the patient key; ICUType is
# deliberately excluded from this list.
static_variables = [
    name
    for name in ('RecordID', 'Age', 'Gender', 'Height', 'ICUType', 'Weight')
    if name != 'ICUType'
]

# Static columns that are kept as features (RecordID is excluded).
static_variables_we_want = ['Age', 'Gender', 'Height', 'Weight']

# Full list of variable names: static, time-varying, RecordID and ICUType.
all_variables = [
    'Weight', 'Age', 'TroponinI', 'DiasABP', 'MechVent', 'HCO3',
    'Cholesterol', 'HCT', 'SaO2', 'WBC', 'SysABP', 'Urine', 'ICUType',
    'Gender', 'ALP', 'Creatinine', 'K', 'AST', 'Glucose', 'RespRate', 'MAP',
    'FiO2', 'BUN', 'Na', 'Bilirubin', 'TroponinT', 'PaCO2', 'Height', 'GCS',
    'HR', 'pH', 'PaO2', 'Lactate', 'ALT', 'NISysABP', 'RecordID',
    'Platelets', 'Temp', 'Mg', 'NIDiasABP', 'Albumin', 'NIMAP',
]

# Time-varying variables: everything that is neither static nor ICUType,
# in the order of all_variables, followed by 'Weight_VAR'.
# NOTE(review): 'Weight_VAR' is presumably a time-varying weight column
# produced by the upstream preprocessing - confirm.
dyn_variables = [
    name
    for name in all_variables
    if name not in static_variables and name != 'ICUType'
]
dyn_variables.append('Weight_VAR')

# Feature columns in their initial order: static features first, then dynamic.
initial_column_lists = [*static_variables_we_want, *dyn_variables]

In [3]:

# Load the three pre-processed data sets (A, B and C) from parquet.
df_a = pd.read_parquet('data/processed_raw_data_set-a_1.parquet', engine='pyarrow')
df_b = pd.read_parquet('data/processed_raw_data_set-b_1.parquet', engine='pyarrow')
df_c = pd.read_parquet('data/processed_raw_data_set-c_1.parquet', engine='pyarrow')

# When this flag is on, the ICUType column is removed from all three sets.
# A missing ICUType column still raises KeyError, as before.
drop_ICUType = True
if drop_ICUType:
    df_a = df_a.drop(columns=['ICUType'])
    df_b = df_b.drop(columns=['ICUType'])
    df_c = df_c.drop(columns=['ICUType'])

# Drop the Time column from every set that has one. errors='ignore' makes
# the check per-frame: testing only df_a would raise KeyError when B or C
# lacks 'Time', and would leave 'Time' in B and C when df_a lacks it.
df_a = df_a.drop(columns=['Time'], errors='ignore')
df_b = df_b.drop(columns=['Time'], errors='ignore')
df_c = df_c.drop(columns=['Time'], errors='ignore')




## Computing features vectors of our patient

Instead of working on the table where the missing values had already been filled, I prefer working on the unfilled table, because otherwise the filled values would enter the statistics and might flatten the profile of patients with many missing values. I then compute, for each patient, the mean (along with the standard deviation, maximum, minimum and skewness) of each variable over the 49 timestamps.

Then I compute the median of each column of the resulting table for set A and use it to fill the remaining missing values in all three sets.

In [4]:
# Feature-count check: number of static features, number of dynamic
# variables, and their total.
(
    len(static_variables_we_want),
    len(dyn_variables),
    len(static_variables_we_want + dyn_variables),
)

(4, 37, 41)

In [5]:
# Aggregation spec for set A, in output-column order: five summary
# statistics for every dynamic variable, then the first value of RecordID,
# then the first value of each static feature present in df_a.
agg_funcs = {
    **{col: ['mean', 'std', 'max', 'min', 'skew'] for col in dyn_variables},
    'RecordID': 'first',
    **{col: 'first' for col in static_variables_we_want if col in df_a.columns},
}

# One row per patient, all statistics computed in a single groupby pass.
df_a_agg = df_a.groupby('RecordID').agg(agg_funcs)

# Flatten the (variable, statistic) column pairs into 'variable_statistic'.
df_a_agg.columns = ['_'.join(pair).strip() for pair in df_a_agg.columns]


In [6]:
# Aggregation spec for set B, in output-column order: five summary
# statistics for every dynamic variable, then the first value of RecordID,
# then the first value of each static feature present in df_b.
agg_funcs = {
    **{col: ['mean', 'std', 'max', 'min', 'skew'] for col in dyn_variables},
    'RecordID': 'first',
    **{col: 'first' for col in static_variables_we_want if col in df_b.columns},
}

# One row per patient, all statistics computed in a single groupby pass.
df_b_agg = df_b.groupby('RecordID').agg(agg_funcs)

# Flatten the (variable, statistic) column pairs into 'variable_statistic'.
df_b_agg.columns = ['_'.join(pair).strip() for pair in df_b_agg.columns]


In [7]:
# Aggregation spec for set C, in output-column order: five summary
# statistics for every dynamic variable, then the first value of RecordID,
# then the first value of each static feature present in df_c.
agg_funcs = {
    **{col: ['mean', 'std', 'max', 'min', 'skew'] for col in dyn_variables},
    'RecordID': 'first',
    **{col: 'first' for col in static_variables_we_want if col in df_c.columns},
}

# One row per patient, all statistics computed in a single groupby pass.
df_c_agg = df_c.groupby('RecordID').agg(agg_funcs)

# Flatten the (variable, statistic) column pairs into 'variable_statistic'.
df_c_agg.columns = ['_'.join(pair).strip() for pair in df_c_agg.columns]

In [8]:
# Column-wise medians of the aggregated set A features.
df_a_agg_median = df_a_agg.median()

# Replace every NaN with the set A median of its column. The same set A
# medians are applied to sets B and C.
# NOTE(review): presumably this keeps B and C from contributing to the
# imputation statistics - confirm that this is the intent.
df_a_agg = df_a_agg.fillna(df_a_agg_median)
df_b_agg = df_b_agg.fillna(df_a_agg_median)
df_c_agg = df_c_agg.fillna(df_a_agg_median)

In [9]:
# Names of the df_a_agg columns that still contain at least one NaN after
# the median fill.
missing_values_a = df_a_agg.columns[df_a_agg.isnull().any()].tolist()
print("Missing values in df_a_agg:", missing_values_a)

Missing values in df_a_agg: ['Cholesterol_skew']


In [10]:
# Names of the df_b_agg columns that still contain at least one NaN after
# the median fill.
missing_values_b = df_b_agg.columns[df_b_agg.isnull().any()].tolist()
print("Missing values in df_b_agg:", missing_values_b)

Missing values in df_b_agg: ['Cholesterol_skew']


In [11]:
# Names of the df_c_agg columns that still contain at least one NaN after
# the median fill.
missing_values_c = df_c_agg.columns[df_c_agg.isnull().any()].tolist()
print("Missing values in df_c_agg:", missing_values_c)

Missing values in df_c_agg: ['Cholesterol_skew']


In [12]:
# Remove the Cholesterol_skew feature from all three aggregated sets; it is
# the column reported above as still containing missing values.
df_a_agg, df_b_agg, df_c_agg = [
    frame.drop(columns=['Cholesterol_skew'])
    for frame in (df_a_agg, df_b_agg, df_c_agg)
]

In [13]:
# Number of distinct values of the Weight_VAR mean and std features in set A.
tuple(
    len(df_a_agg[column].unique())
    for column in ('Weight_VAR_mean', 'Weight_VAR_std')
)

(2110, 1471)

In [14]:
# Sanity check before saving: no aggregated set may contain a NaN. On
# failure the message names the frame and the offending columns, so the
# problem can be located without re-running the inspection cells.
for frame_name, frame in (
    ('df_a_agg', df_a_agg),
    ('df_b_agg', df_b_agg),
    ('df_c_agg', df_c_agg),
):
    columns_with_nan = frame.columns[frame.isnull().any()].tolist()
    assert not columns_with_nan, (
        f"{frame_name} still has missing values in: {columns_with_nan}"
    )

In [None]:
# Write the three aggregated feature tables to parquet. index=False means
# the RecordID group index is not written to the files.
for path, frame in (
    ('data/set_a_for_q2_1_more_feat.parquet', df_a_agg),
    ('data/set_b_for_q2_1_more_feat.parquet', df_b_agg),
    ('data/set_c_for_q2_1_more_feat.parquet', df_c_agg),
):
    frame.to_parquet(path, engine='pyarrow', index=False)