### Prep the data for modeling.

* Take ratios of existing variables to make new features
* Start with a selected subset of features
* Create dummy or indicator features for categorical variables
* Standardize the magnitude of numeric features using a scaler
* Split your data into testing and training datasets

In [1]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import import_ipynb
import Data_Cleanup as dc

# The project root is the parent of the directory the notebook runs from.
cwd = os.getcwd()
maindir, _ = os.path.split(cwd)
print(maindir)

importing Jupyter notebook from Data_Cleanup.ipynb
/Users/maureenkeenan/Desktop/Kaggle/WiDs_Datathon_2020


In [2]:
# The interim dataset is read from the "data" folder under the project root.
filepath = os.path.join(maindir, 'data')
interim_csv = os.path.join(filepath, 'InterimData.csv')
df = pd.read_csv(interim_csv)

# Preview the first rows
df.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,Survived,has_h1,has_d1,has_both,has_inv,has_lmin,lactate_min,creatinine,albumin,ACR
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,Survived,1,1,2,1,1,1.0,2.3,2.3,1.0
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,Survived,1,1,2,1,1,3.5,1.6,1.6,1.0
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,Survived,1,1,2,1,1,1.0,,3.0,
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,Survived,1,1,2,1,1,1.0,,3.0,
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,Survived,1,1,2,1,1,1.0,,3.0,


In [3]:
# Count missing values per column to check whether the NaN corrections are present
df.isna().sum()

encounter_id          0
patient_id            0
hospital_id           0
hospital_death        0
age                4228
                  ...  
has_lmin              0
lactate_min           0
creatinine        48228
albumin               0
ACR               48228
Length: 196, dtype: int64

In [5]:
# Transform the data with transform_data from the imported Data_Cleanup notebook.
# NOTE(review): this rebinds `df`, so re-running the cell feeds already-transformed
# data back into transform_data -- confirm that is safe, or re-run from the load cell.
df = dc.transform_data(df)

In [8]:
# Columns selected from df as model inputs
features = [
    'hospital_id',
    'age',
    'bmi',
    'ethnicity',
    'elective_surgery',
    'has_h1',
    'icu_admit_source',
    'icu_type',
    'apache_3j_bodysystem',
    'apache_3j_diagnosis',
    'apache_2_diagnosis',
    'lactate_min',
    'ACR',
    'h1_temp_min',
]

In [9]:
# X holds only the selected feature columns; y is the hospital_death column
X = df[features]

y = df['hospital_death']

In [10]:
# Preview the selected features.
# Still to do: fill NA values, encode, and scale.
X.head(n=5)

Unnamed: 0,hospital_id,age,bmi,ethnicity,elective_surgery,has_h1,icu_admit_source,icu_type,apache_3j_bodysystem,apache_3j_diagnosis,apache_2_diagnosis,lactate_min,ACR,h1_temp_min
0,118,68.0,22.73,Caucasian,0,1,Floor,CTICU,Sepsis,502.01,113.0,1.0,1.0,37.5
1,81,77.0,27.42,Caucasian,0,1,Floor,Med-Surg ICU,Respiratory,203.01,108.0,3.5,1.0,36.3
2,118,25.0,31.95,Caucasian,0,1,Accident & Emergency,Med-Surg ICU,Metabolic,703.03,122.0,1.0,,36.7
3,118,81.0,22.64,Caucasian,1,1,Operating Room / Recovery,CTICU,Cardiovascular,1206.03,203.0,1.0,,34.8
4,33,19.0,27.654655,Caucasian,0,1,Accident & Emergency,Med-Surg ICU,Trauma,601.01,119.0,1.0,,


In [14]:
# Missing values per selected feature
X.isna().sum()

hospital_id                 0
age                         0
bmi                         0
ethnicity                   0
elective_surgery            0
has_h1                      0
icu_admit_source          112
icu_type                    0
apache_3j_bodysystem     1662
apache_3j_diagnosis      1101
apache_2_diagnosis       1662
lactate_min                 0
ACR                     48228
h1_temp_min             21732
dtype: int64

In [19]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing, stratified on the target so both splits
# keep the same class proportions. The seed is fixed so that re-running the
# notebook reproduces the same split.
RANDOM_STATE = 42
Xtrain, Xtest, ytrain, ytest = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

In [29]:
# Create function to process Xtrain 
def pipeline(Xdata, temp_fill=None, columns=None):
    """Impute, drop, and one-hot encode a feature frame, then fit a scaler on it.

    Steps, in order:
      1. Missing ``ACR`` values are replaced with 0.
      2. Missing ``h1_temp_min`` values are replaced with ``temp_fill``.
      3. Rows that still contain any missing value are dropped.
      4. Object/string/category columns are one-hot encoded with
         ``pd.get_dummies``.
      5. A ``StandardScaler`` is fitted on the result. The returned frame is
         NOT scaled; call ``scaler.transform`` on it.

    The caller's frame is never modified: all work happens on a copy.
    Because step 3 removes rows, callers must re-align their labels with the
    returned frame's index (e.g. ``y.loc[result.index]``).

    Parameters
    ----------
    Xdata : pandas.DataFrame
        Feature frame containing at least ``ACR`` and ``h1_temp_min`` columns.
    temp_fill : float, optional
        Value used to fill missing ``h1_temp_min``. Defaults to the mean of
        that column in ``Xdata``. Pass the training mean when processing a
        test split so both splits are imputed with the same value.
    columns : sequence of str, optional
        If given, the encoded frame is reindexed to exactly these columns:
        dummy columns absent from ``Xdata`` are added and filled with 0, and
        columns not listed are dropped. Pass the training frame's columns when
        processing a test split so both share one layout.

    Returns
    -------
    tuple of (pandas.DataFrame, sklearn.preprocessing.StandardScaler)
        The processed (unscaled) frame and a scaler fitted on it.
    """
    from sklearn.preprocessing import StandardScaler

    # Copy first: assigning columns on the caller's frame would mutate it and
    # raises SettingWithCopyWarning when the frame is a slice of another one.
    Xdata = Xdata.copy()

    # Fix NA
    if temp_fill is None:
        temp_fill = Xdata['h1_temp_min'].mean()
    Xdata['ACR'] = Xdata['ACR'].fillna(0)
    Xdata['h1_temp_min'] = Xdata['h1_temp_min'].fillna(temp_fill)
    Xdata = Xdata.dropna(axis=0)

    # Encode categorical
    Xdata = pd.get_dummies(Xdata)
    if columns is not None:
        # Force a fixed column layout (e.g. the training layout).
        Xdata = Xdata.reindex(columns=columns, fill_value=0)

    # Fit (but do not apply) the scaler
    trained_scaler = StandardScaler().fit(Xdata)

    return Xdata, trained_scaler
    

In [34]:
# Process Xtrain: impute / drop / encode, and fit the scaler on the training split
Xtrain, scaler = pipeline(Xtrain)
# pipeline() drops rows that still contain NaNs, so keep the labels in step
# with the rows that survived.
ytrain = ytrain.loc[Xtrain.index]
Xtrain_transformed = scaler.transform(Xtrain)

# Process Xtest with the same steps; the scaler fitted on the test split is discarded
Xtest, unused = pipeline(Xtest)
# One-hot encoding each split separately can yield different dummy columns.
# Force the training layout (missing dummies -> 0, unseen ones dropped) so the
# training scaler sees the columns it was fitted on, in the same order.
Xtest = Xtest.reindex(columns=Xtrain.columns, fill_value=0)
ytest = ytest.loc[Xtest.index]
Xtest_transformed = scaler.transform(Xtest)