### Prep the data for modeling.

* Take ratio to make new features
* Start with a select number of features
* Create dummy or indicator features for categorical variables
* Standardize the magnitude of numeric features using a scaler
* Split your data into testing and training datasets

In [76]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import import_ipynb
import Data_Cleanup as dc

cwd = os.getcwd()
maindir = os.path.dirname(cwd)
print(maindir)

/Users/maureenkeenan/Desktop/Kaggle/WiDs_Datathon_2020


In [77]:
filepath = os.path.join(maindir,'data/external')
df = pd.read_csv(os.path.join(filepath,'training_v2.csv'))
#Check out the data
df.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,aids,cirrhosis,diabetes_mellitus,hepatic_failure,immunosuppression,leukemia,lymphoma,solid_tumor_with_metastasis,apache_3j_bodysystem,apache_2_bodysystem
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Sepsis,Cardiovascular
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,Respiratory,Respiratory
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Metabolic,Metabolic
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Cardiovascular,Cardiovascular
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Trauma,Trauma


In [78]:
# Transform the data
df = dc.transform_data(df)
df.head()

Unnamed: 0,encounter_id,patient_id,hospital_id,hospital_death,age,bmi,elective_surgery,ethnicity,gender,height,...,apache_3j_bodysystem,apache_2_bodysystem,bilirubin,bun,lactate_ind,lactate,has_h1,albumin,creatinine,ACR
0,66154,25312,118,0,68.0,22.73,0,Caucasian,M,180.3,...,Sepsis,Cardiovascular,0.4,31.0,1,1.0,1,2.3,2.51,0.916335
1,114252,59342,81,0,77.0,27.42,0,Caucasian,F,160.0,...,Respiratory,Respiratory,0.5,9.0,1,3.5,1,1.6,0.56,2.857143
2,119783,50777,118,0,25.0,31.95,0,Caucasian,F,172.7,...,Metabolic,Metabolic,0.6,18.0,0,1.0,1,3.0,0.98,3.061224
3,79267,46918,118,0,81.0,22.64,1,Caucasian,F,165.1,...,Cardiovascular,Cardiovascular,0.6,18.0,0,1.0,1,3.0,0.98,3.061224
4,92056,34377,33,0,19.0,,0,Caucasian,M,188.0,...,Trauma,Trauma,0.6,18.0,0,1.0,1,3.0,0.98,3.061224


In [79]:
features = ['hospital_id','age','bmi',
            'ethnicity','elective_surgery','has_h1','icu_admit_source', 'icu_type',
            'apache_3j_bodysystem','apache_3j_diagnosis','apache_2_diagnosis',
            'lactate','ACR','h1_temp_min']

In [80]:
# Split into X (select features) and y
y = df.hospital_death

X = df[features]

In [81]:
X.head()
#fillNA and encode and scale

Unnamed: 0,hospital_id,age,bmi,ethnicity,elective_surgery,has_h1,icu_admit_source,icu_type,apache_3j_bodysystem,apache_3j_diagnosis,apache_2_diagnosis,lactate,ACR,h1_temp_min
0,118,68.0,22.73,Caucasian,0,1,Floor,CTICU,Sepsis,502.01,113.0,1.0,0.916335,37.5
1,81,77.0,27.42,Caucasian,0,1,Floor,Med-Surg ICU,Respiratory,203.01,108.0,3.5,2.857143,36.3
2,118,25.0,31.95,Caucasian,0,1,Accident & Emergency,Med-Surg ICU,Metabolic,703.03,122.0,1.0,3.061224,36.7
3,118,81.0,22.64,Caucasian,1,1,Operating Room / Recovery,CTICU,Cardiovascular,1206.03,203.0,1.0,3.061224,34.8
4,33,19.0,,Caucasian,0,1,Accident & Emergency,Med-Surg ICU,Trauma,601.01,119.0,1.0,3.061224,


In [82]:
X.isnull().sum()

hospital_id                 0
age                      4215
bmi                      3403
ethnicity                1380
elective_surgery            0
has_h1                      0
icu_admit_source            0
icu_type                    0
apache_3j_bodysystem     1646
apache_3j_diagnosis      1087
apache_2_diagnosis       1646
lactate                     0
ACR                         0
h1_temp_min             21678
dtype: int64

In [83]:
from sklearn.model_selection import train_test_split

Xtrain, Xtest, ytrain, ytest = train_test_split(X,y,test_size = 0.2, stratify=y)

In [84]:
# Create function to process Xtrain 
from sklearn.impute import SimpleImputer

def getfillNA(Xdata):
    # Fill NA with means
    fill_h1_temp_min = Xdata['h1_temp_min'].mean()
    fill_age = Xdata['age'].mean()
    fill_apache_2_diagnosis = Xdata['apache_2_diagnosis'].mean()
    fill_apache_3j_diagnosis = Xdata['apache_3j_diagnosis'].mean()
    
    fill_bmi = Xdata['bmi'].median()
    
    fills = (fill_h1_temp_min,fill_age,fill_apache_2_diagnosis,fill_apache_3j_diagnosis,fill_bmi)
    
    return Xdata, fills

def fillNA(Xdata, fills):
    # Fix NA with set values
    Xdata.loc[:, 'ACR'].fillna(0, inplace=True)
    Xdata.loc[:, 'apache_3j_bodysystem'].fillna('NotGiven', inplace=True)
    Xdata.loc[:, 'ethnicity'].fillna('NotGiven', inplace=True)
    
    Xdata.loc[:, 'h1_temp_min'].fillna(fills[0], inplace=True)
    Xdata.loc[:, 'age'].fillna(fills[1], inplace=True)
    Xdata.loc[:, 'apache_2_diagnosis'].fillna(fills[2], inplace=True)
    Xdata.loc[:, 'apache_3j_diagnosis'].fillna(fills[3], inplace=True)
    Xdata.loc[:, 'bmi'].fillna(fills[4], inplace=True)
                        
    return Xdata
    
def encode(Xdata):

    # Encode categorical
    Xdata = pd.get_dummies(Xdata)
    features = Xdata.columns.to_list()
    
    # Scaled numerical
    from sklearn.preprocessing import StandardScaler
    trained_scaler = StandardScaler().fit(Xdata)
    
    return Xdata, trained_scaler, features
    

In [87]:
# Process Xtrain 
# Calculate fill value from training data 
Xtrain, trained_fills = getfillNA(Xtrain)

# Fill with means from training data
Xtrain = fillNA(Xtrain,trained_fills)

# Train scaler and get dummies
Xtrain, trained_scaler, features = encode(Xtrain)

# Transform data with trained scaler
Xtrain = trained_scaler.transform(Xtrain)

IndexError: only integers, slices (`:`), ellipsis (`...`), numpy.newaxis (`None`) and integer or boolean arrays are valid indices

In [86]:
# Use Xtrain data to process Xtest
Xtest, unused = getfillNA(Xtest)
Xtest = fillNA(Xtest,trained_fills)
Xtest, unused, unused = encode(Xtest)
Xtest = trained_scaler.transform(Xtest)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._update_inplace(new_data)


In [88]:
features

['hospital_id',
 'age',
 'bmi',
 'elective_surgery',
 'has_h1',
 'apache_3j_diagnosis',
 'apache_2_diagnosis',
 'lactate',
 'ACR',
 'h1_temp_min',
 'ethnicity_African American',
 'ethnicity_Asian',
 'ethnicity_Caucasian',
 'ethnicity_Hispanic',
 'ethnicity_Native American',
 'ethnicity_NotGiven',
 'ethnicity_Other/Unknown',
 'icu_admit_source_Accident & Emergency',
 'icu_admit_source_Floor',
 'icu_admit_source_Operating Room / Recovery',
 'icu_admit_source_Other Hospital',
 'icu_admit_source_Other ICU',
 'icu_type_CCU-CTICU',
 'icu_type_CSICU',
 'icu_type_CTICU',
 'icu_type_Cardiac ICU',
 'icu_type_MICU',
 'icu_type_Med-Surg ICU',
 'icu_type_Neuro ICU',
 'icu_type_SICU',
 'apache_3j_bodysystem_Cardiovascular',
 'apache_3j_bodysystem_Gastrointestinal',
 'apache_3j_bodysystem_Genitourinary',
 'apache_3j_bodysystem_Gynecological',
 'apache_3j_bodysystem_Hematological',
 'apache_3j_bodysystem_Metabolic',
 'apache_3j_bodysystem_Musculoskeletal/Skin',
 'apache_3j_bodysystem_Neurological',
 '