In [3]:
import pandas as pd
import numpy as np
import re
import pickle
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import ensemble, linear_model

### Functions for cleaning and ML

In [2]:
# Drop reporting/imputed flag columns and integer-encode the categorical (object) columns
def cleanDF (df):
    """Return a model-ready copy of ``df``.

    Columns whose names match the patterns '.*reporting' or '.*imputed' are
    removed. Of the remaining columns (plus 'admit_NICU'), only those with one
    of the listed int/float dtypes or with object dtype survive; any other
    dtype is silently left out of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw frame; must contain an 'admit_NICU' column.

    Returns
    -------
    pandas.DataFrame
        Numeric columns first, followed by the object columns replaced by
        integer codes from ``LabelEncoder`` (re-fitted independently for each
        column).
    """
    target_col = 'admit_NICU'
    flag_patterns = (re.compile('.*reporting'), re.compile('.*imputed'))

    # Everything matching a flag pattern is excluded; the target is excluded
    # here too so it can be appended exactly once below.
    excluded = {target_col}
    for pattern in flag_patterns:
        excluded.update(name for name in df.columns if pattern.match(name))

    feature_cols = [name for name in df.columns if name not in excluded]
    selected = df[feature_cols + [target_col]].copy()

    numeric_kinds = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_part = selected.select_dtypes(include=numeric_kinds).copy()
    object_part = selected.select_dtypes(include=object).copy()

    # apply() calls fit_transform once per column, so each column gets its own mapping.
    encoder = LabelEncoder()
    encoded_part = object_part.apply(encoder.fit_transform)

    return pd.concat([numeric_part, encoded_part], axis=1)

def GLM_CV_Lasso (cleanDF,sample_per_year):
    
    dwnSmplDF = cleanDF.groupby('birth_year',group_keys = False).apply(lambda x: x.sample(sample_per_year))
    encoded_target = dwnSmplDF['admit_NICU']

    glm_lassoCV = linear_model.LogisticRegressionCV(#Cs = int(1e4),
                                                cv = 5,
                                                penalty = 'l1',
                                                solver = 'saga',
                                                n_jobs = -1,
                                                random_state = 108
                                               ).fit(cleanDF, encoded_target)
    print('sample size : %d\n' % (sample_per_year*5))
    %time glm_lassoCV.fit(cleanDF, encoded_target)
    print('\nscore    : {0}'.format(glm_lassoCV.score(cleanDF, encoded_target)))
    print('-'*50)
    return glm_lassoCV

def Forest_CV(X,Y):
    randomForest = ensemble.RandomForestClassifier()
    grid_para_forest = [{
        'n_estimators': np.linspace(50,int(np.sqrt(len(cl_df))),10,dtype=int),
        'min_samples_leaf' : range(100,1000,100)
    }]
    randomForest.set_params(random_state=108)
    gs_forest = GridSearchCV(randomForest, grid_para_forest, scoring='accuracy', 
                                      cv=5, n_jobs=-1)
    %time gs_forest.fit(X, Y)
    return gs_forest

def RF (X,Y,gs):
    """Train a random forest configured with a grid search's best parameters.

    Parameters
    ----------
    X : feature matrix passed to ``fit``.
    Y : target labels passed to ``fit``.
    gs : fitted search object exposing ``best_params_``.

    Returns
    -------
    The fitted ``RandomForestClassifier`` (seed 108, all cores, OOB scoring on).
    """
    forest = ensemble.RandomForestClassifier()
    # The tuned parameters are merged with the fixed settings in one call.
    forest.set_params(**gs.best_params_,
                      random_state=108,
                      n_jobs=-1,
                      oob_score=True)
    return forest.fit(X, Y)

def pred_resultsRF(fullDF, gsf, rf):
    """Print a confusion matrix and summary scores for a fitted forest.

    Parameters
    ----------
    fullDF : pandas.DataFrame
        Frame holding the features plus the 'admit_NICU' target.
    gsf : fitted search object exposing ``best_params_`` and ``best_score_``.
    rf : fitted classifier used for prediction.

    Returns
    -------
    None. Everything is written to stdout.
    """
    from sklearn.metrics import confusion_matrix

    actual = fullDF.admit_NICU
    predicted = rf.predict(fullDF.drop('admit_NICU',axis=1))

    # Confusion matrix scaled to a percentage of all rows in fullDF.
    pct_matrix = confusion_matrix(actual, predicted) / len(fullDF) * 100
    print(pct_matrix)

    report = '''Random Forest
    best param : {0}
    best score : {1}
    r2         : {2}'''
    print(report.format(gsf.best_params_,
                        gsf.best_score_,
                        r2_score(actual, predicted)))
    return

def pred_resultsGLM(glm, data=None, target_col='admit_NICU'):
    """Print the row count, mean accuracy and r2 of a fitted GLM on ``data``.

    Parameters
    ----------
    glm : fitted estimator exposing ``score`` and ``predict``.
    data : pandas.DataFrame, optional
        Frame holding the features and the target column. When omitted, the
        notebook-level ``cl_df`` is used so ``pred_resultsGLM(model)`` keeps
        working.
    target_col : str, optional
        Name of the target column inside ``data``.

    Returns
    -------
    None. Everything is written to stdout.
    """
    if data is None:
        # Backward-compatible default: evaluate on the cleaned notebook frame.
        data = cl_df

    # Take the target from the frame being evaluated rather than relying on a
    # variable that only ever existed as a local inside the training function.
    y_true = data[target_col]

    # Use exactly the columns the model was fitted on when the estimator
    # records them; otherwise use every column except the target.
    fitted_cols = getattr(glm, 'feature_names_in_', None)
    if fitted_cols is not None:
        X_eval = data[list(fitted_cols)]
    else:
        X_eval = data.drop(target_col, axis=1)

    print('sample size: {0} \nscore: {1}\nr2: {2}'
          .format(len(y_true),
                  glm.score(X_eval, y_true),
                  r2_score(y_true, glm.predict(X_eval))))
    return

def feature_select(X,rf):
    """Return the (up to) 20 most important features of a fitted model.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose column names label the importances, in order.
    rf : object exposing ``feature_importances_`` (one value per column of X).

    Returns
    -------
    pandas.DataFrame
        Columns 'col' (feature name) and 'features' (importance), sorted from
        most to least important, at most 20 rows.
    """
    column_names = list(X.columns)
    importances = rf.feature_importances_
    importance_table = pd.DataFrame({'col': column_names,
                                     'features': importances})
    top_features = importance_table.nlargest(20, 'features')
    return top_features

### Load data and clean

In [2]:
# Read the raw CSV (path is relative to the notebook's working directory).
totDF = pd.read_csv('../data/raw/Cleaned_data_set.csv')
# Drop flag columns and integer-encode object columns; cl_df is reused by the cells below.
cl_df = cleanDF(totDF)

### GLM

In [None]:
# Fit the lasso logistic regression using 20000 sampled rows per birth year.
logit_1 = GLM_CV_Lasso(cl_df,20000)

In [None]:
# Print the summary metrics for the fitted GLM.
pred_resultsGLM(logit_1)

### Keep only rows whose NICU admission is Y or N

In [4]:
# Split the encoded frame by target code, then stack the two subsets.
# NOTE(review): codes 2 and 0 are presumably the LabelEncoder values for 'Y' and 'N'
# (with a third category, e.g. unknown, encoded as 1) — confirm against the raw data.
nicu_allY = cl_df.loc[cl_df['admit_NICU']==2]
nicu_allN = cl_df.loc[cl_df['admit_NICU']==0]
# Rows with any other target code are left out of nicu_YN.
nicu_YN = pd.concat([nicu_allY,nicu_allN],axis=0)

### RF on Balanced Sample per year

In [5]:
sample_per_year = 10000
sample_seed = 108  # fixed seed so the balanced sample is identical on every run

# Draw the same number of rows per birth year from each outcome class, so the
# training set is balanced both across classes and across years.
# DataFrame.sample raises ValueError if a year has fewer rows than sample_per_year.
bal_dwnSmplY = nicu_allY.groupby('birth_year',group_keys = False).apply(
    lambda x: x.sample(sample_per_year, random_state = sample_seed))
bal_dwnSmplN = nicu_allN.groupby('birth_year',group_keys = False).apply(
    lambda x: x.sample(sample_per_year, random_state = sample_seed))
bal_dwnSmpl = pd.concat([bal_dwnSmplY,bal_dwnSmplN],axis=0)

bal_target = bal_dwnSmpl.admit_NICU #target
bal_X = bal_dwnSmpl.drop('admit_NICU',axis=1) #X

#### Grid-search the RF hyperparameters, then train an RF with the best parameters

In [6]:
# Hyper-parameter search on the balanced sample (the recorded run below took about 1h 40min).
bal_gs = Forest_CV(bal_X,bal_target)
# Train a forest with the winning parameters on the same balanced sample.
bal_rf = RF(bal_X,bal_target,bal_gs)

Wall time: 1h 40min 16s


In [9]:
# Evaluate the forest on every Y/N row.
# NOTE(review): nicu_YN contains the rows the model was trained on, so these
# figures are not a pure hold-out estimate.
pred_resultsRF(nicu_YN,bal_gs,bal_rf)
# The last expression is displayed: the most important features of the fitted forest.
feature_select(bal_X,bal_rf)

[[808182  93293]
 [ 16978  69463]]
Random Forest
    best param : {'min_samples_leaf': 100, 'n_estimators': 894}
    best score : 0.84804
    r2         : -0.3980022248736994


Unnamed: 0,col,features
45,obst_est_edit_wk,0.212256
46,birth_weight_gm,0.143388
44,combined_gestation_wk,0.129781
39,APGAR_score_5min,0.095569
86,antibiotics_for_newborn,0.093497
83,assist_vent_immed,0.088907
84,assist_vent_after6,0.028562
100,infant_transferred,0.027894
71,steriods,0.023197
36,final_delivery_method,0.022347


#### Save model for later use if needed

In [None]:
bal_100k = '../data/processed/bal100K_model.sav'
# Write through a context manager so the handle is flushed and closed even if
# pickling fails part-way. The fitted grid-search object is what gets saved.
# NOTE: only unpickle this file from a trusted location — pickle.load can execute arbitrary code.
with open(bal_100k, 'wb') as model_file:
    pickle.dump(bal_gs, model_file)