In [1]:
import pandas as pd
import random as rn
import numpy as np
import re
import pickle
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import ensemble

In [2]:
# Load the source CSV; the relative path is resolved against the notebook's working directory.
totDF = pd.read_csv('../data/raw/Cleaned_data_set.csv')

### Clean the DataFrame and label-encode the categorical columns

In [60]:
def cleanDF (df):
    """Drop bookkeeping columns and return an all-numeric copy of ``df``.

    Columns whose names match ``.*reporting`` or ``.*imputed`` are discarded.
    ``admit_NICU`` is moved to the end of the working copy, and the frame is
    then split by dtype: columns with one of the listed int/float dtypes are
    kept unchanged, and every ``object`` column is passed through sklearn's
    ``LabelEncoder``. Columns of any other dtype are not carried over.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame. It must contain an ``admit_NICU`` column; the column
        selection raises ``KeyError`` otherwise.

    Returns
    -------
    pandas.DataFrame
        The numeric columns followed by the encoded object columns.
    """
    reporting_pat = re.compile('.*reporting')
    imputed_pat = re.compile('.*imputed')

    # Preserve the original column order; the target is excluded here and
    # re-appended explicitly just below.
    feature_cols = [
        name for name in df.columns
        if not (reporting_pat.match(name)
                or imputed_pat.match(name)
                or name == 'admit_NICU')
    ]
    working = df[feature_cols + ['admit_NICU']].copy()

    numeric_dtypes = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
    numeric_part = working.select_dtypes(include=numeric_dtypes).copy()
    object_part = working.select_dtypes(include=object).copy()

    # A single encoder instance is refit for each column in turn, so the
    # fitted classes of earlier columns are not retained.
    encoder = LabelEncoder()
    encoded_part = object_part.apply(encoder.fit_transform)

    return pd.concat([numeric_part, encoded_part], axis=1)

def pred_results(fullDF, gsf, rf):
    """Print a confusion matrix and a short summary for a fitted classifier.

    Parameters
    ----------
    fullDF : pandas.DataFrame
        Feature columns plus an ``admit_NICU`` column holding the true labels.
    gsf : object
        Fitted search object; ``best_params_`` and ``best_score_`` are read.
    rf : object
        Fitted estimator; ``predict`` is called on ``fullDF`` without the
        ``admit_NICU`` column.

    Returns
    -------
    None
        Everything is written to stdout.
    """
    from sklearn.metrics import confusion_matrix

    actual = fullDF.admit_NICU
    predicted = rf.predict(fullDF.drop('admit_NICU',axis=1))

    # Confusion matrix of true labels against predicted labels.
    print(confusion_matrix(actual, predicted))

    # NOTE(review): r2_score is a regression metric and treats the class
    # labels as plain numbers here - confirm this is the intended measure.
    report = '''Random Forest
    best param : {0}
    best score : {1}
    r2         : {2}'''
    print(report.format(gsf.best_params_,
                        gsf.best_score_,
                        r2_score(actual, predicted)))
    return

### Split the rows into NICU-admitted (Y) and not-admitted (N) subsets

In [100]:
cl_df = cleanDF(totDF)
# presumably 2 and 0 are the label-encoded 'Y' and 'N' values of admit_NICU;
# verify against the raw column before changing either constant.
nicu_allY = cl_df[cl_df.admit_NICU.eq(2)]
nicu_allN = cl_df[cl_df.admit_NICU.eq(0)]
# Stack the two subsets (Y rows first); rows with any other code are left out.
nicu_YN = pd.concat([nicu_allY, nicu_allN], axis=0)

### RF on Balanced Sample per year

In [123]:
sample_per_year = 10000
# Draw the same number of rows per birth year from each class. The draws are
# seeded (108, the seed used for the forests) so that a Restart & Run All
# reproduces the same training set; the original unseeded draws did not.
# NOTE: DataFrame.sample raises ValueError if a year has fewer rows than
# sample_per_year in either class.
bal_dwnSmplY = nicu_allY.groupby('birth_year',group_keys = False).apply(lambda x: x.sample(sample_per_year, random_state=108))
bal_dwnSmplN = nicu_allN.groupby('birth_year',group_keys = False).apply(lambda x: x.sample(sample_per_year, random_state=108))
bal_dwnSmpl = pd.concat([bal_dwnSmplY,bal_dwnSmplN],axis=0)
# Displayed as the cell output: the number of non-null target values.
bal_dwnSmpl.admit_NICU.count()

100000

In [124]:
# Separate the label column from the feature columns of the balanced sample.
bal_target = bal_dwnSmpl['admit_NICU']
bal_X = bal_dwnSmpl.drop(columns='admit_NICU')

In [136]:
def gsForest(X,Y):
    randomForest = ensemble.RandomForestClassifier()
    grid_para_forest = [{
        'n_estimators': np.linspace(50,int(np.sqrt(len(cl_df))),10,dtype=int),
        'min_samples_leaf' : range(100,1000,100)
    }]
    randomForest.set_params(random_state=108)
    balGS_forest = GridSearchCV(randomForest, grid_para_forest, scoring='accuracy', 
                                      cv=5, n_jobs=-1)
    %time balGS_forest.fit(X, Y)
    return balGS_forest

In [125]:
# Run the grid search through the gsForest helper rather than repeating its
# estimator, parameter grid and GridSearchCV setup inline; the helper uses the
# same seed (108), grid, scoring and cv settings and times the fit itself.
balGS_forest = gsForest(bal_X, bal_target)
print(balGS_forest.best_params_)

Wall time: 1h 33min 37s
{'min_samples_leaf': 100, 'n_estimators': 894}


#### Train RF with best params

In [126]:
# Refit one forest on the whole balanced sample using the winning grid
# parameters; oob_score=True additionally records the out-of-bag estimate.
best_params = balGS_forest.best_params_
balRF = ensemble.RandomForestClassifier(
    random_state=108,
    n_jobs=-1,
    oob_score=True,
    **best_params
)
balRF.fit(bal_X,bal_target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=100, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=894,
                       n_jobs=-1, oob_score=True, random_state=108, verbose=0,
                       warm_start=False)

In [134]:
# Evaluate the balanced-sample forest on every Y/N row; note that nicu_YN
# also contains the rows the forest was fitted on.
pred_results(nicu_YN,balGS_forest,balRF)

[[806777  94698]
 [ 17066  69375]]
Random Forest
    best param : {'min_samples_leaf': 100, 'n_estimators': 894}
    best score : 0.84814
    r2         : -0.4169302959144665


In [127]:
# NOTE(review): commented-out inline version of the pred_results() report. It
# refers to `grid_search_forest`, which no visible cell defines, and the saved
# output under this cell appears to come from an earlier run - confirm before
# relying on it.
# X_bal = nicu_YN.drop('admit_NICU',axis=1)
# y_pred = balRF.predict(X_bal)

# #Use sklearn's confusion_matrix on real and predicted y
# from sklearn.metrics import confusion_matrix
# print(confusion_matrix(nicu_YN.admit_NICU, y_pred))

# print(
#     '''Random Forest
# sample size: {0}
# best param : {1}
# best score : {2}
# r2         : {3}'''\
#       .format(len(bal_target), \
#               grid_search_forest.best_params_,\
#               grid_search_forest.best_score_, \
#               r2_score(nicu_YN.admit_NICU, y_pred))
#      )

[[806777  94698]
 [ 17066  69375]]
Random Forest
sample size: 100000
best param : {'min_samples_leaf': 100, 'n_estimators': 50}
best score : 0.8276
r2         : -0.4169302959144665


#### Top 10 most important features from the balanced-sample model

In [120]:
# Pair each training column with its importance from the balanced-sample
# forest. The frame and model are referenced by their notebook-level names
# (bal_X / balRF): `X_bal` exists only inside pred_results() and `bestRF` is
# never defined, so the previous version failed on a fresh kernel.
# The 'features' column holds the importance values.
balRF_coefs = pd.DataFrame({'col' :list(bal_X.columns), 
                           'features': balRF.feature_importances_})
balRF_coefs.nlargest(10,'features')

Unnamed: 0,col,features
45,obst_est_edit_wk,0.28248
46,birth_weight_gm,0.139571
44,combined_gestation_wk,0.109026
83,assist_vent_immed,0.083637
86,antibiotics_for_newborn,0.07951
39,APGAR_score_5min,0.074666
36,final_delivery_method,0.037392
71,steriods,0.02559
84,assist_vent_after6,0.024213
41,plurality,0.016635


### RF on a simple random sample (5,000 rows per class)

In [102]:
sample = 5000
# Draw an equal number of rows from each class, ignoring birth year. The
# draws are seeded (108, the seed used for the forests) so the sample is the
# same on every run; the original unseeded draws were not reproducible.
sampN = nicu_allN.sample(sample, random_state=108)
sampY = nicu_allY.sample(sample, random_state=108)
samp = pd.concat([sampN,sampY],axis=0)

In [103]:
# Separate the label column from the feature columns of the simple sample.
samp_target = samp['admit_NICU']
samp_X = samp.drop(columns='admit_NICU')

In [137]:
# Run the shared grid search on the 10,000-row sample; the trailing bare name
# displays the fitted search object as the cell output.
samp_gs = gsForest(samp_X,samp_target)
samp_gs

Wall time: 2min 53s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=108,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'

In [None]:
# NOTE(review): this cell has no execution count and trains the same model,
# from the same inputs, as the cell that follows it - likely a leftover copy.
bestRF2 = ensemble.RandomForestClassifier()
best_params = samp_gs.best_params_
bestRF2.set_params(random_state=108,n_jobs= -1,oob_score = True,**best_params)
bestRF2.fit(samp_X,samp_target)

In [138]:
# Refit one forest on the whole simple sample using the parameters chosen by
# the samp_gs grid search; oob_score=True records the out-of-bag estimate.
best_params = samp_gs.best_params_
bestRF2 = ensemble.RandomForestClassifier(
    random_state=108,
    n_jobs=-1,
    oob_score=True,
    **best_params
)
bestRF2.fit(samp_X,samp_target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=100, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=True, random_state=108, verbose=0,
                       warm_start=False)

In [139]:
# Evaluate the sample-trained forest on every Y/N row; note that nicu_YN also
# contains the rows the forest was fitted on.
pred_results(nicu_YN,samp_gs,bestRF2)

[[813144  88331]
 [ 20222  66219]]
Random Forest
    best param : {'min_samples_leaf': 100, 'n_estimators': 50}
    best score : 0.8236
    r2         : -0.37622163140548914


In [110]:
# Run the sample grid search through the gsForest helper (same seed, grid,
# scoring and cv settings as the inline copy it replaces; the helper times
# the fit itself).
sampGS_forest = gsForest(samp_X, samp_target)
# Print the parameters of the search that was just fitted; the previous
# version printed `grid_search_forest2`, a name that is never defined.
print(sampGS_forest.best_params_)

Wall time: 3min 11s


GridSearchCV(cv=5, error_score='raise-deprecating',
             estimator=RandomForestClassifier(bootstrap=True, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators='warn', n_jobs=None,
                                              oob_score=False, random_state=108,
                                              verbose=0, warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid=[{'

In [111]:
# Refit one forest on the whole simple sample with the parameters chosen by
# the sampGS_forest grid search. The previous version read them from
# `grid_search_forest2`, which is never defined and raised NameError on a
# fresh kernel.
bestRF2 = ensemble.RandomForestClassifier()
best_params = sampGS_forest.best_params_
bestRF2.set_params(random_state=108,n_jobs= -1,oob_score = True,**best_params)
bestRF2.fit(samp_X,samp_target)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=100, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=True, random_state=108, verbose=0,
                       warm_start=False)

In [119]:
# Report on the sample-trained model built in this section. The previous call
# passed the balanced-sample search and forest (balGS_forest / balRF), which
# belong to the earlier section.
pred_results(nicu_YN,sampGS_forest,bestRF2)

[[813144  88331]
 [ 20222  66219]]
Random Forest
sample size: 5000
best param : {'min_samples_leaf': 100, 'n_estimators': 50}
best score : 0.8276
r2         : 0.30400000000000005


In [122]:
# Pair each training column with its importance from the sample-trained
# forest. Columns come from samp_X, the frame bestRF2 was fitted on; the
# previous `X_samp` is never defined and raised NameError on a fresh kernel.
# The 'features' column holds the importance values.
sampRF_coefs = pd.DataFrame({'col' :list(samp_X.columns), 
                           'features': bestRF2.feature_importances_})
sampRF_coefs.nlargest(10,'features')

Unnamed: 0,col,features
45,obst_est_edit_wk,0.262012
46,birth_weight_gm,0.15699
44,combined_gestation_wk,0.110713
39,APGAR_score_5min,0.093226
86,antibiotics_for_newborn,0.087154
83,assist_vent_immed,0.076469
84,assist_vent_after6,0.027749
71,steriods,0.025802
36,final_delivery_method,0.024791
100,infant_transferred,0.01921


### Largest RF run on the non-balanced DataFrame

In [6]:
# Summary of a grid search over the non-balanced data.
# NOTE(review): `encoded_target` and `grid_search_forest` are not defined in
# any cell visible here, and the execution count predates the cells above -
# this looks like a leftover from an earlier session; confirm or remove.
print(
    '''Random Forest
sample size: {0}
best param : {1}
best score : {2}
r2         : {3}'''\
      .format(len(encoded_target), \
              grid_search_forest.best_params_,\
              grid_search_forest.best_score_, \
              r2_score(encoded_target, grid_search_forest.predict(cl_df)))
     )

Random Forest
sample size: 100000
best param : {'min_samples_leaf': 100, 'n_estimators': 109}
best score : 0.94302
r2         : 0.33316310727927356


In [17]:
# NOTE(review): `bestRF` and `encoded_target` are not defined in any cell
# visible here; this cell appears to depend on state from an earlier session.
y_pred = bestRF.predict(cl_df)
#Finally, use sklearn's confusion_matrix on real and predicted y

from sklearn.metrics import confusion_matrix
# Confusion matrix of true labels against predicted labels.
print(confusion_matrix(encoded_target, y_pred))

[[4510    0    2]
 [  60    0    0]
 [ 408    0   20]]
