In [1]:
import pandas as pd
import random as rn
import numpy as np
import re
from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn import ensemble

In [2]:
# Read the data-set CSV into the frame that the later cells sample from and
# take their column list from.
totDF = pd.read_csv(filepath_or_buffer='../data/raw/Cleaned_data_set.csv')

In [27]:
# Downsample: draw up to `sample_per_year` rows from every birth year.
sample_per_year = 100000
# Same seed value the random-forest cells pass as random_state, so the drawn
# subset (and every score computed from it) is identical after a kernel restart.
sample_seed = 108
dwnSmplDF = totDF.groupby('birth_year', group_keys=False).apply(
    # Cap n at the group size: DataFrame.sample draws without replacement and
    # raises ValueError when asked for more rows than the group holds.
    lambda x: x.sample(n=min(len(x), sample_per_year), random_state=sample_seed)
)

In [28]:
# Column-name patterns. The leading '.*' lets match() find the word anywhere
# in the name, not just at its start.
r1 = re.compile('.*reporting')
r2 = re.compile('.*imputed')

# Columns excluded from the feature matrix: every pattern hit plus the target.
cols_to_drop1 = [name for name in totDF.columns if r1.match(name)]
cols_to_drop2 = [name for name in totDF.columns if r2.match(name)]
cols_to_drop3 = ['admit_NICU']
cols_to_drop = [*cols_to_drop1, *cols_to_drop2, *cols_to_drop3]

# All remaining columns are kept, in their original order.
cols_to_keep = [name for name in totDF.columns if name not in cols_to_drop]

# Target (one-column frame) and feature matrix, both from the downsampled frame.
target = dwnSmplDF[['admit_NICU']].copy()
X = dwnSmplDF[cols_to_keep].copy()

# A single encoder instance is refit for each column it is applied to.
le = LabelEncoder()

# Numeric features pass through unchanged; object-dtype features are
# label-encoded column by column.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numDF = X.select_dtypes(include=numerics).copy()
catDF = X.select_dtypes(include=object).apply(le.fit_transform)

# Encode the target last, so `le` ends up fitted on the target labels.
encoded_target = target.apply(le.fit_transform)

# Final model input: numeric columns first, then the encoded categoricals.
cl_df = pd.concat([numDF, catDF], axis='columns')

In [None]:
randomForest = ensemble.RandomForestClassifier()
grid_para_forest = [{
    'n_estimators': np.linspace(50,int(np.sqrt(len(cl_df))),10,dtype=int),
   # 'n_estimators': range(1000,10000,1000),
    #'min_samples_split' : [100,10,2],
    'min_samples_leaf' : range(100,1000,100)
}]
randomForest.set_params(random_state=108)
grid_search_forest = GridSearchCV(randomForest, grid_para_forest, scoring='accuracy', cv=5, n_jobs=-1)
%time grid_search_forest.fit(cl_df, encoded_target)

In [13]:
# Report the grid-search outcome: number of target rows, winning parameters,
# best cross-validation score, and r2 of grid_search_forest.predict(cl_df)
# measured against encoded_target.
_rf_summary_template = '''Random Forest
sample size: {0}
best param : {1}
best score : {2}
r2         : {3}'''
_rf_summary_values = (
    len(encoded_target),
    grid_search_forest.best_params_,
    grid_search_forest.best_score_,
    r2_score(encoded_target, grid_search_forest.predict(cl_df)),
)
print(_rf_summary_template.format(*_rf_summary_values))

Random Forest
sample size: 10000
best param : {'min_samples_leaf': 10, 'n_estimators': 50}
best score : 0.9471
r2         : 0.43563605469124156


In [20]:
# Build one forest from the grid search's winning parameters and fit it with
# out-of-bag scoring enabled.
best_params = grid_search_forest.best_params_
bestRF = ensemble.RandomForestClassifier()
# best_params is a dict and set_params accepts keyword arguments only, so it
# must be unpacked; passing the dict positionally raises TypeError.
bestRF.set_params(**best_params)
bestRF.set_params(random_state=108, n_jobs=-1, oob_score=True)
# encoded_target is a one-column DataFrame; flatten it to 1-D so fit() does
# not emit a DataConversionWarning about a column-vector y.
bestRF.fit(cl_df, np.ravel(encoded_target))

  after removing the cwd from sys.path.


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=50, n_jobs=-1,
                       oob_score=True, random_state=108, verbose=0,
                       warm_start=False)

In [26]:
# Report the fitted forest: out-of-bag score, per-feature importances, and
# mean accuracy on the same data it was fitted on (cl_df / encoded_target).
print(
'''Random Forest
OOB score: {0}
Feature Importances: {1}
Score: {2}'''\
        .format(bestRF.oob_score_,
                bestRF.feature_importances_,
                # score is a method and must be called with data; the bare
                # attribute would print the bound method's repr, not a number.
                bestRF.score(cl_df, np.ravel(encoded_target))))

Random Forest
OOB score: 0.9453
Feature Importances: [2.67334658e-03 5.05518498e-03 8.95358333e-03 3.61323098e-03
 8.41193731e-03 5.80861044e-03 1.07334275e-03 1.43582187e-03
 1.71387008e-03 2.84158722e-03 2.35183189e-03 1.31102740e-03
 3.33484680e-03 6.52643499e-03 2.48524092e-03 2.46499074e-03
 2.55026924e-03 3.90223752e-03 2.81073006e-03 8.07718846e-05
 1.05262174e-03 7.17873409e-03 2.05568965e-03 5.01173961e-03
 8.61493657e-03 3.40719712e-04 3.15997645e-03 1.23689644e-03
 4.19281551e-03 9.01583613e-03 7.79754067e-03 1.35611861e-02
 7.65778470e-04 2.60899990e-03 2.19605989e-04 3.66373893e-03
 1.73394293e-02 7.33542448e-04 1.72250624e-03 2.86597353e-02
 5.80495928e-03 1.05017417e-02 5.48686756e-03 2.89297015e-03
 7.93460317e-02 1.92508127e-01 1.09594346e-01 2.33364790e-02
 1.56689313e-03 1.97351630e-03 3.51087445e-03 1.44193939e-03
 7.03524291e-04 1.09668529e-03 6.97370984e-04 4.00837748e-04
 8.45738045e-04 1.19191972e-03 9.05914347e-05 4.86356164e-04
 2.06495575e-04 3.61283915e-03 1