In [22]:
import pandas as pd
import numpy as np
# import random as rn
import sklearn
# from scipy import stats
# import math
import re

from sklearn.preprocessing import  LabelEncoder #OneHotEncoder
from sklearn import ensemble

from sklearn.model_selection import GridSearchCV,cross_val_score
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error,accuracy_score

# from sklearn.ensemble import RandomForestClassifier
from sklearn import linear_model

In [4]:
# Load the processed dataset; the path is resolved against the kernel's
# current working directory.
cleaned_data_path = '../../data/processed/Cleaned_Data_Set.csv'
totDF = pd.read_csv(filepath_or_buffer=cleaned_data_path)

## Cleaning / Sampling

In [8]:
# Build the modelling frame: every retained feature column plus the
# `admit_NICU` target, with object-typed columns replaced by integer codes.

# re.match anchors at the start of the name, but the leading ".*" means any
# column whose name contains "reporting" / "imputed" is selected.
r1 = re.compile('.*reporting')
r2 = re.compile('.*imputed')

cols_to_drop1 = [name for name in totDF.columns if r1.match(name)]
cols_to_drop2 = [name for name in totDF.columns if r2.match(name)]
# The target is excluded from the feature list and re-attached below.
cols_to_drop3 = ['admit_NICU']
cols_to_drop = [*cols_to_drop1, *cols_to_drop2, *cols_to_drop3]

# Feature columns, in their original order.
cols_to_keep = list(filter(lambda col: col not in cols_to_drop, totDF.columns))

# Work on a copy so totDF itself is left unmodified.
feature_and_target_cols = cols_to_keep + cols_to_drop3
X_and_target = totDF[feature_and_target_cols].copy()

# Split by dtype; only columns matching one of these two selections end up
# in concat_df.
numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']
numDF = X_and_target.select_dtypes(include=numerics).copy()  # only numeric columns
catDF = X_and_target.select_dtypes(include=object).copy()

# One encoder instance is refit on each object column in turn, so afterwards
# `le` only holds the classes of the last column it processed.
le = LabelEncoder()
catDF = catDF.apply(lambda column: le.fit_transform(column))

# Numeric columns first, then the encoded object columns.
concat_df = pd.concat([numDF, catDF], axis='columns')

## GLM

In [18]:
# Suppress every warning category for the rest of the kernel session.
# NOTE(review): a blanket filter also hides warnings raised while fitting the
# models below (e.g. solver convergence or deprecation notices) - confirm
# that is intended.
import warnings
warnings.simplefilter('ignore')

In [19]:
for sample_per_year in [100, 1000, 2000, 5000, 10000]:
    dwnSmplDF = concat_df.groupby('birth_year',group_keys = False).apply(lambda x: x.sample(sample_per_year))
    
    cl_df = dwnSmplDF[cols_to_keep]
    encoded_target = dwnSmplDF['admit_NICU']
    
    logit_1 = linear_model.LogisticRegression(solver = 'lbfgs', multi_class='auto')
    logit_1.set_params(C=1e4)
    print('sample size : %d\n' % (sample_per_year*5))
    %time logit_1.fit(cl_df, encoded_target)
    print('sample size: {0} \nscore      : {1}'.format(len(encoded_target), \
                                                      logit_1.score(cl_df, encoded_target)))
    print('-'*50)
    

sample size : 500

CPU times: user 46.1 ms, sys: 3 ms, total: 49.1 ms
Wall time: 49.7 ms
sample size: 500 
score      : 0.946
--------------------------------------------------
sample size : 5000

CPU times: user 607 ms, sys: 61.2 ms, total: 668 ms
Wall time: 176 ms
sample size: 5000 
score      : 0.9208
--------------------------------------------------
sample size : 10000

CPU times: user 1.21 s, sys: 134 ms, total: 1.34 s
Wall time: 357 ms
sample size: 10000 
score      : 0.9238
--------------------------------------------------
sample size : 25000

CPU times: user 2.91 s, sys: 282 ms, total: 3.19 s
Wall time: 842 ms
sample size: 25000 
score      : 0.92116
--------------------------------------------------
sample size : 50000

CPU times: user 5.86 s, sys: 532 ms, total: 6.39 s
Wall time: 1.69 s
sample size: 50000 
score      : 0.9245
--------------------------------------------------


## RF

In [25]:
for sample_per_year in [100, 200, 500, 1000, 2000, 5000]:
    dwnSmplDF = concat_df.groupby('birth_year',group_keys = False).apply(lambda x: x.sample(sample_per_year))
    
    cl_df = dwnSmplDF[cols_to_keep]
    encoded_target = dwnSmplDF['admit_NICU']
    
    randomForest = ensemble.RandomForestClassifier(random_state = 108, warm_start = True)
    grid_para_forest = [{
        
        # fix the numbers as sample size increases
#         'n_estimators' : range(1000,10000,1000),
        'n_estimators' : np.linspace(10,int(np.sqrt(len(cl_df))),10,dtype=int),
#         'min_samples_split' : [100,10,2],
        'min_samples_leaf' : range(10,100,10)
        
    }]
    randomForest.set_params(random_state=108)
    grid_search_forest = GridSearchCV(randomForest, grid_para_forest, scoring='accuracy', cv=5, n_jobs=-1)
    %time grid_search_forest.fit(cl_df, encoded_target) #put in the df here
    print(
    '''Random Forest
sample size: {0}
best param : {1}
best score : {2}
r2         : {3}'''\
      .format(len(encoded_target), \
              grid_search_forest.best_params_,\
              grid_search_forest.best_score_, \
              r2_score(encoded_target, grid_search_forest.predict(cl_df)))
     )
    print('-'*80)

CPU times: user 712 ms, sys: 82.4 ms, total: 795 ms
Wall time: 3.99 s
Random Forest
sample size: 500
best param : {'min_samples_leaf': 10, 'n_estimators': 12}
best score : 0.918
r2         : 0.3360049829006593
--------------------------------------------------------------------------------
CPU times: user 477 ms, sys: 21.1 ms, total: 498 ms
Wall time: 2.64 s
Random Forest
sample size: 1000
best param : {'min_samples_leaf': 10, 'n_estimators': 21}
best score : 0.925
r2         : 0.3434557735753849
--------------------------------------------------------------------------------
CPU times: user 677 ms, sys: 26.4 ms, total: 704 ms
Wall time: 6.4 s
Random Forest
sample size: 2500
best param : {'min_samples_leaf': 10, 'n_estimators': 27}
best score : 0.942
r2         : 0.404304039263532
--------------------------------------------------------------------------------
CPU times: user 2.05 s, sys: 139 ms, total: 2.19 s
Wall time: 16.8 s
Random Forest
sample size: 5000
best param : {'min_samples