In [194]:
## Packages
# system
import os
# data wrangling
import numpy as np
import pandas as pd
from scipy import stats
from dfply import *
import statistics
# plotting
import matplotlib.pyplot as plt
import seaborn as sns
# models
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split, KFold, StratifiedKFold, cross_val_score 
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, recall_score, make_scorer, accuracy_score, balanced_accuracy_score, f1_score, precision_score
## Settings
%matplotlib inline
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None) 
pd.set_option('display.width', 1000)

## Data Import

In [180]:
df = pd.read_csv('../processed_data/provider_final_new.csv')
df.drop(['Unnamed: 0'], axis=1, inplace=True)
df = df.set_index('Provider')
df.head(3)

Unnamed: 0_level_0,PotentialFraud,DurationMedianIP,DurationMeanIP,DurationMedianOP,DurationMeanOP,Serves,ChronCondsMedian,ChronCondsMean,ChronConsMode,PtAgeMedian,PtAgeMean,NumPtsIP,NumPtsOP,NumStatesIP,NumStatesOP,SharePhysician,MergeAvgClaimReimbursed,NumTopCodeClaim,DupClaimNum_IP,DupClaimRatio_IP,DupClaim_IP,DupClaimStatesNum_IP,DupClaimMultiState_IP,DupClaimProviderNum_IP,DupClaimMultiProvider_IP,DupClaimStartDtNum_IP,DupClaimMultiStartDt_IP,DupClaimNum_OP,DupClaimRatio_OP,DupClaim_OP,DupClaimStatesNum_OP,DupClaimMultiState_OP,DupClaimProviderNum_OP,DupClaimMultiProvider_OP,DupClaimStartDtNum_OP,DupClaimMultiStartDt_OP,AvgDuration,NumOfPt,NumOfClm,ClmPerPt,TotalRev,RevPerPt,ClmNoPhy,ClmAllPhy,ClmNoProc,ClmPerDt
Provider,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1
PRV51001,No,4.0,5.0,0.0,0.55,Both,6.0,5.76,5.0,79.0,77.88,5.0,19.0,1.0,1.0,1.0,4185.6,5.0,0.0,0.0,no,0.0,no,0.0,no,0.0,no,11.0,0.55,yes,50.0,yes,422.0,yes,278.0,yes,1.44,24,25,1.04,109980,4582.5,0,2,23,1.09
PRV51003,Yes,4.0,5.16129,0.0,2.357143,Both,7.0,6.674242,6.0,71.0,69.083333,53.0,66.0,3.0,3.0,1.0,4588.409091,25.0,1.0,0.016129,yes,2.0,yes,2.0,yes,2.0,yes,25.0,0.357143,yes,52.0,yes,1351.0,yes,372.0,yes,3.67,117,132,1.13,671956,5743.21,0,4,93,1.19
PRV51004,No,0.0,0.0,0.0,1.42953,OP,7.0,6.812081,8.0,72.0,71.261745,0.0,138.0,0.0,9.0,1.0,350.134228,5.0,0.0,0.0,no,0.0,no,0.0,no,0.0,no,72.0,0.483221,yes,52.0,yes,3006.0,yes,375.0,yes,1.43,138,149,1.08,52480,380.29,0,12,149,1.27


## Exclude columns

In [111]:
# display(df.columns[df.columns.str.contains('DupClaim')])
# del_col = ['DupClaim_IP','DupClaim_OP','DupClaimRatio_IP','DupClaimRatio_OP',
#            'DupClaimMultiState_IP','DupClaimMultiState_OP','DupClaimMultiProvider_IP','DupClaimMultiProvider_OP',
#           'DupClaimMultiStartDt_IP','DupClaimMultiStartDt_OP']

In [88]:
# print('before delete cols', df.shape)
# df_num = df.drop(del_col, axis=1)
# print('after delete cols', df_num.shape)

before delete cols (5410, 46)
after delete cols (5410, 36)


## dummy variables

In [181]:
def make_dummy_df(df, cols) :
    for x in cols:
        dums = pd.get_dummies(df[x], drop_first=True, prefix=x)
        df = pd.concat((df, dums), axis=1)
        df.drop([x], axis=1, inplace=True)
    return df

In [182]:
dummy_list_1 = ['Serves'] # for df_num
dummy_list_2 = ['Serves','DupClaim_IP','DupClaim_OP','DupClaimRatio_IP','DupClaimRatio_OP',
           'DupClaimMultiState_IP','DupClaimMultiState_OP','DupClaimMultiProvider_IP','DupClaimMultiProvider_OP',
          'DupClaimMultiStartDt_IP','DupClaimMultiStartDt_OP'] # for df

In [183]:
df_num_dum = make_dummy_df(df_num, dummy_list_1)
df_dum = make_dummy_df(df, dummy_list_2)

In [186]:
# this is for pos_label for precision & recall for gridsearchcv
df_dum['PotentialFraud'] = np.where(df_num_dum['PotentialFraud']=='Yes', 1, 0)
df_dum['PotentialFraud'] = np.where(df_num_dum['PotentialFraud']=='Yes', 1, 0)

In [187]:
# display(df_dum.head(2))
# display(df_num_dum.head(2))

## split train-test

In [188]:
x = df_dum.drop(['PotentialFraud'], axis = 1)
y = df_dum.PotentialFraud

xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.2, random_state=4, stratify=y)

# for flag 1,0
print('ytrain class ave', ytrain.mean())
print('ytest class ave', ytest.mean())

ytrain class ave 0.09357670979667283
ytest class ave 0.0933456561922366


## Logistic Regression

In [209]:
# recall of positive - sensitivity; recall of negative - specificity
lr = LogisticRegression(random_state=4, class_weight='balanced')

# lr_params = [{'penalty': ['l1', 'l2'], "C":[0.0001,0.001,0.01,0.1,1,10,100], 'max_iter':[1000]}]
# lr_params = [{'penalty': ['elasticnet'], "C":[0.01, 0.1, 1,10,100],
#              'l1_ratio': [0,0.2,0.4,0.6, 0.8,1], 'solver': ['saga']}]

lr_params = [{'penalty': ['l1', 'l2'], "C":[0.0001,0.001,0.01,0.1,1,10,100], 
              'max_iter':[1000], 'solver':['liblinear']}] # l1=lasso
kf5 = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)
scores = { 'recall': make_scorer(recall_score, pos_label = 1, average = 'weighted'), 
          'f1': make_scorer(f1_score, pos_label = 1, average = 'weighted'),
         'precision': make_scorer(precision_score, pos_label = 1, average = 'weighted'),
         'accuracy': make_scorer(balanced_accuracy_score),
         'specificity': make_scorer(recall_score, pos_label = 0, average = 'weighted')}

# scores = ['f1_weighted', 'recall', 'precision', 'accuracy']
# lr_grid = GridSearchCV(estimator=lr, param_grid=lr_params, return_train_score=True, 
#                        cv = 5, verbose=2, scoring='recall', n_jobs=-1)

lr_grid = GridSearchCV(estimator=lr, param_grid=lr_params, return_train_score=True, 
                       cv = kf5, verbose=2, scoring=scores, refit=False, n_jobs=-1)

%time lr_grid.fit(xtrain, ytrain)

Fitting 5 folds for each of 14 candidates, totalling 70 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    6.4s


CPU times: user 670 ms, sys: 236 ms, total: 906 ms
Wall time: 15.4 s


[Parallel(n_jobs=-1)]: Done  70 out of  70 | elapsed:   15.3s finished


GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=4, shuffle=True),
             error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=4, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecat...
             pre_dispatch='2*n_jobs', refit=False, return_train_score=True,
             scoring={'accuracy': make_scorer(balanced_accuracy_score),
                      'f1': make_scorer(f1_score, pos_label=1, average=weighted),
                      'precision': make_scorer(precision_score, 

In [165]:
# best model
# display(lr_grid.best_params_)
# print('lr best validation score', lr_grid.best_score_)
# print('lr best train score', lr_grid.score(xtrain, ytrain))

In [210]:
keys = list(lr_grid.cv_results_.keys())
mean_test = [s for s in keys if 'mean_test' in s]
mean_train = [s for s in keys if 'mean_train' in s]
print(mean_test)
print(mean_train)

['mean_test_recall', 'mean_test_f1', 'mean_test_precision', 'mean_test_accuracy', 'mean_test_specificity']
['mean_train_recall', 'mean_train_f1', 'mean_train_precision', 'mean_train_accuracy', 'mean_train_specificity']


In [211]:
pd.DataFrame({'params': lr_grid.cv_results_['params'], 
              'mean_test_recall': lr_grid.cv_results_['mean_test_recall'],
              'mean_train_recall': lr_grid.cv_results_['mean_train_recall'],
              'mean_test_f1': lr_grid.cv_results_['mean_test_f1'],
              'mean_train_f1': lr_grid.cv_results_['mean_train_f1'],
              'mean_test_precision': lr_grid.cv_results_['mean_test_precision'],
              'mean_train_precision': lr_grid.cv_results_['mean_train_precision'],
              'mean_test_accuracy': lr_grid.cv_results_['mean_test_accuracy'],
              'mean_train_accuracy': lr_grid.cv_results_['mean_train_accuracy'],
              'mean_test_specificity': lr_grid.cv_results_['mean_test_specificity'],
              'mean_train_specificity': lr_grid.cv_results_['mean_train_specificity'] 
             })

Unnamed: 0,params,mean_test_recall,mean_train_recall,mean_test_f1,mean_train_f1,mean_test_precision,mean_train_precision,mean_test_accuracy,mean_train_accuracy,mean_test_specificity,mean_train_specificity
0,"{'C': 0.0001, 'max_iter': 1000, 'penalty': 'l1...",0.892101,0.892733,0.905101,0.905703,0.930753,0.931501,0.858555,0.861948,0.892101,0.892733
1,"{'C': 0.0001, 'max_iter': 1000, 'penalty': 'l2...",0.887018,0.889845,0.901167,0.903707,0.929063,0.931773,0.854644,0.86506,0.887018,0.889845
2,"{'C': 0.001, 'max_iter': 1000, 'penalty': 'l1'...",0.892562,0.89256,0.90563,0.905718,0.9318,0.932185,0.863238,0.865174,0.892562,0.89256
3,"{'C': 0.001, 'max_iter': 1000, 'penalty': 'l2'...",0.887712,0.890423,0.901691,0.904214,0.929292,0.93225,0.855027,0.866762,0.887712,0.890423
4,"{'C': 0.01, 'max_iter': 1000, 'penalty': 'l1',...",0.887019,0.888748,0.901889,0.903514,0.932656,0.93477,0.870144,0.87857,0.887019,0.888748
5,"{'C': 0.01, 'max_iter': 1000, 'penalty': 'l2',...",0.887942,0.889383,0.902064,0.903446,0.930307,0.932104,0.859582,0.866742,0.887942,0.889383
6,"{'C': 0.1, 'max_iter': 1000, 'penalty': 'l1', ...",0.890945,0.892733,0.905377,0.907059,0.936242,0.938493,0.883381,0.892393,0.890945,0.892733
7,"{'C': 0.1, 'max_iter': 1000, 'penalty': 'l2', ...",0.888404,0.891636,0.902289,0.905242,0.929723,0.933092,0.856515,0.869646,0.888404,0.891636
8,"{'C': 1, 'max_iter': 1000, 'penalty': 'l1', 's...",0.900419,0.924041,0.910781,0.932804,0.930153,0.954716,0.849857,0.942323,0.900419,0.924041
9,"{'C': 1, 'max_iter': 1000, 'penalty': 'l2', 's...",0.889329,0.890596,0.903096,0.904486,0.9305,0.933006,0.85924,0.869902,0.889329,0.890596


In [None]:
# never split data into T/F and handle them differently
# L2 is more flexible because it is smooth function, L1 is not a smooth function
# elasticnet may keep need to use the solver that works for L1, because L2 is more flexbile
# preceision expected to be 90-95%
# can change the threshold of probabilty

In [166]:
lr = LogisticRegression(random_state=4, class_weight='balanced')

lr_params = [{'penalty': ['l1'], "C":[0.0001,0.001,0.01,0.1,1,10,100], 
              'max_iter':[1000], 'solver':['liblinear']}] # l1=lasso

lr_grid = GridSearchCV(estimator=lr, param_grid=lr_params, return_train_score=True, 
                       cv = 5, verbose=2, scoring='recall', refit=True, n_jobs=-1)

%time lr_grid.fit(xtrain, ytrain)

Fitting 5 folds for each of 7 candidates, totalling 35 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  35 out of  35 | elapsed:   10.2s finished


CPU times: user 763 ms, sys: 176 ms, total: 939 ms
Wall time: 10.7 s


GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight='balanced',
                                          dual=False, fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=4, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid=[{'C': [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
                          'max_iter': [1000], 'penalty': ['l1'],
                          'solver': ['liblinear']}],
             pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
             scoring='recall', verbose=2)

In [167]:
# confusion matrix
ypred = lr_grid.best_estimator_.predict(xtrain)

labels = np.unique(ytrain)
cm = confusion_matrix(ytrain, ypred, labels=labels)
cm_df = pd.DataFrame(cm, index=labels, columns=labels)
display(cm_df)

recall = recall_score(ytrain.array, ypred, average='binary', pos_label=0) # sensitivity
print('recall:', round(recall, ndigits=5))
precision = lr_grid.best_estimator_.score(xtrain, ytrain) # precision
print('precision:', round(precision, ndigits=4))
accuracy = accuracy_score(ytrain.array, ypred) # accuracy
print('accuracy', round(accuracy, 4))

from sklearn.metrics import classification_report
classification_report(ytrain.array, ypred)

f1_sco

Unnamed: 0,0,1
0,3493,430
1,43,362


recall: 0.89039
precision: 0.8907
accuracy 0.8907


'              precision    recall  f1-score   support\n\n           0       0.99      0.89      0.94      3923\n           1       0.46      0.89      0.60       405\n\n    accuracy                           0.89      4328\n   macro avg       0.72      0.89      0.77      4328\nweighted avg       0.94      0.89      0.91      4328\n'

In [59]:
# yes is pos
367/(367+45)

0.8907766990291263

In [62]:
# no is pos
3498/(3498+418)

0.8932584269662921

In [53]:
3061/(3061+369)

0.8924198250728863

In [54]:
3498/(3498+418)

0.8932584269662921

In [None]:
y_true = [2, 0, 2, 2, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2]


In [25]:
# define actual
act_pos = [1 for _ in range(100)]
act_neg = [0 for _ in range(10000)]
y_true = act_pos + act_neg
# define predictions
pred_pos = [0 for _ in range(10)] + [1 for _ in range(90)]
pred_neg = [0 for _ in range(10000)]
y_pred = pred_pos + pred_neg
# calculate recall
recall = recall_score(y_true, y_pred, average='binary')
print('Recall: %.3f' % recall)

Recall: 0.900


## Random Forest

In [53]:
rf = RandomForestClassifier(random_state=4, oob_score=True)

In [105]:
rf_params = [{'max_features': range(10,40,10),'n_estimators': [100],
             'min_samples_leaf': range(1,10,2),'max_depth': range(0,36,12)}]
kf5 = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 4)

rf_grid = GridSearchCV(estimator=rf, param_grid=rf_params, return_train_score=True, 
                       cv = kf5, verbose=2, scoring='f1')
%time rf_grid.fit(xtrain, ytrain)

Fitting 5 folds for each of 45 candidates, totalling 225 fits
[CV] max_depth=0, max_features=10, min_samples_leaf=1, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=1, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=1, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=1, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=1, n_estimators=100 


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
ValueError: max_depth must be greater than zero. 

[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.1s remaining:    0.0s
ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 



[CV]  max_depth=0, max_features=10, min_samples_leaf=1, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=1, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=1, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=1, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=1, n_estimators=100, total=   0.0s
[CV] max_depth=0, max_features=10, min_samples_leaf=3, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=3, n_estimators=100, total=   0.0s
[CV] max_depth=0, max_features=10, min_samples_leaf=3, n_estimators=100 


ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 



[CV]  max_depth=0, max_features=10, min_samples_leaf=3, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=3, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=3, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=3, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=3, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=3, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=3, n_estimators=100, total=   0.0s
[CV] max_depth=0, max_features=10, min_samples_leaf=5, n_estimators=100 


ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 



[CV]  max_depth=0, max_features=10, min_samples_leaf=5, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=5, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=5, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=5, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=5, n_estimators=100, total=   0.0s
[CV] max_depth=0, max_features=10, min_samples_leaf=5, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=5, n_estimators=100, total=   0.0s
[CV] max_depth=0, max_features=10, min_samples_leaf=5, n_estimators=100 


ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 



[CV]  max_depth=0, max_features=10, min_samples_leaf=5, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=7, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=7, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=7, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=7, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=7, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=7, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=7, n_estimators=100 


ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 



[CV]  max_depth=0, max_features=10, min_samples_leaf=7, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=7, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=7, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=9, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=9, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=9, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=9, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=9, n_estimators=100 


ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 

ValueError: max_depth must be greater than zero. 



[CV]  max_depth=0, max_features=10, min_samples_leaf=9, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=9, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=9, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=10, min_samples_leaf=9, n_estimators=100 
[CV]  max_depth=0, max_features=10, min_samples_leaf=9, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=20, min_samples_leaf=1, n_estimators=100 
[CV]  max_depth=0, max_features=20, min_samples_leaf=1, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=20, min_samples_leaf=1, n_estimators=100 


ValueError: max_depth must be greater than zero. 



[CV]  max_depth=0, max_features=20, min_samples_leaf=1, n_estimators=100, total=   0.1s
[CV] max_depth=0, max_features=20, min_samples_leaf=1, n_estimators=100 


KeyboardInterrupt: 

In [98]:
# second try
display(rf_grid.best_estimator_)
print('RF best validation score', rf_grid.best_score_)
print('RF best train score', rf_grid.score(xtrain, ytrain))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=12, max_features=30,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=4, verbose=0,
                       warm_start=False)

RF best validation score 0.7589387536376049
RF best train score 0.8799832898638397


In [63]:
# first try
display(rf_grid.best_estimator_)
print('RF best validation score', rf_grid.best_score_)
print('RF best train score', rf_grid.score(xtrain, ytrain))

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features=30,
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=5, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=True, random_state=4, verbose=0,
                       warm_start=False)

RF best validation score 0.9396950967173504
RF best train score 0.9743530499075785


In [None]:
# recall & precision


In [None]:
# sklearn.feature_selection import selectFromModel
# compute_class_weight