In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import seaborn as sns
import random
sns.set()

In [2]:
with open('clean_data/df.pkl', 'rb') as f:
        df = pickle.load(f)
with open('clean_data/df_labels.pkl', 'rb') as f:
        df_labels = pickle.load(f)

In [3]:
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,level_4_diag_1_na,level_5_diag_1_na,level_2_diag_2_na,level_3_diag_2_na,level_4_diag_2_na,level_5_diag_2_na,level_2_diag_3_na,level_3_diag_3_na,level_4_diag_3_na,level_5_diag_3_na
0,2278392,8222157,3,1,0,0,0,1,1,0,...,0,0,1,1,1,1,1,1,1,1
1,149190,55629189,3,1,1,1,1,7,3,0,...,0,0,0,0,0,0,0,0,0,0
2,64410,86047875,1,1,2,1,1,7,2,0,...,0,0,0,0,0,0,0,0,0,0
3,500364,82442376,3,2,3,1,1,7,2,0,...,0,0,0,0,0,0,0,0,0,0
4,16680,42519267,3,2,4,1,1,7,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
df['readmitted'].value_counts().sort_index()

0    54864
1    11357
2    35545
Name: readmitted, dtype: int64

In [5]:
df['readmitted'] = (df['readmitted'] == 1).astype(int)
df['readmitted'].mean()

0.11159915885462728

In [6]:
def data_split(df, y_col, to_drop=[], random_state=None, hold1_size=.1, hold2_size=.1, hold3_size=.1):
    df_filtered = df.drop(columns=to_drop)
    rows = list(df_filtered.index)
    if random_state is not None:
        random.seed(random_state)
    random.shuffle(rows)
    length = len(rows)
    train_rows = rows[:int(length*.7)]
    hold1_rows = rows[int(length*.7):int(length*.8)]
    hold2_rows = rows[int(length*.8):int(length*.9)]
    hold3_rows = rows[int(length*.9):]
    X_train = df_filtered.drop(columns=[y_col]).iloc[train_rows].values
    y_train = df_filtered.loc[train_rows, y_col].values
    X_hold1 = df_filtered.drop(columns=[y_col]).iloc[hold1_rows].values
    y_hold1 = df_filtered.loc[hold1_rows, y_col].values
    X_hold2 = df_filtered.drop(columns=[y_col]).iloc[hold2_rows].values
    y_hold2 = df_filtered.loc[hold2_rows, y_col].values
    X_hold3 = df_filtered.drop(columns=[y_col]).iloc[hold3_rows].values
    y_hold3 = df_filtered.loc[hold3_rows, y_col].values
    cols = df_filtered.drop(columns=[y_col]).columns
    return X_train, y_train, X_hold1, y_hold1, X_hold2, y_hold2, X_hold3, y_hold3, cols
    
data_split(df, 'readmitted', ['encounter_id', 'patient_nbr', 'payer_code'], 42)

(array([[0, 1, 4, ..., 0, 0, 0],
        [3, 2, 7, ..., 0, 0, 0],
        [1, 1, 5, ..., 0, 0, 0],
        ...,
        [5, 2, 4, ..., 1, 1, 1],
        [3, 2, 7, ..., 0, 0, 0],
        [3, 2, 7, ..., 0, 0, 0]]),
 array([0, 0, 0, ..., 0, 0, 1]),
 array([[3, 2, 6, ..., 0, 0, 0],
        [3, 2, 5, ..., 0, 0, 0],
        [3, 1, 6, ..., 0, 0, 0],
        ...,
        [3, 2, 6, ..., 0, 0, 0],
        [3, 1, 7, ..., 0, 0, 0],
        [3, 2, 6, ..., 0, 0, 0]]),
 array([1, 0, 0, ..., 0, 0, 0]),
 array([[3, 2, 7, ..., 0, 0, 0],
        [3, 1, 4, ..., 0, 0, 0],
        [3, 2, 5, ..., 0, 0, 0],
        ...,
        [3, 2, 5, ..., 0, 0, 0],
        [1, 1, 6, ..., 0, 0, 0],
        [3, 2, 8, ..., 0, 0, 0]]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([[3, 1, 8, ..., 0, 0, 0],
        [4, 2, 5, ..., 0, 0, 0],
        [3, 1, 7, ..., 0, 0, 0],
        ...,
        [1, 1, 4, ..., 0, 0, 0],
        [1, 2, 6, ..., 0, 0, 0],
        [3, 1, 7, ..., 0, 0, 0]]),
 array([0, 0, 0, ..., 0, 0, 0]),
 Index(['race', 

## First try with random forest

In [7]:
from sklearn.ensemble import RandomForestClassifier
X_train, y_train, X_hold1, y_hold1, X_hold2, y_hold2, X_hold3, y_hold3, cols = data_split(
    df=df, 
    y_col='readmitted',
    to_drop=['encounter_id', 'patient_nbr', 'payer_code'], 
    random_state=42,
    hold1_size=.1, 
    hold2_size=.1, 
    hold3_size=.1)
print(X_train.shape)
print(y_train.shape)

(71236, 83)
(71236,)


In [8]:
clf = RandomForestClassifier(n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
from sklearn.metrics import confusion_matrix
confusion_matrix(y_hold1, clf.predict(X_hold1))



array([[8994,   49],
       [1109,   24]])

In [9]:
from sklearn.metrics import recall_score
recall_score(y_hold1, clf.predict(X_hold1))  

0.02118270079435128

In [10]:
from sklearn.metrics import precision_score
precision_score(y_hold1, clf.predict(X_hold1))

0.3287671232876712

## Undersampling to deal with imbalanced classes

In [11]:
from imblearn.under_sampling import RandomUnderSampler
rus = RandomUnderSampler(return_indices=True)
X_rus, y_rus, id_rus = rus.fit_sample(X_train, y_train)
print(X_rus.shape)
print(y_rus.shape)

(16048, 83)
(16048,)


In [12]:
clf_rus = RandomForestClassifier(n_jobs=-1, max_features=10, random_state=42, n_estimators=100)
clf_rus.fit(X_rus, y_rus)
confusion_matrix(y_hold1, clf_rus.predict(X_hold1))

array([[5545, 3498],
       [ 463,  670]])

In [13]:
recall_score(y_hold1, clf_rus.predict(X_hold1))

0.5913503971756399

In [14]:
precision_score(y_hold1, clf_rus.predict(X_hold1))

0.16074856046065258

## Optimizing for recall

In [None]:
from sklearn.model_selection import GridSearchCV
parameters = {'n_estimators':[10, 20, 40, 60, 100, 200, 500], 
              'max_depth':[2, 5, 10, 20, 40, None],
              'min_samples_split':[2, 3, 5, 7, 10, 50],
              'max_features':[2, 5, 7, 10, 15, 20, 25, 30]}

best_params={'n_esmitators':10,
             'max_depth':2,
             'min_samples_split':2,
             'max_features':2}
best_score=0
for n_estimators in [10, 20, 40, 60, 100, 200, 500]:
    for max_depth in [2, 5, 10, 20, 40, None]:
        for min_samples_split in [2, 3, 5, 7, 10, 50]:
            for max_features in [2, 5, 7, 10, 15, 20, 25, 30]:
                clf=RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           min_samples_split=min_samples_split,
                                           max_features=max_features,
                                           n_jobs=-1, random_state=42)
                clf.fit(X_rus, y_rus)
                current_score = recall_score(y_hold1, clf.predict(X_hold1))
                if current_score > best_score:
                    best_params = {'n_estimators':n_estimators,
                                   'max_depth':max_depth,
                                   'min_samples_split':min_samples_split,
                                   'max_features':max_features}
                    best_score = current_score

In [None]:
best_params

In [None]:
best_score

In [15]:
clf = RandomForestClassifier(n_estimators=20,
                                           max_depth=10,
                                           min_samples_split=5,
                                           max_features=2,
                                           n_jobs=-1, random_state=42)
clf.fit(X_rus, y_rus)
recall_score(y_hold2, clf.predict(X_hold2))

0.6154521510096576

In [16]:
confusion_matrix(y_hold2, clf.predict(X_hold2))

array([[5108, 3930],
       [ 438,  701]])

In [17]:
precision_score(y_hold2, clf.predict(X_hold2))

0.15137119412653854

## Optimizing for precision

In [None]:
best_params={'n_esmitators':10,
             'max_depth':2,
             'min_samples_split':2,
             'max_features':2}
best_score=0
for n_estimators in [10, 20, 40, 60, 100, 200, 500]:
    for max_depth in [2, 5, 10, 20, 40, None]:
        for min_samples_split in [2, 3, 5, 7, 10, 50]:
            for max_features in [2, 5, 7, 10, 15, 20, 25, 30]:
                clf=RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           min_samples_split=min_samples_split,
                                           max_features=max_features,
                                           n_jobs=-1, random_state=42)
                clf.fit(X_rus, y_rus)
                current_score = precision_score(y_hold1, clf.predict(X_hold1))
                if current_score > best_score:
                    best_params = {'n_estimators':n_estimators,
                                   'max_depth':max_depth,
                                   'min_samples_split':min_samples_split,
                                   'max_features':max_features}
                    best_score = current_score

In [None]:
best_params

In [None]:
best_score

In [18]:
clf = RandomForestClassifier(n_estimators=500,
                                           max_depth=2,
                                           min_samples_split=2,
                                           max_features=2,
                                           n_jobs=-1, random_state=42)
clf.fit(X_rus, y_rus)
precision_score(y_hold2, clf.predict(X_hold2))

0.14993423936869793

In [19]:
confusion_matrix(y_hold2, clf.predict(X_hold2))

array([[5160, 3878],
       [ 455,  684]])

In [20]:
recall_score(y_hold2, clf.predict(X_hold2))

0.6005267778753293

## Trying SVM

In [21]:
from sklearn.svm import SVC
clf = SVC(gamma='auto')
clf.fit(X_rus, y_rus)
precision_score(y_hold2, clf.predict(X_hold2))

0.11759448416751787

In [22]:
recall_score(y_hold2, clf.predict(X_hold2))

0.8086040386303776

In [23]:
confusion_matrix(y_hold2, clf.predict(X_hold2))

array([[2127, 6911],
       [ 218,  921]])

## Trying AdaBoost

In [24]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
ada_clf = AdaBoostClassifier(DecisionTreeClassifier())
ada_clf.fit(X_rus, y_rus)
precision_score(y_hold2, ada_clf.predict(X_hold2))

0.13399872584412825

In [25]:
bdt_real = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
    learning_rate=1)
bdt_real.fit(X_rus, y_rus)
precision_score(y_hold2, bdt_real.predict(X_hold2))

0.15790714117369786

In [26]:
confusion_matrix(y_hold2, bdt_real.predict(X_hold2))

array([[5465, 3573],
       [ 469,  670]])

## Trying GradientBoostingClassifier

In [27]:
from sklearn.ensemble import GradientBoostingClassifier
gbc = GradientBoostingClassifier()
gbc.fit(X_rus, y_rus)
precision_score(y_hold2, gbc.predict(X_hold2))

0.17506426735218508

In [None]:
for n_estimators in [100, 200, 500, 800]:
    for max_depth in [2, 5, 10, 20, 40, None]:
        for min_samples_split in [2, 3, 5, 7, 10, 50]:
            for max_features in [2, 5, 7, 10, 15, 20, 25, 30]:
                clf=GradientBoostingClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           min_samples_split=min_samples_split,
                                           max_features=max_features,
                                           random_state=42)
                clf.fit(X_rus, y_rus)
                current_score = precision_score(y_hold1, clf.predict(X_hold1))
                if current_score > best_score:
                    best_params = {'n_estimators':n_estimators,
                                   'max_depth':max_depth,
                                   'min_samples_split':min_samples_split,
                                   'max_features':max_features}
                    best_score = current_score

In [None]:
best_params

In [28]:
gbc = GradientBoostingClassifier(n_estimators=100,
                                           max_depth=2,
                                           min_samples_split=2,
                                           max_features=15,
                                           random_state=42)
gbc.fit(X_rus, y_rus)
precision_score(y_hold2, gbc.predict(X_hold2))

0.17766203703703703

In [29]:
confusion_matrix(y_hold2, gbc.predict(X_hold2))

array([[6196, 2842],
       [ 525,  614]])

In [30]:
precision_score(y_rus, gbc.predict(X_rus))

0.6411490230387868

## Trying oversampling

In [31]:
from imblearn.over_sampling import SMOTE, ADASYN
X_smote, y_smote = SMOTE().fit_resample(X_train, y_train)
clf_smote = RandomForestClassifier(n_jobs=-1, random_state=42)
clf_smote.fit(X_smote, y_smote)
confusion_matrix(y_hold1, clf_smote.predict(X_hold1))



array([[8552,  491],
       [1039,   94]])

In [32]:
precision_score(y_hold2, clf_smote.predict(X_hold2))

0.18114874815905743

In [33]:
X_adasyn, y_adasyn = ADASYN().fit_resample(X_train, y_train)
clf_adasyn = RandomForestClassifier(n_jobs=-1, random_state=42)
clf_adasyn.fit(X_adasyn, y_adasyn)
confusion_matrix(y_hold1, clf_adasyn.predict(X_hold1))



array([[8523,  520],
       [1018,  115]])

In [34]:
precision_score(y_hold2, clf_adasyn.predict(X_hold2))

0.1716867469879518

## Trying RandomForest without oversampling

In [35]:
clf = RandomForestClassifier(n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
precision_score(y_hold2, clf.predict(X_hold2))



0.3392857142857143

In [36]:
confusion_matrix(y_hold2, clf.predict(X_hold2))

array([[9001,   37],
       [1120,   19]])

In [37]:
recall_score(y_hold2, clf.predict(X_hold2))

0.016681299385425813

In [None]:
best_score=0
for n_estimators in [10, 20, 40, 60, 100, 200]:
    for max_depth in [2, 5, 10, 20, 40, None]:
        for min_samples_split in [2, 3, 5, 7, 10, 50]:
            for max_features in [2, 5, 7, 10, 15, 20, 25, 30]:
                clf=RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           min_samples_split=min_samples_split,
                                           max_features=max_features,
                                           min_samples_leaf=10,
                                           n_jobs=-1, random_state=42)
                clf.fit(X_train, y_train)
                current_score = precision_score(y_hold1, clf.predict(X_hold1))
                if current_score > best_score:
                    best_params = {'n_estimators':n_estimators,
                                   'max_depth':max_depth,
                                   'min_samples_split':min_samples_split,
                                   'max_features':max_features}
                    best_score = current_score

In [None]:
best_params

In [None]:
best_score

In [38]:
clf = RandomForestClassifier(n_estimators=20,
                             max_depth=20,
                             min_samples_split=50,
                             max_features=5,
                             min_samples_leaf=10,
                             n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
precision_score(y_hold2, clf.predict(X_hold2))

  'precision', 'predicted', average, warn_for)


0.0

In [39]:
confusion_matrix(y_hold2, clf.predict(X_hold2))

array([[9038,    0],
       [1139,    0]])

In [40]:
precision_score(y_hold1, clf.predict(X_hold1))

1.0

In [41]:
confusion_matrix(y_hold1, clf.predict(X_hold1))

array([[9043,    0],
       [1132,    1]])

## Random Forest using F1 score for optimization

In [None]:
from sklearn.metrics import f1_score
best_score=0
for n_estimators in [10, 20, 40, 60, 100, 200]:
    for max_depth in [2, 5, 10, 20, 40, None]:
        for min_samples_split in [2, 3, 5, 7, 10, 50]:
            for max_features in [2, 5, 7, 10, 15, 20, 25, 30]:
                clf=RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           min_samples_split=min_samples_split,
                                           max_features=max_features,
                                           min_samples_leaf=10,
                                           n_jobs=-1, random_state=42)
                clf.fit(X_train, y_train)
                current_score = f1_score(y_hold1, clf.predict(X_hold1))
                if current_score > best_score:
                    best_params = {'n_estimators':n_estimators,
                                   'max_depth':max_depth,
                                   'min_samples_split':min_samples_split,
                                   'max_features':max_features}
                    best_score = current_score

In [None]:
best_params

In [None]:
best_score

In [42]:
clf = RandomForestClassifier(n_estimators=10,
                             max_depth=40,
                             min_samples_split=2,
                             max_features=2,
                             n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
precision_score(y_hold2, clf.predict(X_hold2))

0.21739130434782608

In [43]:
precision_score(y_train, clf.predict(X_train))

0.9995483288166215

In [44]:
confusion_matrix(y_train, clf.predict(X_train))

array([[63209,     3],
       [ 1385,  6639]])

In [45]:
confusion_matrix(y_hold2, clf.predict(X_hold2))

array([[9020,   18],
       [1134,    5]])

In [46]:
from sklearn.metrics import matthews_corrcoef
matthews_corrcoef(y_hold2, clf.predict(X_hold2))

0.015922284536503992

In [47]:
clf = RandomForestClassifier(n_estimators=2,
                             max_depth=5,
                             min_samples_split=2,
                             max_features=5,
                             n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
precision_score(y_hold2, clf.predict(X_hold2))

0.42857142857142855

In [48]:
confusion_matrix(y_hold2, clf.predict(X_hold2))

array([[9030,    8],
       [1133,    6]])

In [49]:
matthews_corrcoef(y_hold2, clf.predict(X_hold2))

0.03727842684846495

In [50]:
precision_score(y_train, clf.predict(X_train))

0.5769230769230769

## Optimizing Matthews correlation coefficient

In [None]:
best_score=0
for n_estimators in [2, 5, 10, 20, 50, 100]:
    for max_depth in [2, 5, 10, 20, 50, 100, 150, None]:
        for max_features in [2, 5, 10, 20, 50, 83]:
            clf=RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           max_features=max_features,
                                           n_jobs=-1,
                                           random_state=42)
            clf.fit(X_train, y_train)
            current_score = matthews_corrcoef(y_hold1, clf.predict(X_hold1))
            if current_score > best_score:
                best_params = {'n_estimators':n_estimators,
                                'max_depth':max_depth,
                                'max_features':max_features}
                best_score = current_score

In [None]:
best_params

In [None]:
best_score

In [51]:
clf = RandomForestClassifier(n_estimators=20,
                             max_depth=20,
                             max_features=83,
                             n_jobs=-1, random_state=42)
clf.fit(X_train, y_train)
precision_score(y_hold2, clf.predict(X_hold2))

0.46601941747572817

In [52]:
confusion_matrix(y_hold2, clf.predict(X_hold2))

array([[8983,   55],
       [1091,   48]])

In [53]:
matthews_corrcoef(y_hold2, clf.predict(X_hold2))

0.11357056366290935

In [54]:
precision_score(y_train, clf.predict(X_train))

0.9996819338422391

In [55]:
matthews_corrcoef(y_train, clf.predict(X_train))

0.6028952085720837

## Random Forest, oversampling, Matthews correlation coefficient

In [None]:
best_score=0
for n_estimators in [2, 5, 10, 20, 50, 100]:
    for max_depth in [2, 5, 10, 20, 50, 100, 150, None]:
        for max_features in [2, 5, 10, 20, 50, 83]:
                clf=RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           max_features=max_features,
                                           n_jobs=-1, 
                                           random_state=42)
                clf.fit(X_smote, y_smote)
                current_score = matthews_corrcoef(y_hold1, clf.predict(X_hold1))
                if current_score > best_score:
                    best_params = {'n_estimators':n_estimators,
                                   'max_depth':max_depth,
                                   'max_features':max_features}
                    best_score = current_score

In [None]:
best_params

In [None]:
best_score

In [56]:
clf = RandomForestClassifier(n_estimators=50,
                             max_depth=20,
                             max_features=50,
                             n_jobs=-1, random_state=42)
clf.fit(X_smote, y_smote)
precision_score(y_hold2, clf.predict(X_hold2))

0.18094089264173704

In [57]:
confusion_matrix(y_hold2, clf.predict(X_hold2))

array([[8359,  679],
       [ 989,  150]])

In [58]:
matthews_corrcoef(y_hold2, clf.predict(X_hold2))

0.06519680491741443

In [59]:
matthews_corrcoef(y_smote, clf.predict(X_smote))

0.9384119934482136

## Random Forest, undersampling, Matthews correlation coefficient

In [None]:
best_score=0
for n_estimators in [2, 5, 10, 20, 50, 100]:
    for max_depth in [2, 5, 10, 20, 50, 100, 150, None]:
        for max_features in [2, 5, 10, 20, 50, 83]:
                clf=RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           max_features=max_features,
                                           n_jobs=-1, 
                                           random_state=42)
                clf.fit(X_rus, y_rus)
                current_score = matthews_corrcoef(y_hold1, clf.predict(X_hold1))
                if current_score > best_score:
                    best_params = {'n_estimators':n_estimators,
                                   'max_depth':max_depth,
                                   'max_features':max_features}
                    best_score = current_score

In [None]:
best_params

In [None]:
best_score

In [60]:
clf = RandomForestClassifier(n_estimators=50,
                             max_depth=5,
                             max_features=10,
                             n_jobs=-1, 
                             random_state=42)
clf.fit(X_rus, y_rus)
precision_score(y_hold2, clf.predict(X_hold2))

0.16999733546496137

In [61]:
matthews_corrcoef(y_hold2, clf.predict(X_hold2))

0.14080632813207222

In [62]:
confusion_matrix(y_hold2, clf.predict(X_hold2))

array([[5923, 3115],
       [ 501,  638]])

In [63]:
matthews_corrcoef(y_rus, clf.predict(X_rus))

0.255477356257321

## Random Forest, undersampling, optimizing for Fbeta score

In [None]:
from sklearn.metrics import fbeta_score
best_score=0
for n_estimators in [2, 5, 10, 20, 50, 100]:
    for max_depth in [2, 5, 10, 20, 50, 100, 150, None]:
        for max_features in [2, 5, 10, 20, 50, 83]:
                clf=RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           max_features=max_features,
                                           n_jobs=-1, 
                                           random_state=42)
                clf.fit(X_rus, y_rus)
                current_score = fbeta_score(y_hold1, clf.predict(X_hold1), beta=2)
                if current_score > best_score:
                    best_params = {'n_estimators':n_estimators,
                                   'max_depth':max_depth,
                                   'max_features':max_features}
                    best_score = current_score

In [None]:
best_params

In [None]:
best_score

In [74]:
clf = RandomForestClassifier(n_estimators=100,
                             max_depth=10,
                             max_features=83,
                             n_jobs=-1, 
                             random_state=42)
clf.fit(X_rus, y_rus)
precision_score(y_hold2, clf.predict(X_hold2))

0.1657329598506069

In [75]:
matthews_corrcoef(y_hold2, clf.predict(X_hold2))

0.14553683888733207

In [76]:
confusion_matrix(y_hold2, clf.predict(X_hold2))

array([[5464, 3574],
       [ 429,  710]])

In [77]:
matthews_corrcoef(y_rus, clf.predict(X_rus))

0.5053606539587217

In [78]:
values = sorted(zip(cols, clf.feature_importances_), key=lambda x: x[1] * -1)
pd.DataFrame(values, columns=['variable', 'feature_importance'])

Unnamed: 0,variable,feature_importance
0,number_inpatient,0.177112
1,discharge_disposition_id,0.125422
2,num_lab_procedures,0.069210
3,num_medications,0.047857
4,diag_1,0.038710
5,time_in_hospital,0.038270
6,diag_3,0.032653
7,diag_2,0.030556
8,age,0.029556
9,medical_specialty,0.029315


In [82]:
clf = RandomForestClassifier(n_estimators=100,
                             max_depth=10,
                             max_features=83,
                             n_jobs=-1, 
                             random_state=42,
                             oob_score=True)
clf.fit(X_rus, y_rus)
precision_score(y_hold2, clf.predict(X_hold2))
oob_error = 1 - clf.oob_score_
oob_error

0.38210368893320035

In [80]:
clf

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=10, max_features=83, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
            oob_score=False, random_state=42, verbose=0, warm_start=False)

In [None]:
from sklearn.metrics import fbeta_score
best_score=0
for n_estimators in [2, 5, 10, 20, 50, 100]:
    for max_depth in [2, 5, 10, 20, 50, 100, 150, None]:
        for max_features in [2, 5, 10, 20, 50, 83]:
                clf=RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           max_features=max_features,
                                           n_jobs=-1, 
                                           random_state=42)
                clf.fit(X_rus, y_rus)
                current_score = fbeta_score(y_hold1, clf.predict(X_hold1), beta=1.5)
                if current_score > best_score:
                    best_params = {'n_estimators':n_estimators,
                                   'max_depth':max_depth,
                                   'max_features':max_features}
                    best_score = current_score

In [None]:
best_params

In [None]:
best_score

## GradientBoostingClassifier, undersampling, Matthews Correlation Coefficient

In [None]:
best_score=0
for n_estimators in [100, 200, 500, 800]:
    for max_depth in [2, 5, 10, 20, 40, None]:
        for max_features in [2, 5, 7, 10, 15, 20, 25, 30]:
            clf=GradientBoostingClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           max_features=max_features,
                                           random_state=42)
            clf.fit(X_rus, y_rus)
            current_score = matthews_corrcoef(y_hold1, clf.predict(X_hold1))
            if current_score > best_score:
                best_params = {'n_estimators':n_estimators,
                                'max_depth':max_depth,
                                'min_samples_split':min_samples_split,
                                'max_features':max_features}
                best_score = current_score

In [None]:
best_params

In [None]:
best_score

In [70]:
clf = RandomForestClassifier(n_estimators=100,
                             max_depth=2,
                             max_features=25,
                             n_jobs=-1, 
                             random_state=42)
clf.fit(X_rus, y_rus)
precision_score(y_hold2, clf.predict(X_hold2))

0.16561557444474911

In [71]:
confusion_matrix(y_hold2, clf.predict(X_hold2))

array([[5995, 3043],
       [ 535,  604]])

In [72]:
matthews_corrcoef(y_hold2, clf.predict(X_hold2))

0.127285621689267

In [73]:
matthews_corrcoef(y_rus, clf.predict(X_rus))

0.2145503711827032

## Resplitting the data with just one holdout set

In [87]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.drop(['encounter_id', 'patient_nbr', 'payer_code', 'readmitted'],
                                                           axis=1), 
                                                    df[['readmitted']], 
                                                    test_size=0.1, 
                                                    random_state=42)
rus = RandomUnderSampler(return_indices=True)
X_rus, y_rus, id_rus = rus.fit_sample(X_train, y_train)

In [89]:
from sklearn.metrics import fbeta_score
best_score=0
for n_estimators in [2, 5, 10, 20, 50, 100]:
    for max_depth in [2, 5, 10, 20, 50, 100, 150, None]:
        for max_features in [2, 5, 10, 20, 50, 83]:
                clf=RandomForestClassifier(n_estimators=n_estimators,
                                           max_depth=max_depth,
                                           max_features=max_features,
                                           n_jobs=-1, 
                                           random_state=42,
                                           oob_score=True)
                clf.fit(X_rus, y_rus)
                current_score = clf.oob_score_
                if current_score > best_score:
                    best_params = {'n_estimators':n_estimators,
                                   'max_depth':max_depth,
                                   'max_features':max_features}
                    best_score = current_score

  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inp

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scor

  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  warn("Some inputs do not have OOB scores. "
  predictions[k].sum(axis=1)[:, np.newaxis])
  predictions[k].sum(axis=1)[:, np.newaxis])
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0]

  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':


  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':


  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':
  if sys.path[0] == '':


In [90]:
best_params

{'n_estimators': 100, 'max_depth': 10, 'max_features': 50}

In [91]:
best_score

0.6221004208671822

It appears to be the out-of-bag accuracy score. Is this OK to use since the dataset is rebalanced using undersampling?