In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import pickle
import seaborn as sns
import random
sns.set()

In [2]:
# Read the two pickled objects from the clean_data directory into `df` and `df_labels`.
# NOTE(review): pickle.load can execute arbitrary code - only open files you produced yourself.
with open('clean_data/df.pkl', 'rb') as df_file:
    df = pickle.load(df_file)
with open('clean_data/df_labels.pkl', 'rb') as labels_file:
    df_labels = pickle.load(labels_file)

In [3]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,encounter_id,patient_nbr,race,gender,age,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,payer_code,...,level_4_diag_1_na,level_5_diag_1_na,level_2_diag_2_na,level_3_diag_2_na,level_4_diag_2_na,level_5_diag_2_na,level_2_diag_3_na,level_3_diag_3_na,level_4_diag_3_na,level_5_diag_3_na
0,2278392,8222157,3,1,0,0,0,1,1,0,...,0,0,1,1,1,1,1,1,1,1
1,149190,55629189,3,1,1,1,1,7,3,0,...,0,0,0,0,0,0,0,0,0,0
2,64410,86047875,1,1,2,1,1,7,2,0,...,0,0,0,0,0,0,0,0,0,0
3,500364,82442376,3,2,3,1,1,7,2,0,...,0,0,0,0,0,0,0,0,0,0
4,16680,42519267,3,2,4,1,1,7,1,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Row count per distinct value of the target column, ordered by value.
df['readmitted'].value_counts().sort_index()

0    54864
1    11357
2    35545
Name: readmitted, dtype: int64

In [5]:
# Collapse the target to binary: 1 where the stored code equals 1, otherwise 0.
is_code_one = df['readmitted'].eq(1)
df['readmitted'] = is_code_one.astype(int)
# The mean of a 0/1 column is the share of positive rows.
df['readmitted'].mean()

0.11159915885462728

In [6]:
def data_split(df, y_col, to_drop=None, random_state=None, hold1_size=.1, hold2_size=.1, hold3_size=.1):
    """Shuffle the rows of ``df`` and cut them into a training set and three hold-out sets.

    Rows are selected by position, so the result does not depend on what the
    frame's index labels look like.

    Parameters
    ----------
    df : pandas.DataFrame
        Source data; it is not modified.
    y_col : str
        Name of the target column.
    to_drop : list of str, optional
        Columns removed before splitting (used neither as feature nor target).
    random_state : optional
        Seed for the shuffle. A private generator is used, so the module-level
        ``random`` state is left alone. With ``None`` the module-level
        generator is used and the split is not reproducible.
    hold1_size, hold2_size, hold3_size : float
        Fraction of rows given to each hold-out set; the training set gets the
        remainder (70% with the defaults).

    Returns
    -------
    tuple of numpy.ndarray
        ``(X_train, y_train, X_hold1, y_hold1, X_hold2, y_hold2, X_hold3, y_hold3)``.

    Raises
    ------
    ValueError
        If a hold-out fraction is negative or the three fractions leave
        nothing for the training set.
    """
    hold_sizes = (hold1_size, hold2_size, hold3_size)
    # Rounding removes float noise such as 1 - .1 - .1 - .1 == 0.7000000000000001,
    # so the default cut points are exactly the fractions .7 / .8 / .9.
    train_frac = round(1 - hold3_size - hold2_size - hold1_size, 10)
    hold1_frac = round(1 - hold3_size - hold2_size, 10)
    hold2_frac = round(1 - hold3_size, 10)
    if min(hold_sizes) < 0 or train_frac <= 0:
        raise ValueError('hold-out fractions must be non-negative and sum to less than 1')

    df_filtered = df.drop(columns=to_drop) if to_drop is not None else df
    n_rows = len(df_filtered)

    # Shuffle positions (not index labels) so .iloc below is always valid.
    positions = list(range(n_rows))
    shuffler = random.Random(random_state) if random_state is not None else random
    shuffler.shuffle(positions)

    train_end = int(n_rows * train_frac)
    hold1_end = int(n_rows * hold1_frac)
    hold2_end = int(n_rows * hold2_frac)
    chunks = (
        positions[:train_end],
        positions[train_end:hold1_end],
        positions[hold1_end:hold2_end],
        positions[hold2_end:],
    )

    # Build the feature frame once instead of once per subset.
    features = df_filtered.drop(columns=[y_col])
    target = df_filtered[y_col]

    arrays = []
    for chunk in chunks:
        arrays.append(features.iloc[chunk].values)
        arrays.append(target.iloc[chunk].values)
    return tuple(arrays)
    
# Trial call: the returned 8-tuple is not assigned, so the notebook simply displays it.
data_split(df, 'readmitted', ['encounter_id', 'patient_nbr', 'payer_code'], 42)

(array([[0, 1, 4, ..., 0, 0, 0],
        [3, 2, 7, ..., 0, 0, 0],
        [1, 1, 5, ..., 0, 0, 0],
        ...,
        [5, 2, 4, ..., 1, 1, 1],
        [3, 2, 7, ..., 0, 0, 0],
        [3, 2, 7, ..., 0, 0, 0]]),
 array([0, 0, 0, ..., 0, 0, 1]),
 array([[3, 2, 6, ..., 0, 0, 0],
        [3, 2, 5, ..., 0, 0, 0],
        [3, 1, 6, ..., 0, 0, 0],
        ...,
        [3, 2, 6, ..., 0, 0, 0],
        [3, 1, 7, ..., 0, 0, 0],
        [3, 2, 6, ..., 0, 0, 0]]),
 array([1, 0, 0, ..., 0, 0, 0]),
 array([[3, 2, 7, ..., 0, 0, 0],
        [3, 1, 4, ..., 0, 0, 0],
        [3, 2, 5, ..., 0, 0, 0],
        ...,
        [3, 2, 5, ..., 0, 0, 0],
        [1, 1, 6, ..., 0, 0, 0],
        [3, 2, 8, ..., 0, 0, 0]]),
 array([0, 0, 0, ..., 0, 0, 0]),
 array([[3, 1, 8, ..., 0, 0, 0],
        [4, 2, 5, ..., 0, 0, 0],
        [3, 1, 7, ..., 0, 0, 0],
        ...,
        [1, 1, 4, ..., 0, 0, 0],
        [1, 2, 6, ..., 0, 0, 0],
        [3, 1, 7, ..., 0, 0, 0]]),
 array([0, 0, 0, ..., 0, 0, 0]))

## First try with random forest

In [7]:
from sklearn.ensemble import RandomForestClassifier

# Build the train / hold-out arrays used by the cells below (identifier columns dropped, seed 42).
split_options = dict(
    df=df,
    y_col='readmitted',
    to_drop=['encounter_id', 'patient_nbr', 'payer_code'],
    random_state=42,
    hold1_size=.1,
    hold2_size=.1,
    hold3_size=.1,
)
(X_train, y_train,
 X_hold1, y_hold1,
 X_hold2, y_hold2,
 X_hold3, y_hold3) = data_split(**split_options)
print(X_train.shape)
print(y_train.shape)

(71236, 83)
(71236,)


In [8]:
from sklearn.metrics import confusion_matrix

# Baseline: default random forest fitted on the unbalanced training split.
clf = RandomForestClassifier(n_jobs=-1, random_state=42).fit(X_train, y_train)
# Rows are true classes, columns are predicted classes.
confusion_matrix(y_true=y_hold1, y_pred=clf.predict(X_hold1))



array([[8994,   49],
       [1109,   24]])

In [9]:
from sklearn.metrics import recall_score

# Recall of the current `clf` on hold-out set 1.
recall_score(y_true=y_hold1, y_pred=clf.predict(X_hold1))

0.02118270079435128

In [10]:
from sklearn.metrics import precision_score

# Precision of the current `clf` on hold-out set 1.
precision_score(y_true=y_hold1, y_pred=clf.predict(X_hold1))

0.3287671232876712

## Undersampling to deal with imbalanced classes

In [11]:
from imblearn.under_sampling import RandomUnderSampler

# Balance the training split by randomly discarding majority-class rows.
# Seeded so the sample (and every model fitted on it) is reproducible.
rus = RandomUnderSampler(random_state=42)
# fit_resample is the same resampling API the SMOTE/ADASYN cells use; the older
# return_indices / fit_sample spellings are deprecated in imbalanced-learn.
X_rus, y_rus = rus.fit_resample(X_train, y_train)
# Positions in X_train of the rows that were kept.
id_rus = rus.sample_indices_
print(X_rus.shape)
print(y_rus.shape)

(16048, 83)
(16048,)


In [12]:
# Random forest fitted on the under-sampled training data, checked on hold-out set 1.
rus_forest_options = dict(n_jobs=-1, max_features=10, random_state=42, n_estimators=100)
clf_rus = RandomForestClassifier(**rus_forest_options).fit(X_rus, y_rus)
confusion_matrix(y_true=y_hold1, y_pred=clf_rus.predict(X_hold1))

array([[5503, 3540],
       [ 464,  669]])

In [13]:
# Recall of the under-sampled forest on hold-out set 1.
recall_score(y_true=y_hold1, y_pred=clf_rus.predict(X_hold1))

0.5904677846425419

In [14]:
# Precision of the under-sampled forest on hold-out set 1.
precision_score(y_true=y_hold1, y_pred=clf_rus.predict(X_hold1))

0.15894511760513186

## Optimizing for recall

In [15]:
from itertools import product
# NOTE(review): GridSearchCV is imported but not used by any visible cell.
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the random forest; the search below walks it exhaustively.
parameters = {'n_estimators':[10, 20, 40, 60, 100, 200, 500], 
              'max_depth':[2, 5, 10, 20, 40, None],
              'min_samples_split':[2, 3, 5, 7, 10, 50],
              'max_features':[2, 5, 7, 10, 15, 20, 25, 30]}

# Fallback reported if no candidate scores above 0 (key spelling fixed: 'n_estimators').
best_params={'n_estimators':10,
             'max_depth':2,
             'min_samples_split':2,
             'max_features':2}
best_score=0
# Fit on the under-sampled training data, select on hold-out set 1 recall.
# product() visits combinations in the same order as four nested loops
# (n_estimators slowest, max_features fastest); ties keep the first winner.
# This is 7 * 6 * 6 * 8 = 2016 fits, so expect a long run.
for n_estimators, max_depth, min_samples_split, max_features in product(
        parameters['n_estimators'],
        parameters['max_depth'],
        parameters['min_samples_split'],
        parameters['max_features']):
    clf=RandomForestClassifier(n_estimators=n_estimators,
                               max_depth=max_depth,
                               min_samples_split=min_samples_split,
                               max_features=max_features,
                               n_jobs=-1, random_state=42)
    clf.fit(X_rus, y_rus)
    current_score = recall_score(y_hold1, clf.predict(X_hold1))
    if current_score > best_score:
        best_params = {'n_estimators':n_estimators,
                       'max_depth':max_depth,
                       'min_samples_split':min_samples_split,
                       'max_features':max_features}
        best_score = current_score

KeyboardInterrupt: 

In [None]:
# Best combination found by the most recently run search.
best_params

In [None]:
# Score reached by that combination on hold-out set 1.
best_score

In [16]:
# Forest with fixed hyper-parameters, fitted on the under-sampled data; recall on hold-out set 2.
recall_forest_options = dict(n_estimators=20,
                             max_depth=10,
                             min_samples_split=5,
                             max_features=2)
clf = RandomForestClassifier(n_jobs=-1, random_state=42, **recall_forest_options).fit(X_rus, y_rus)
recall_score(y_true=y_hold2, y_pred=clf.predict(X_hold2))

0.5996488147497805

In [17]:
# Confusion matrix of the current `clf` on hold-out set 2 (rows: true class, columns: predicted).
confusion_matrix(y_true=y_hold2, y_pred=clf.predict(X_hold2))

array([[5175, 3863],
       [ 456,  683]])

In [18]:
# Precision of the current `clf` on hold-out set 2.
precision_score(y_true=y_hold2, y_pred=clf.predict(X_hold2))

0.15024197096348438

## Optimizing for precision

In [None]:
from itertools import product

# Fallback reported if no candidate scores above 0 (key spelling fixed: 'n_estimators').
best_params={'n_estimators':10,
             'max_depth':2,
             'min_samples_split':2,
             'max_features':2}
best_score=0
# Exhaustive random-forest search on the under-sampled data, selecting on
# hold-out set 1 precision. product() keeps the nested-loop visiting order
# (n_estimators slowest, max_features fastest); ties keep the first winner.
for n_estimators, max_depth, min_samples_split, max_features in product(
        [10, 20, 40, 60, 100, 200, 500],
        [2, 5, 10, 20, 40, None],
        [2, 3, 5, 7, 10, 50],
        [2, 5, 7, 10, 15, 20, 25, 30]):
    clf=RandomForestClassifier(n_estimators=n_estimators,
                               max_depth=max_depth,
                               min_samples_split=min_samples_split,
                               max_features=max_features,
                               n_jobs=-1, random_state=42)
    clf.fit(X_rus, y_rus)
    current_score = precision_score(y_hold1, clf.predict(X_hold1))
    if current_score > best_score:
        best_params = {'n_estimators':n_estimators,
                       'max_depth':max_depth,
                       'min_samples_split':min_samples_split,
                       'max_features':max_features}
        best_score = current_score

In [None]:
# Best combination found by the most recently run search.
best_params

In [None]:
# Score reached by that combination on hold-out set 1.
best_score

In [19]:
# Forest with fixed hyper-parameters, fitted on the under-sampled data; precision on hold-out set 2.
precision_forest_options = dict(n_estimators=500,
                                max_depth=2,
                                min_samples_split=2,
                                max_features=2)
clf = RandomForestClassifier(n_jobs=-1, random_state=42, **precision_forest_options).fit(X_rus, y_rus)
precision_score(y_true=y_hold2, y_pred=clf.predict(X_hold2))

0.1437837837837838

In [20]:
# Confusion matrix of the current `clf` on hold-out set 2 (rows: true class, columns: predicted).
confusion_matrix(y_true=y_hold2, y_pred=clf.predict(X_hold2))

array([[5078, 3960],
       [ 474,  665]])

In [21]:
# Recall of the current `clf` on hold-out set 2.
recall_score(y_true=y_hold2, y_pred=clf.predict(X_hold2))

0.5838454784899034

## Trying SVM

In [22]:
from sklearn.svm import SVC

# Support-vector classifier fitted on the under-sampled data; precision on hold-out set 2.
clf = SVC(gamma='auto').fit(X_rus, y_rus)
precision_score(y_true=y_hold2, y_pred=clf.predict(X_hold2))

0.1159235668789809

In [23]:
# Recall of the current `clf` on hold-out set 2.
recall_score(y_true=y_hold2, y_pred=clf.predict(X_hold2))

0.7989464442493416

In [24]:
# Confusion matrix of the current `clf` on hold-out set 2 (rows: true class, columns: predicted).
confusion_matrix(y_true=y_hold2, y_pred=clf.predict(X_hold2))

array([[2098, 6940],
       [ 229,  910]])

## Trying AdaBoost

In [25]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

# AdaBoost over unrestricted decision trees, fitted on the under-sampled data.
# Seeded like the random-forest cells so the reported precision is reproducible.
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(), random_state=42)
ada_clf.fit(X_rus, y_rus)
precision_score(y_hold2, ada_clf.predict(X_hold2))

0.13733998698199176

In [26]:
# AdaBoost with 600 depth-2 trees, fitted on the under-sampled data.
# Seeded like the random-forest cells so the reported precision is reproducible.
bdt_real = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=2),
    n_estimators=600,
    learning_rate=1,
    random_state=42)
bdt_real.fit(X_rus, y_rus)
precision_score(y_hold2, bdt_real.predict(X_hold2))

0.15231166392865525

In [28]:
# Confusion matrix of the 600-tree AdaBoost model on hold-out set 2 (rows: true class, columns: predicted).
confusion_matrix(y_true=y_hold2, y_pred=bdt_real.predict(X_hold2))

array([[5426, 3612],
       [ 490,  649]])

## Trying GradientBoostingClassifier

In [29]:
from sklearn.ensemble import GradientBoostingClassifier

# Gradient boosting with default hyper-parameters on the under-sampled data.
# Seeded to match the tuned gradient-boosting cell and keep the result reproducible.
gbc = GradientBoostingClassifier(random_state=42)
gbc.fit(X_rus, y_rus)
precision_score(y_hold2, gbc.predict(X_hold2))

0.1714503429006858

In [None]:
from itertools import product

# Reset the trackers: without this the search competes against, and may end up
# reporting, the winner of whichever search ran before it.
best_params = None
best_score = 0
# Exhaustive gradient-boosting search on the under-sampled data, selecting on
# hold-out set 1 precision. product() keeps the nested-loop visiting order
# (n_estimators slowest, max_features fastest); ties keep the first winner.
for n_estimators, max_depth, min_samples_split, max_features in product(
        [100, 200, 500, 800],
        [2, 5, 10, 20, 40, None],
        [2, 3, 5, 7, 10, 50],
        [2, 5, 7, 10, 15, 20, 25, 30]):
    clf=GradientBoostingClassifier(n_estimators=n_estimators,
                                   max_depth=max_depth,
                                   min_samples_split=min_samples_split,
                                   max_features=max_features,
                                   random_state=42)
    clf.fit(X_rus, y_rus)
    current_score = precision_score(y_hold1, clf.predict(X_hold1))
    if current_score > best_score:
        best_params = {'n_estimators':n_estimators,
                       'max_depth':max_depth,
                       'min_samples_split':min_samples_split,
                       'max_features':max_features}
        best_score = current_score

In [None]:
# Best combination found by the most recently run search.
best_params

In [30]:
# Gradient boosting with fixed hyper-parameters on the under-sampled data; precision on hold-out set 2.
gbc_options = dict(n_estimators=100,
                   max_depth=2,
                   min_samples_split=2,
                   max_features=15,
                   random_state=42)
gbc = GradientBoostingClassifier(**gbc_options).fit(X_rus, y_rus)
precision_score(y_true=y_hold2, y_pred=gbc.predict(X_hold2))

0.1776061776061776

## Trying oversampling

In [31]:
from imblearn.over_sampling import SMOTE, ADASYN

# Over-sample the training split with SMOTE, then fit a default random forest.
# The sampler is seeded so the synthetic rows, and the scores below, are reproducible.
X_smote, y_smote = SMOTE(random_state=42).fit_resample(X_train, y_train)
clf_smote = RandomForestClassifier(n_jobs=-1, random_state=42)
clf_smote.fit(X_smote, y_smote)
confusion_matrix(y_hold1, clf_smote.predict(X_hold1))



array([[8547,  496],
       [1040,   93]])

In [32]:
# Precision of the SMOTE-trained forest on hold-out set 2.
precision_score(y_true=y_hold2, y_pred=clf_smote.predict(X_hold2))

0.16891891891891891

In [33]:
# Over-sample the training split with ADASYN, then fit a default random forest.
# The sampler is seeded so the synthetic rows, and the scores below, are reproducible.
X_adasyn, y_adasyn = ADASYN(random_state=42).fit_resample(X_train, y_train)
clf_adasyn = RandomForestClassifier(n_jobs=-1, random_state=42)
clf_adasyn.fit(X_adasyn, y_adasyn)
confusion_matrix(y_hold1, clf_adasyn.predict(X_hold1))



array([[8498,  545],
       [1026,  107]])

In [34]:
# Precision of the ADASYN-trained forest on hold-out set 2.
precision_score(y_true=y_hold2, y_pred=clf_adasyn.predict(X_hold2))

0.15944272445820434

## Trying RandomForest without oversampling

In [35]:
# Default random forest on the original (not resampled) training split; precision on hold-out set 2.
clf = RandomForestClassifier(n_jobs=-1, random_state=42).fit(X_train, y_train)
precision_score(y_true=y_hold2, y_pred=clf.predict(X_hold2))



0.3392857142857143

In [36]:
# Confusion matrix of the current `clf` on hold-out set 2 (rows: true class, columns: predicted).
confusion_matrix(y_true=y_hold2, y_pred=clf.predict(X_hold2))

array([[9001,   37],
       [1120,   19]])

In [37]:
# Recall of the current `clf` on hold-out set 2.
recall_score(y_true=y_hold2, y_pred=clf.predict(X_hold2))

0.016681299385425813

In [None]:
from itertools import product

# Reset both trackers; resetting only the score would leave a stale best_params
# from an earlier search on display if no candidate here scores above 0.
best_params = None
best_score=0
# Exhaustive random-forest search on the original (not resampled) training split
# with min_samples_leaf fixed at 10, selecting on hold-out set 1 precision.
# NOTE(review): precision alone can be maximised by a model that predicts almost
# no positives - check the confusion matrix of the winner before trusting it.
for n_estimators, max_depth, min_samples_split, max_features in product(
        [10, 20, 40, 60, 100, 200],
        [2, 5, 10, 20, 40, None],
        [2, 3, 5, 7, 10, 50],
        [2, 5, 7, 10, 15, 20, 25, 30]):
    clf=RandomForestClassifier(n_estimators=n_estimators,
                               max_depth=max_depth,
                               min_samples_split=min_samples_split,
                               max_features=max_features,
                               min_samples_leaf=10,
                               n_jobs=-1, random_state=42)
    clf.fit(X_train, y_train)
    current_score = precision_score(y_hold1, clf.predict(X_hold1))
    if current_score > best_score:
        best_params = {'n_estimators':n_estimators,
                       'max_depth':max_depth,
                       'min_samples_split':min_samples_split,
                       'max_features':max_features}
        best_score = current_score

In [None]:
# Best combination found by the most recently run search.
best_params

In [None]:
# Score reached by that combination on hold-out set 1.
best_score

In [38]:
# Forest with fixed hyper-parameters on the original training split; precision on hold-out set 2.
plain_forest_options = dict(n_estimators=20,
                            max_depth=20,
                            min_samples_split=50,
                            max_features=5,
                            min_samples_leaf=10)
clf = RandomForestClassifier(n_jobs=-1, random_state=42, **plain_forest_options).fit(X_train, y_train)
precision_score(y_true=y_hold2, y_pred=clf.predict(X_hold2))

  'precision', 'predicted', average, warn_for)


0.0

In [39]:
# Confusion matrix of the current `clf` on hold-out set 2 (rows: true class, columns: predicted).
confusion_matrix(y_true=y_hold2, y_pred=clf.predict(X_hold2))

array([[9038,    0],
       [1139,    0]])

In [40]:
# Precision of the current `clf` on hold-out set 1, the set the search selected on.
precision_score(y_true=y_hold1, y_pred=clf.predict(X_hold1))

1.0

In [41]:
# Confusion matrix of the current `clf` on hold-out set 1 (rows: true class, columns: predicted).
confusion_matrix(y_true=y_hold1, y_pred=clf.predict(X_hold1))

array([[9043,    0],
       [1132,    1]])