# CAPSTONE PROJECT - PART 2
__Michael Gat__  
__General Assembly Santa Monica, Data Science Immersive, Summer 2016__

In this notebook, we'll build the initial model using a subset of the data that includes only the commonly-used fields. This will be simpler but will exclude some information that might be useful. We will come back to those fields in __Part 3__ once the model is working and fairly robust.

In [70]:
import pandas as pd
import numpy as np
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.cross_validation import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.datasets import make_classification
from imblearn.under_sampling import RandomUnderSampler

## READ IN DATA
We have a clean dataset from __Part 1__. We'll import that, then select only the columns we want to deal with at this time.

In [71]:
df = pd.read_csv('diabetic_data_clean.csv')

In [72]:
df.shape

(101766, 62)

In [73]:
df.columns

Index([u'race', u'gender', u'admission_type_id', u'discharge_disposition_id',
       u'admission_source_id', u'time_in_hospital', u'num_lab_procedures',
       u'num_procedures', u'num_medications', u'number_outpatient',
       u'number_emergency', u'number_inpatient', u'number_diagnoses',
       u'max_glu_serum', u'A1Cresult', u'metformin', u'repaglinide',
       u'nateglinide', u'chlorpropamide', u'glimepiride', u'acetohexamide',
       u'glipizide', u'glyburide', u'tolbutamide', u'pioglitazone',
       u'rosiglitazone', u'acarbose', u'miglitol', u'troglitazone',
       u'tolazamide', u'examide', u'citoglipton', u'insulin',
       u'glyburide-metformin', u'glipizide-metformin',
       u'glimepiride-pioglitazone', u'metformin-rosiglitazone',
       u'metformin-pioglitazone', u'change', u'diabetesMed', u'age_group',
       u'readmit', u'428', u'276', u'414', u'401', u'427', u'599', u'496',
       u'403', u'486', u'786', u'780', u'491', u'410', u'682', u'584', u'585',
       u'707', u'5

### Extract key features from the larger dataset
A number of the features in the dataset are rather sparsely populated. While thse may ultimately provide interesting information, we'll start by building a model that uses features that exist consistently throughout the dataset. To that end, we'll elminiate the information about drugs administered (except for the generic "diabetes med") and the specific diagnosis codes. We'll come back to these later once we have a working model.

In [74]:
df_model_1 = df.ix[:,0:15]
df_model_1 = df_model_1.join(df.ix[:,['change', 'diabetesMed', 'age_group', 'readmit']])
df_model_1.head()

Unnamed: 0,race,gender,admission_type_id,discharge_disposition_id,admission_source_id,time_in_hospital,num_lab_procedures,num_procedures,num_medications,number_outpatient,number_emergency,number_inpatient,number_diagnoses,max_glu_serum,A1Cresult,change,diabetesMed,age_group,readmit
0,1,0,6,25,1,1,41,0,1,0,0,0,1,0,0,0,0,0,0
1,1,0,1,1,7,3,59,0,18,0,0,0,9,0,0,1,1,1,0
2,2,0,1,1,7,2,11,5,13,2,0,1,6,0,0,0,1,2,0
3,1,1,1,1,7,2,44,1,16,0,0,0,7,0,0,1,1,3,0
4,1,1,1,1,7,1,51,0,8,0,0,0,5,0,0,1,1,4,0


In [75]:
df_model_1.dtypes

race                        int64
gender                      int64
admission_type_id           int64
discharge_disposition_id    int64
admission_source_id         int64
time_in_hospital            int64
num_lab_procedures          int64
num_procedures              int64
num_medications             int64
number_outpatient           int64
number_emergency            int64
number_inpatient            int64
number_diagnoses            int64
max_glu_serum               int64
A1Cresult                   int64
change                      int64
diabetesMed                 int64
age_group                   int64
readmit                     int64
dtype: object

In [76]:
df_model_1.shape

(101766, 19)

## BUILD FIRST (SIMPLER) MODEL
Just test a handful of models, prior to doing any kind of feature selection.

### Split the data into test and train sets

In [77]:
X = df_model_1.ix[:,0:17]
y = df_model_1.ix[:,18]

X_std = StandardScaler().fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.75)

### Apply the Chi Squared feature selector, choosing to go with six features

In [78]:
ch2 = SelectKBest(chi2, k=6)
X_train_fit = ch2.fit_transform(X_train, y_train)

### Identify top selected features

In [79]:
col_indices = ch2.get_support(indices=True)
for i in col_indices:
    print X_train.columns.values[i]

discharge_disposition_id
time_in_hospital
num_medications
number_outpatient
number_emergency
number_inpatient


In [80]:
ch2.scores_

array([  2.13306166e-01,   1.09527880e-01,   1.50421052e+00,
         4.38905409e+02,   1.72059128e-01,   8.32041511e+01,
         4.90788270e+01,   5.40893239e+00,   1.07620575e+02,
         5.95318410e+01,   5.66649654e+02,   1.79316259e+03,
         2.62778389e+01,   1.34245559e+00,   1.78533261e+00,
         1.60376627e+00,   3.86808491e+00])

### Transform/Fit

In [81]:
X_train_fit

array([[ 3,  7, 16,  0,  0,  0],
       [ 1,  2,  5,  0,  0,  0],
       [ 1,  4, 13,  0,  0,  1],
       ..., 
       [ 6, 13, 28,  0,  0,  0],
       [ 1,  4, 13,  0,  0,  0],
       [ 1, 12, 21,  0,  0,  1]])

In [82]:
X_test_xform = ch2.transform(X_test)
X_test_xform

array([[18,  5, 10,  0,  0,  1],
       [ 6,  8, 20,  0,  1,  0],
       [ 1,  4, 22,  1,  0,  1],
       ..., 
       [ 4,  3, 15,  0,  0,  1],
       [ 3,  6, 12,  0,  0,  0],
       [ 2,  5, 15,  0,  0,  0]])

### Run models, collect statistics.
For consolidated statistics/results see Capstone_Results.xlsx in this directory

In [83]:
classifiers = [DecisionTreeClassifier(max_depth=7), \
    RandomForestClassifier(max_depth=7, n_estimators=10, max_features=1), \
    GaussianNB(), LogisticRegression()]


In [84]:
    for clf in classifiers:
        clf.fit(X_train_fit, y_train)
        score = clf.score(X_test_xform, y_test)
        y_pred = clf.predict(X_test_xform)
        cm = confusion_matrix(y_test, y_pred)
        fpr, tpr, thresholds = roc_curve(y_test, y_pred)
        print clf
        print score
        print cm
        print "AUC Metrics:"
        print auc(fpr, tpr)
        print

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=7,
            max_features=None, max_leaf_nodes=None, min_samples_leaf=1,
            min_samples_split=2, min_weight_fraction_leaf=0.0,
            presort=False, random_state=None, splitter='best')
0.886511627907
[[67369   516]
 [ 8146   294]]
AUC Metrics:
0.513616516572

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=7, max_features=1, max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
0.889315427448
[[67853    32]
 [ 8416    24]]
AUC Metrics:
0.501186108232

GaussianNB()
0.864120537177
[[64817  3068]
 [ 7303  1137]]
AUC Metrics:
0.544760780795

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n