# Objective: Create model to predict onset of Diabetes amongst Pima Indians #

We will be using the Pima Indians dataset which records the onset of Diabetes within 5 years.
 
* Number of Instances: 768
* Number of Attributes: 8 plus class 
*  For Each Attribute: (all numeric-valued)
       1. Number of times pregnant
       2. Plasma glucose concentration at 2 hours in an oral glucose tolerance test
       3. Diastolic blood pressure (mm Hg)
       4. Triceps skin fold thickness (mm)
       5. 2-Hour serum insulin (mu U/ml)
       6. Body mass index (weight in kg/(height in m)^2)
       7. Diabetes pedigree function
       8. Age (years)
       9. Class variable (0: no diabetes or 1: diabetic)

# Check backend of Keras configuration #

```
Adarshs-MacBook-Pro:Github adarshnair$ cd ~/.keras
Adarshs-MacBook-Pro:.keras adarshnair$ ls
keras.json
Adarshs-MacBook-Pro:.keras adarshnair$ cat keras.json
{
    "image_dim_ordering": "tf", 
    "epsilon": 1e-07, 
    "floatx": "float32", 
    "backend": "tensorflow"
}
Adarshs-MacBook-Pro:.keras adarshnair$
```


## Part 1: Step 1: Load dataset ##

In [1]:
# Create your first MLP in Keras
from keras.models import Sequential
from keras.layers import Dense
import numpy
# Seed NumPy's global random generator so repeated runs draw the same numbers.
seed = 7
numpy.random.seed(seed)

# Parse the comma-separated Pima Indians file into a NumPy array.
dataset = numpy.loadtxt(
    "pima-indians-diabetes.csv",
    delimiter=",",
)

Using TensorFlow backend.


## Step 2: Split data and define model ##

In [2]:
'''
Fully connected layers are defined using the Dense class. 
'''

# Columns 0-7 hold the eight input features (X); column 8 is the class label (Y).
X = dataset[:, :8]
Y = dataset[:, 8]

# Sequential stacks layers in the order they are added.
model = Sequential()

# First hidden layer: 12 ReLU units fed by the 8 input features.
# 'uniform' draws the initial weights from a uniform random distribution.
model.add(Dense(12, input_dim=8, init='uniform', activation='relu'))

# Second hidden layer: 8 ReLU units (fed by the 12 outputs of the layer above).
model.add(Dense(8, init='uniform', activation='relu'))

# Output layer: a single sigmoid unit for the binary class.
model.add(Dense(1, init='uniform', activation='sigmoid'))

## Step 3: Compile and train model ## 

* loss: the function used to evaluate a set of weights (lower is better)
* optimizer: the algorithm that searches through different weights for the network to find the best ones

In [None]:
# Configure training: Adam optimizer, logarithmic (binary cross-entropy) loss,
# and accuracy reported as the metric for this binary classification problem.
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)

### Automatic verification dataset ### 

In [6]:
'''
Fit the model using an automatic verification dataset
'''
# validation_split=0.33 lets Keras hold back 33% of the rows for validation;
# nb_epoch is the number of passes over the training rows and batch_size the
# number of rows evaluated before each weight update.
model.fit(X, Y, validation_split=0.33, nb_epoch=150, batch_size=10)

Train on 514 samples, validate on 254 samples
Epoch 1/150
Epoch 2/150
Epoch 3/150
Epoch 4/150
Epoch 5/150
Epoch 6/150
Epoch 7/150
Epoch 8/150
Epoch 9/150
Epoch 10/150
Epoch 11/150
Epoch 12/150
Epoch 13/150
Epoch 14/150
Epoch 15/150
Epoch 16/150
Epoch 17/150
Epoch 18/150
Epoch 19/150
Epoch 20/150
Epoch 21/150
Epoch 22/150
Epoch 23/150
Epoch 24/150
Epoch 25/150
Epoch 26/150
Epoch 27/150
Epoch 28/150
Epoch 29/150
Epoch 30/150
Epoch 31/150
Epoch 32/150
Epoch 33/150
Epoch 34/150
Epoch 35/150
Epoch 36/150
Epoch 37/150
Epoch 38/150
Epoch 39/150
Epoch 40/150
Epoch 41/150
Epoch 42/150
Epoch 43/150
Epoch 44/150
Epoch 45/150
Epoch 46/150
Epoch 47/150
Epoch 48/150
Epoch 49/150
Epoch 50/150
Epoch 51/150
Epoch 52/150
Epoch 53/150
Epoch 54/150
Epoch 55/150
Epoch 56/150
Epoch 57/150
Epoch 58/150
Epoch 59/150
Epoch 60/150
Epoch 61/150
Epoch 62/150
Epoch 63/150
Epoch 64/150
Epoch 65/150
Epoch 66/150
Epoch 67/150
Epoch 68/150
Epoch 69/150
Epoch 70/150
Epoch 71/150
Epoch 72/150
Epoch 73/150
Epoch 74/150
E

<keras.callbacks.History at 0x10420c9d0>

### Manual verification dataset ###

In [None]:
'''
Fit the model using a manual verification dataset
'''
# Imported here so this cell does not fail with a NameError: the name
# train_test_split is not brought into scope by any earlier cell shown.
from sklearn.model_selection import train_test_split

# Hold out 33% of the rows as a validation set; random_state=seed makes the
# split repeatable between runs.
X_train, X_test, y_train, y_test = train_test_split(X, 
                                                    Y, 
                                                    test_size=0.33, 
                                                    random_state=seed)

# Train on the training rows only and report loss/accuracy on the held-out
# rows after every epoch.
model.fit(X_train, # features
          y_train, # target
          validation_data=(X_test,y_test),
          nb_epoch=150, # number of passes over the training rows
          batch_size=10) # number of instances evaluated before a weight update

### Stratified k-fold cross validation ###

In [10]:
from sklearn.model_selection import StratifiedKFold

# Ten stratified folds, shuffled with the fixed seed so the splits are repeatable.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)
cvscores = []
for train, test in kfold.split(X, Y):
    # Build a fresh, untrained network for every fold.
    model = Sequential()
    model.add(Dense(12, input_dim=8, init='uniform', activation='relu'))
    model.add(Dense(8, init='uniform', activation='relu'))
    model.add(Dense(1, init='uniform', activation='sigmoid'))
    model.compile(optimizer='adam',
                  loss='binary_crossentropy',
                  metrics=['accuracy'])

    # Train silently on this fold's training rows.
    model.fit(X[train], Y[train], nb_epoch=150, batch_size=10, verbose=0)

    # Score on the held-out rows; entry 1 is the metric requested in compile().
    scores = model.evaluate(X[test], Y[test], verbose=0)
    fold_accuracy = scores[1] * 100

    print("%s: %.2f%%" % (model.metrics_names[1], fold_accuracy))
    cvscores.append(fold_accuracy)

# Mean and standard deviation of the per-fold accuracies.
print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))

acc: 71.43%
acc: 71.43%
acc: 75.32%
acc: 80.52%
acc: 79.22%
acc: 75.32%
acc: 75.32%
acc: 74.03%
acc: 75.00%
acc: 73.68%
75.13% (+/- 2.77%)


## Step 4: Performance evaluation ##

In [4]:
# Evaluate the current `model` on the complete dataset and print its second
# reported value (the metric requested at compile time) as a percentage.
# NOTE(review): if `model` was fitted on rows of X, this is not a held-out estimate.
scores = model.evaluate(X, Y)
metric_label = model.metrics_names[1]
print("%s: %.2f%%" % (metric_label, scores[1] * 100))



## Part 2: Implementing model using KerasClassifier ##

* KerasClassifier
* cross_val_score

In [11]:
# MLP for Pima Indians Dataset with 10-fold cross validation via sklearn
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import cross_val_score

# Function to create model, required for KerasClassifier
def create_model():
    """Build and compile the 8-input MLP used by KerasClassifier.

    The network has two ReLU hidden layers (12 and 8 units) and a single
    sigmoid output unit, all with 'uniform' weight initialization. It is
    compiled with binary cross-entropy loss, the Adam optimizer and the
    accuracy metric.

    Returns:
        The compiled Sequential model.
    """
    net = Sequential()

    # Hidden layers
    net.add(Dense(12, input_dim=8, init='uniform', activation='relu'))
    net.add(Dense(8, init='uniform', activation='relu'))

    # Output layer
    net.add(Dense(1, init='uniform', activation='sigmoid'))

    net.compile(optimizer='adam',
                loss='binary_crossentropy',
                metrics=['accuracy'])
    return net

# Seed NumPy's global random generator for repeatable runs.
seed = 7
numpy.random.seed(seed)

# Reload the dataset and separate the eight feature columns from the label column.
dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",")
X = dataset[:, :8]
Y = dataset[:, 8]

# Wrap the Keras model builder so scikit-learn can treat it as an estimator;
# the training settings are given to the wrapper.
model = KerasClassifier(build_fn=create_model,
                        nb_epoch=150,
                        batch_size=10,
                        verbose=0)

# Ten shuffled, stratified folds with a fixed seed.
kfold = StratifiedKFold(n_splits=10, shuffle=True, random_state=seed)

# One score per fold; report their mean.
results = cross_val_score(model, X, Y, cv=kfold)
mean_score = results.mean()
print(mean_score)

0.742139439747


## Part 3: Implementing model using Grid Search ##

Grid search is used to evaluate different configurations for the NN model and report the combination that provides the best estimated performance. 

* Optimizers: searching for different weight values
* Initializers: preparing for network weights using different schemes
* Number of epochs: training the model for different numbers of passes over the training set
* Batches: varying the number of samples before weight updates. 

In [None]:
# MLP for Pima Indians Dataset with grid search via sklearn
from sklearn.model_selection import GridSearchCV


# Function to create model, required for KerasClassifier
def create_model(optimizer='rmsprop', 
                 init='glorot_uniform'):
    """Build and compile the MLP with a configurable optimizer and initializer.

    Args:
        optimizer: optimizer passed to model.compile().
        init: weight initialization scheme applied to every Dense layer.

    Returns:
        The compiled Sequential model (12 and 8 ReLU units, then 1 sigmoid
        unit) using binary cross-entropy loss and the accuracy metric.
    """
    net = Sequential()
    net.add(Dense(12, input_dim=8, init=init, activation='relu'))
    net.add(Dense(8, init=init, activation='relu'))
    net.add(Dense(1, init=init, activation='sigmoid'))

    net.compile(optimizer=optimizer,
                loss='binary_crossentropy',
                metrics=['accuracy'])
    return net

# Seed NumPy's global random generator for repeatable runs.
seed = 7
numpy.random.seed(seed)

# Reload the dataset and separate the eight feature columns from the label column.
dataset = numpy.loadtxt("pima-indians-diabetes.csv", delimiter=",")
X = dataset[:, :8]
Y = dataset[:, 8]

# Wrap the model builder; the training settings come from the grid below.
model = KerasClassifier(build_fn=create_model, verbose=0)
'''
Grid Seach part of the model.
'''
# Candidate values for each setting being searched.
optimizers = ['rmsprop', 'adam']
init = ['glorot_uniform', 'normal', 'uniform']
epochs = numpy.array([50, 100, 150])
batches = numpy.array([5, 10, 20])

# Map each setting name to its candidates: 2 * 3 * 3 * 3 = 54 combinations.
param_grid = {
    'optimizer': optimizers,
    'nb_epoch': epochs,
    'batch_size': batches,
    'init': init,
}

# Search every combination in the grid with the wrapped Keras model as the estimator.
grid = GridSearchCV(estimator=model, param_grid=param_grid)

# Run the search.
grid_result = grid.fit(X, Y)

# Summarize results: best cross-validated score and the parameters that produced it.
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))

# Report the mean and standard deviation of the test score for every
# parameter combination. `cv_results_` is used instead of `grid_scores_`,
# which was deprecated in scikit-learn 0.18 and removed in 0.20.
cv_results = grid_result.cv_results_
for mean_score, std_score, params in zip(cv_results['mean_test_score'],
                                         cv_results['std_test_score'],
                                         cv_results['params']):
    print("%f (%f) with: %r" % (mean_score, std_score, params))