In [1]:
import pandas as pd, numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler
import keras
from keras.layers import Dense, Dropout
from keras.models import Sequential
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.metrics import confusion_matrix

Using TensorFlow backend.


In [2]:
columns = ['Pregnancies', 'Glucose', 'BloodPressure', 'SkinThickness', 
           'Insulin', 'BMI', 'DiabetesPedigreeFunction', 'Age', 'Outcome']

df = pd.read_csv('pima-indians-diabetes.csv', names = columns)
df.shape


(768, 9)

In [3]:
#let's check if there are any na values in any column

df.isna().any() 

Pregnancies                 False
Glucose                     False
BloodPressure               False
SkinThickness               False
Insulin                     False
BMI                         False
DiabetesPedigreeFunction    False
Age                         False
Outcome                     False
dtype: bool

In [4]:
df[df == 0].head() # there are zeros in data let's replace them by the column mean

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,,,,,0.0,,,,
1,,,,,0.0,,,,0.0
2,,,,0.0,0.0,,,,
3,,,,,,,,,0.0
4,0.0,,,,,,,,


In [5]:
for col in df.columns[:-1]:
    df[col].replace(to_replace=0, value=np.mean(df[col]), inplace=True)
    
df.head(10) # replaces zero values in features by their column means

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6.0,148.0,72.0,35.0,79.799479,33.6,0.627,50,1
1,1.0,85.0,66.0,29.0,79.799479,26.6,0.351,31,0
2,8.0,183.0,64.0,20.536458,79.799479,23.3,0.672,32,1
3,1.0,89.0,66.0,23.0,94.0,28.1,0.167,21,0
4,3.845052,137.0,40.0,35.0,168.0,43.1,2.288,33,1
5,5.0,116.0,74.0,20.536458,79.799479,25.6,0.201,30,0
6,3.0,78.0,50.0,32.0,88.0,31.0,0.248,26,1
7,10.0,115.0,69.105469,20.536458,79.799479,35.3,0.134,29,0
8,2.0,197.0,70.0,45.0,543.0,30.5,0.158,53,1
9,8.0,125.0,96.0,20.536458,79.799479,31.992578,0.232,54,1


In [6]:
# let's separate the features from labels

X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values

X.shape, y.shape

((768, 8), (768,))

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 0)

X_train.shape

(537, 8)

In [8]:
# no need for label encoding for features and labels
# feature scaling

sc_X = StandardScaler()
sc_X.fit_transform(X_train)
sc_X.transform(X_test)

array([[-1.14521076,  2.51470546,  0.26160481, ...,  1.45696508,
         2.7617022 , -0.97315788],
       [-0.81178972, -0.50731433,  0.0937013 , ...,  0.12441006,
        -0.20149854, -0.88977097],
       [-0.14494764, -1.52560361, -0.91371974, ...,  0.18172425,
        -0.24040926, -0.72299714],
       ...,
       [-1.14521076, -0.90149083, -1.08162325, ..., -1.26545916,
        -0.63250956, -0.97315788],
       [-1.14521076, -0.54016237,  0.26160481, ...,  0.68322346,
        -0.82107688, -0.63961023],
       [ 1.8555786 , -0.01459371,  0.42950831, ..., -0.73530286,
         0.12175972,  0.94474111]])

Use 1 output node. Output 0 (<0.5) is considered class A and 1 (>=0.5) is considered class B (in case of sigmoid)

Use 2 output nodes. The input belongs to the class of the node with the highest value/probability (argmax).

In the second case you are probably writing about softmax activation function. If that's true, than the sigmoid is just a special case of softmax function. That's easy to show.

$$y=1/(1+e^x)=1/(1+1/e^x)=1/((e^x+1)/e^x)=e^x/(1+e^x)=e^x/(e^0+e^x)$$

As you can see sigmoid is the same as softmax. You can think that you have two outputs, but one of them has all weights equal to zero and therefore its output will be always equal to zero.

So the better choice for the binary classification is to use one output unit with sigmoid instead of softmax with two output units, because it will update faster.

In [0]:
#Importing some required Libraries
from keras.layers import BatchNormalization
from keras.callbacks import ModelCheckpoint
from keras.regularizers import l2

In [10]:
EPOCHS = 200
batch_size = 20
#I have saved the best weights, so anytime you want to make prediction, just load the
#weights from this file
chkp = ModelCheckpoint("model.best.hdf5",save_best_only=True,verbose=1)
def model_baseline():
    model = Sequential()
    model.add(Dense(input_dim = 8, units = 100, activation = 'relu',kernel_regularizer=l2(0.01)))
    model.add(BatchNormalization())
    model.add(Dropout(0.20))
    model.add(Dense(units = 100, activation = 'relu',kernel_regularizer=l2(0.01)))
    model.add(BatchNormalization())
    model.add(Dropout(0.20))

    model.add(Dense(units = 10, activation = 'relu',kernel_regularizer=l2(0.01)))
    model.add(BatchNormalization())
    model.add(Dropout(0.15))

    model.add(Dense(units = 1, activation = 'sigmoid'))
    model.compile(optimizer = 'adam', loss = 'binary_crossentropy', metrics = ['accuracy'])
    return model    

model = model_baseline()
model.fit(X_train,y_train,batch_size=32,epochs=400,verbose=1,validation_split=0.2,shuffle=True,callbacks=[chkp])     
#I have commented out your Model       
# stratified_kfold = StratifiedKFold(n_splits=10)
# classifier = KerasClassifier(build_fn = model_baseline, epochs = EPOCHS, batch_size = batch_size)
# classifier.fit(X_train, y_train)
''' As I did not implement your model,I am commenting out this, also'''            
# results = cross_val_score(model, X_train, y_train, cv = stratified_kfold, n_jobs = -1, scoring = 'accuracy')
# print(results)
# print()
# print('The mean accuracy is: {}%'.format(np.mean(results) * 100))
# print('The variance is: {}'.format(np.var(results)))
              

Train on 429 samples, validate on 108 samples
Epoch 1/400

Epoch 00001: val_loss improved from inf to 4.22775, saving model to model.best.hdf5
Epoch 2/400

Epoch 00002: val_loss improved from 4.22775 to 3.54162, saving model to model.best.hdf5
Epoch 3/400

Epoch 00003: val_loss improved from 3.54162 to 3.03610, saving model to model.best.hdf5
Epoch 4/400

Epoch 00004: val_loss improved from 3.03610 to 2.62225, saving model to model.best.hdf5
Epoch 5/400

Epoch 00005: val_loss improved from 2.62225 to 2.25857, saving model to model.best.hdf5
Epoch 6/400

Epoch 00006: val_loss improved from 2.25857 to 2.01902, saving model to model.best.hdf5
Epoch 7/400

Epoch 00007: val_loss improved from 2.01902 to 1.86005, saving model to model.best.hdf5
Epoch 8/400

Epoch 00008: val_loss improved from 1.86005 to 1.71316, saving model to model.best.hdf5
Epoch 9/400

Epoch 00009: val_loss improved from 1.71316 to 1.62238, saving model to model.best.hdf5
Epoch 10/400

Epoch 00010: val_loss improved from

' As I did not implement your model,I am commenting out this, also'

Tried my best to brute force every hyperparameter manually each time instead running a grid search because I don't think my laptop will get that done in minutes :D

P.S: this is not the best accuracy I got but I just stuck to this one.

In [17]:
model.load_weights("model.best.hdf5")
predictions_ = model.predict(X_test)
# prob >=0.5 --> 1
predictions = []
for index,pred in enumerate(predictions_):
  pred_ = (pred>=0.5).astype(int)
  predictions.append(pred_)

df_pred = pd.Series(predictions_.flatten())
df_actual = pd.Series(y_test)

print(df_pred.value_counts())
df_actual.value_counts()

0.627548    3
0.632007    1
0.009333    1
0.627607    1
0.018209    1
           ..
0.297281    1
0.562222    1
0.786942    1
0.006671    1
0.215698    1
Length: 229, dtype: int64


0    157
1     74
dtype: int64

In [18]:
cm = confusion_matrix(y_test, predictions)

cm_df = pd.DataFrame(data=cm, columns=[['predicted','predicted'], ['Not Diabetic','Diabetic']],
                    index=[['actual','actual'], ['Not Diabetic','Diabetic']])

cm_df

Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,Not Diabetic,Diabetic
actual,Not Diabetic,133,24
actual,Diabetic,26,48


In [19]:
test_accuracy = ((133+48)/y_test.shape[0]) * 100
print('The accuracy of predicting the test set is {}%'.format(test_accuracy))

The accuracy of predicting the test set is 78.35497835497836%


In [0]:
#Remarks: I actually achieved 79.6%, but I forgot the hyperparameters and also due to random Initializations.