In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_california_housing
%matplotlib inline

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import pathlib

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import seaborn as sns

import util

import tensorflow as tf

from tensorflow import keras
from tensorflow.keras import layers

# Print the TensorFlow version so the environment is recorded next to the results.
print(tf.__version__)

2.1.0


In [3]:
# Load the clinical-records CSV (path is relative to the notebook's working directory).
DATA_CSV = 'heart_failure_clinical_records_dataset.csv'
df = pd.read_csv(DATA_CSV)

Train/Test Split

In [4]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The fixed random_state makes the
# split, and therefore every metric computed below, reproducible after a
# kernel restart instead of changing on each run.
train, test = train_test_split(df, test_size=0.2, random_state=42)

In [5]:
# Preview the first five rows of the training split.
train.head(n=5)

Unnamed: 0,age,anaemia,creatinine_phosphokinase,diabetes,ejection_fraction,high_blood_pressure,platelets,serum_creatinine,serum_sodium,sex,smoking,time,DEATH_EVENT
241,65.0,0,582,1,30,0,249000.0,1.3,136,1,1,212,0
215,73.0,0,582,0,35,1,203000.0,1.3,134,1,0,195,0
287,45.0,0,582,1,55,0,543000.0,1.0,132,0,0,250,0
30,94.0,0,582,1,38,1,263358.03,1.83,134,1,0,27,1
258,45.0,1,66,1,25,0,233000.0,0.8,135,1,0,230,0


# Create Model 

In [6]:
import models

# Initial bias = log(pos / neg) from the label counts of the full dataset.
# The two-value unpacking assumes DEATH_EVENT only contains the labels 0 and 1.
# Presumably create_model uses this to initialise the output-layer bias --
# confirm in models.py.
neg, pos = np.bincount(df['DEATH_EVENT'])  # samples with label 0, label 1
initial_bias = np.log([pos/neg])

# Count the feature columns by excluding the label explicitly. Unlike
# `len(train.keys()) - 1`, this stays correct when the cell is re-run after
# a later cell has already removed DEATH_EVENT from `train`.
n_features = len(train.columns.drop('DEATH_EVENT', errors='ignore'))

# NOTE(review): the recorded summary shows a 2-unit output layer, and the
# recorded fit fails because a single-column target is passed with
# binary_crossentropy. The output size is decided inside models.create_model;
# verify it there.
mobj = models.modelObj()
model = mobj.create_model(n_features, bias = initial_bias)

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 16)                208       
_________________________________________________________________
dropout (Dropout)            (None, 16)                0         
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 34        
Total params: 242
Trainable params: 242
Non-trainable params: 0
_________________________________________________________________


# Normalize Data

In [7]:
# Separate labels from features without mutating `train` / `test`, so this
# cell can be re-run safely (DataFrame.pop would raise KeyError the second time).
train_y = train['DEATH_EVENT']
train_x = util.norm(train.drop(columns='DEATH_EVENT'))

# NOTE(review): the test features are scaled by a separate util.norm call.
# If util.norm fits its scaling statistics on the frame it receives, train
# and test end up on different scales -- confirm against util.py.
test_y = test['DEATH_EVENT']
test_x = util.norm(test.drop(columns='DEATH_EVENT'))

# Train Model

In [8]:
# Stop training once 'val_auc' has failed to improve for 10 epochs and roll
# the model back to the best weights seen.
# Assumes the model is compiled with a metric named 'auc' -- confirm in models.py.
early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=10,
    restore_best_weights=True,
    verbose=1,
)

In [9]:
# Preview the first five rows of the normalised training features.
train_x.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,0.454545,0.0,0.071319,1.0,0.272727,0.0,0.271427,0.089888,0.657143,1.0,1.0,0.751825
1,0.6,0.0,0.071319,0.0,0.363636,1.0,0.215663,0.089888,0.6,1.0,0.0,0.689781
2,0.090909,0.0,0.071319,1.0,0.727273,0.0,0.627834,0.05618,0.542857,0.0,0.0,0.890511
3,0.981818,0.0,0.071319,1.0,0.418182,1.0,0.288833,0.149438,0.6,1.0,0.0,0.076642
4,0.090909,1.0,0.005486,1.0,0.181818,0.0,0.252031,0.033708,0.628571,1.0,0.0,0.817518


In [10]:
# Show which label values occur in the training targets.
print(np.unique(train_y))

# Train on the non-resampled data, holding back 20% of it for validation.
# NOTE(review): the recorded run of this cell raised a ValueError because the
# model output has 2 units while the targets have a single column; the output
# layer is defined in models.create_model, so the fix belongs there.
non_resampled_history = model.fit(
    x=train_x,
    y=train_y,
    validation_split=0.2,
    epochs=mobj.EPOCHS,
    batch_size=mobj.BATCH_SIZE,
    callbacks=[early_stopping, models.printDot()],
)

[0 1]


ValueError: A target array with shape (239, 1) was passed for an output of shape (None, 2) while using as loss `binary_crossentropy`. This loss expects targets to have the same shape as the output.

In [None]:
# Plot the training history of the non-resampled run (plotting logic lives in util.plot_metrics).
util.plot_metrics(non_resampled_history)

# Evaluate the Model

In [None]:
# Score the held-out features with the model trained on the non-resampled data.
test_predictions_no_r = model.predict(
    test_x,
    batch_size=mobj.BATCH_SIZE,
)

In [None]:
from sklearn.metrics import balanced_accuracy_score
from sklearn.metrics import confusion_matrix

# balanced_accuracy_score and confusion_matrix require discrete class labels,
# but model.predict returns the network's raw output scores. Convert first:
# argmax across units for a multi-unit output, otherwise a 0.5 cut-off
# (assumes a single sigmoid probability -- confirm against models.py).
raw_scores_no_r = np.asarray(test_predictions_no_r)
if raw_scores_no_r.ndim > 1 and raw_scores_no_r.shape[-1] > 1:
    test_labels_no_r = raw_scores_no_r.argmax(axis=-1)
else:
    test_labels_no_r = (raw_scores_no_r.ravel() > 0.5).astype(int)

bscore_no_res = balanced_accuracy_score(test_y, test_labels_no_r)

# Confusion-matrix heatmap, titled with the balanced accuracy.
cm = confusion_matrix(test_y, test_labels_no_r)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Balanced Accuracy Score: {0}'.format(bscore_no_res)
plt.title(all_sample_title, size = 15);

In [None]:
# Compute the compiled loss/metrics on the test split and print one per line.
no_res_results = model.evaluate(
    test_x,
    test_y,
    batch_size=mobj.BATCH_SIZE,
    verbose=0,
)
for metric_name, metric_value in zip(model.metrics_names, no_res_results):
    print(metric_name, ': ', metric_value)
print()

# Undersampling (NearMiss-1)

In [None]:
# Preview the first five rows of the full dataset before resampling.
df.head(n=5)

In [None]:
# Summary statistics of the full dataset, for comparison with the resampled frame.
df.describe()

# Apply Resampling to Dataset

In [None]:
from imblearn.under_sampling import NearMiss

# Work on a deep copy so popping the label does not alter `df`.
raw_df = df.copy(deep=True)

y = raw_df.pop('DEATH_EVENT')
X = raw_df

nm1 = NearMiss(version=1)

# fit_resample is the supported sampler entry point; fit_sample was a
# deprecated alias that newer imbalanced-learn releases no longer provide.
X_resampled, y_resampled = nm1.fit_resample(X, y)

# Rebuild a single frame of resampled features plus label. The label is
# assigned positionally (as an array) so it lines up with the resampled rows
# whether the sampler returns NumPy arrays or pandas objects.
nm1_df = pd.DataFrame(X_resampled, columns=X.columns)
nm1_df['DEATH_EVENT'] = np.asarray(y_resampled)
nm1_df.head()

In [None]:
# Summary statistics of the NearMiss-resampled frame.
nm1_df.describe()

# Train/Test Split

In [None]:
# Hold out 20% of the NearMiss-resampled rows; the fixed random_state keeps
# the split reproducible across runs.
train, test = train_test_split(nm1_df, test_size=0.2, random_state=42)

# Create New Model 

In [None]:
# Create Model

import models

# Initial bias = log(pos / neg) from the label counts of the resampled frame.
# The two-value unpacking assumes DEATH_EVENT only contains the labels 0 and 1.
neg, pos = np.bincount(nm1_df['DEATH_EVENT'])  # samples with label 0, label 1
initial_bias = np.log([pos/neg])

# Count the feature columns by excluding the label explicitly. Unlike
# `len(train.keys()) - 1`, this stays correct when the cell is re-run after
# DEATH_EVENT has already been removed from `train`.
n_features = len(train.columns.drop('DEATH_EVENT', errors='ignore'))

# Fresh model object so no weights carry over from the previous experiment.
mobj = models.modelObj()
model = mobj.create_model(n_features, bias = initial_bias)

model.summary()

# Normalize Data

In [None]:
# Normalize Data

# Separate labels from features without mutating `train` / `test`, so this
# cell can be re-run safely (DataFrame.pop would raise KeyError the second time).
train_y = train['DEATH_EVENT']
train_x = util.norm(train.drop(columns='DEATH_EVENT'))

# NOTE(review): the test features are scaled by a separate util.norm call.
# If util.norm fits its scaling statistics on the frame it receives, train
# and test end up on different scales -- confirm against util.py.
test_y = test['DEATH_EVENT']
test_x = util.norm(test.drop(columns='DEATH_EVENT'))

# Only the last expression of a cell is rendered, so preview once, here.
train_x.head()

In [None]:
# Train on the NearMiss-resampled data, holding back 20% of it for validation.
nm1_resampled_history = model.fit(
    x=train_x,
    y=train_y,
    validation_split=0.2,
    epochs=mobj.EPOCHS,
    batch_size=mobj.BATCH_SIZE,
    callbacks=[early_stopping, models.printDot()],
)

In [None]:
# Plot the training history of the NearMiss run (plotting logic lives in util.plot_metrics).
util.plot_metrics(nm1_resampled_history)

# Evaluate Model

In [None]:
# Score the held-out features with the model trained on NearMiss-resampled data.
test_predictions_nm1 = model.predict(test_x, batch_size=mobj.BATCH_SIZE)

# balanced_accuracy_score and confusion_matrix require discrete class labels,
# but model.predict returns the network's raw output scores. Convert first:
# argmax across units for a multi-unit output, otherwise a 0.5 cut-off
# (assumes a single sigmoid probability -- confirm against models.py).
raw_scores_nm1 = np.asarray(test_predictions_nm1)
if raw_scores_nm1.ndim > 1 and raw_scores_nm1.shape[-1] > 1:
    test_labels_nm1 = raw_scores_nm1.argmax(axis=-1)
else:
    test_labels_nm1 = (raw_scores_nm1.ravel() > 0.5).astype(int)

# Stored under its own name so the non-resampled score is not overwritten.
bscore_nm1 = balanced_accuracy_score(test_y, test_labels_nm1)

# Confusion-matrix heatmap, titled with the balanced accuracy.
cm = confusion_matrix(test_y, test_labels_nm1)
plt.figure(figsize=(9,9))
sns.heatmap(cm, annot=True, fmt=".3f", linewidths=.5, square = True, cmap = 'Blues_r');
plt.ylabel('Actual label');
plt.xlabel('Predicted label');
all_sample_title = 'Balanced Accuracy Score: {0}'.format(bscore_nm1)
plt.title(all_sample_title, size = 15);

nm1_results = model.evaluate(test_x, test_y,
                                  batch_size=mobj.BATCH_SIZE, verbose=0)
# Print this run's metrics; the loop previously iterated over the
# non-resampled results by mistake.
for name, value in zip(model.metrics_names, nm1_results):
  print(name, ': ', value)
print()

# OneSidedSelection

In [None]:
# Start the next experiment from a fresh, independent copy of the full dataset.
raw_df = df.copy()
raw_df.head(n=5)

Using the balanced batch generator from imbalanced-learn

In [None]:
# Split the fresh copy of the original data, not the NearMiss-resampled frame:
# this experiment takes its initial bias from raw_df and lets the batch
# generator do its own undersampling, so starting from nm1_df would stack two
# resamplers. The fixed random_state keeps the split reproducible.
train, test = train_test_split(raw_df, test_size=0.2, random_state=42)

In [None]:
# Initial bias = log(pos / neg) from the label counts in raw_df.
# The two-value unpacking assumes DEATH_EVENT only contains the labels 0 and 1.
class_counts = np.bincount(raw_df['DEATH_EVENT'])
neg, pos = class_counts
initial_bias = np.log([pos / neg])

# Normalize Data

In [None]:
# Create Model
# Separate labels from features without mutating `train` / `test`, so the
# cell can be re-run (DataFrame.pop would raise KeyError the second time).
y_train = train['DEATH_EVENT']
X_train = train.drop(columns='DEATH_EVENT')
y_test = test['DEATH_EVENT']
X_test = test.drop(columns='DEATH_EVENT')

# Size the input layer from the features actually fed to the model. The
# earlier `len(X.keys()) - 1` read `X` left over from the NearMiss cell,
# which already had the label removed, so it came out one feature short.
mobj = models.modelObj()
model = mobj.create_model(len(X_train.columns), bias = initial_bias)

In [None]:
# Normalise the feature frames for this experiment.
# NOTE(review): train and test are scaled by separate util.norm calls --
# confirm in util.py that this gives both the same scale.
X_train = util.norm(X_train)
X_test = util.norm(X_test)

# Preview the frame just produced; `train_x` belongs to the previous
# experiment and would show stale data here.
X_train.head()

In [None]:
from imblearn.keras import balanced_batch_generator
from imblearn.under_sampling import OneSidedSelection

# Build a generator that yields batches drawn from the data after the
# OneSidedSelection sampler has been applied; it also returns the number of
# batches that make up one epoch.
training_generator, steps_per_epoch = balanced_batch_generator(
    X_train, y_train, sampler=OneSidedSelection(random_state=0), batch_size=mobj.BATCH_SIZE, random_state=42)

# Model.fit accepts generators directly; fit_generator is deprecated in
# TensorFlow 2.1 (the version this notebook reports) and removed later.
callback_history_oss = model.fit(training_generator,
                                 steps_per_epoch=steps_per_epoch,
                                 epochs=mobj.EPOCHS, verbose=0)