# You are breaking my heart

This dataset contains information on 303 patients. Several medically relevant variables are available (age, sex, cholesterol, resting blood pressure...). Our task is to predict the presence of heart disease (column "target": 0 means healthy, 1 means sick).

This dataset is described in detail:

* on Kaggle datasets: https://www.kaggle.com/ronitf/heart-disease-uci
* on its original webpage: https://archive.ics.uci.edu/ml/datasets/Heart+Disease

I've downloaded a copy of the data and made it available at the following url:

In [None]:
#url of the csv copy of the heart disease dataset, read below with pandas
DATASET_URL = 'https://raw.githubusercontent.com/ne1s0n/coding_excercises/master/data/datasets_33180_43520_heart.csv'

# Battleplan

* load dataset
* a bit of data visualization/exploration
* a baseline classifier: logistic regression
* improve the data, improve the classifier
* a better NN classifier: let's add a layer
  * doing a proper crossvalidation


# Config

In [None]:
#let's fix right away the total number of training epochs: every model
#trained in this notebook is fitted for this many epochs overall
EPOCHS = 200

To get the same results at every run, we can fix the random seeds.

In [None]:
#general random seed (numpy global random generator)
from numpy.random import seed
seed(0)

#tensorflow-specific seed (global seed of tensorflow's random module)
import tensorflow
tensorflow.random.set_seed(0)

# Data

In [None]:
import pandas

#pandas can read a csv directly from a url: the whole dataset ends up
#in a single DataFrame, which we print to get a first look at it
heart_data = pandas.read_csv(DATASET_URL)
print(heart_data)

In [None]:
#splitting features and target: the selection is positional, the last
#column is taken as target and all the other columns as features
features = heart_data.iloc[:,:-1]
target = heart_data.iloc[:,-1]

In [None]:
#take a look at what we have done: column names, then the shapes
#of the two objects we just built
print(heart_data.columns)
print(features.shape)
print(target.shape) #beware of rank 1 arrays: target is a single column, so its shape has one entry only

## Train and Validation sets

In [None]:
#we want to have the same proportion of classes in both train and validation sets
from sklearn.model_selection import StratifiedShuffleSplit

#building a StratifiedShuffleSplit object (sss among friends) with 20% data
#assigned to validation set (here called "test"); random_state is fixed
#so that we always get the same split
sss = StratifiedShuffleSplit(n_splits=1, test_size=0.2, random_state=0)

#the .split() method returns (an iterable over) pairs of index lists which
#can be used to select the samples that go into train and validation sets.
#We asked for a single split (n_splits=1), so we just take the first pair
train_index, val_index = next(sss.split(features, target))

#the indexes are positions (not labels), so we select with .iloc both
#features and target: plain target[train_index] would be a label-based
#lookup, which is correct only as long as the index is the default 0..n-1
features_train = features.iloc[train_index, :]
features_val   = features.iloc[val_index, :]
target_train   = target.iloc[train_index]
target_val     = target.iloc[val_index]

#let's print some shapes to get an idea of the resulting data structure
print(features_train.shape)
print(features_val.shape)
print(target_train.shape)
print(target_val.shape)

# Baseline predictor: logistic regression

In [None]:
#we are building a "sequential" model, meaning that the data will
#flow like INPUT -> ELABORATION -> OUTPUT. In particular, we will
#not have any loops, i.e. our output will never be recycled as
#input for the first layer
from keras.models import Sequential

#a "dense" layer is a layer where all the data coming in are connected
#to all nodes. In our case there is only one node in the layer, and
#it receives all the features
from keras.layers import Dense

import keras.metrics

# 2-class logistic regression in Keras: a single sigmoid unit that
# takes as input one value per feature column
model = Sequential()
model.add(Dense(1, activation='sigmoid', input_dim=features_train.shape[1]))

#the model is declared, but we still need to compile it to actually
#build all the data structures. Besides the loss we also track
#accuracy and AUC (explicitly named 'auc')
model.compile(optimizer='rmsprop', loss='binary_crossentropy', 
    metrics=[
      keras.metrics.BinaryAccuracy(),
      keras.metrics.AUC(name='auc'),
    ])

In [None]:
#a first, short training (10 epochs): we also pass the validation set, so
#that the returned history contains the "val_" version of loss and metrics
history = model.fit(features_train, target_train, epochs=10, validation_data=(features_val, target_val))

In [None]:
#which losses/metrics have been recorded during training?
print(history.history.keys())

In [None]:
#function to take a look at losses and metrics evolution
import matplotlib.pyplot as plt
def plot_loss_history(h, title):
  """Plot the evolution over epochs of every loss/metric stored in h.

  One figure is drawn (and shown) for each entry of h.history whose name
  does not start with 'val_'. When the corresponding 'val_' entry is
  available it is added to the same figure.

  Args:
    h: object with a .history dict mapping metric names to per-epoch
      lists of values (e.g. what is returned by model.fit()).
    title: string used as prefix for each figure title.
  """
  for metric in h.history.keys():
    #ignoring metrics on validation set, which are implied when
    #plotting on training set
    if metric.startswith('val_'):
      continue

    #if we get here we found a metric on the training set,
    #let's plot it
    plt.plot(h.history[metric], label = "Train set")

    #the validation counterpart is missing when the training was done
    #without validation data: in that case we plot the train curve only
    #instead of failing with a KeyError
    val_metric = "val_" + metric
    if val_metric in h.history:
      plt.plot(h.history[val_metric], label = "Validation set")

    plt.xlabel('Epochs')
    plt.title(title + ' - ' + metric)
    plt.legend()
    plt.show()

In [None]:
#one figure per recorded loss/metric, train vs. validation
plot_loss_history(history, 'Logistic (10 epochs)')

In [None]:
#keep training the same model for the remaining epochs (10 were already
#done above), putting verbose to 0 to avoid filling the screen
history2 = model.fit(features_train, target_train, epochs=(EPOCHS - 10), 
                     validation_data=(features_val, target_val), verbose=0)

In [None]:
#gluing the second training run after the first one, metric by metric,
#so that `history` describes the whole training
for metric_name, values in history.history.items():
  values.extend(history2.history[metric_name])

#and plotting again, this time over all the epochs
plot_loss_history(history, f'Logistic ({EPOCHS} epochs)')

# Improvement: data normalization

In [None]:
#per-feature average and standard deviation, computed on the train set
avg = features_train.mean()
std = features_train.std()

#a quick look at both
print('Feature means', avg, sep='\n')
print('\nFeature standard deviations', std, sep='\n')

In [None]:
#normalizing features, using the same weights (mean and standard
#deviation computed on the train set) for both train and validation
#sets. Beware: the variables are overwritten, so running this cell
#twice normalizes the data twice
features_train = (features_train - avg)/std
features_val = (features_val - avg)/std

# Improvement: class balancing

In [None]:
#how many samples do we have for each class in the train set?
#target is 0/1, so summing it counts the diseased samples
N_DISEASE = target_train.sum()
N_TOT = target_train.shape[0]
N_NORMAL = N_TOT - N_DISEASE

print(f'Total samples: {N_TOT} (normal:{N_NORMAL} diseases:{N_DISEASE})')

In [None]:
#see https://www.tensorflow.org/api_docs/python/tf/keras/Model#fit and
#watch for the class_weight argument

#handy function, to be reused later
def get_weights(target):
  """Compute class weights for a binary (0/1) target.

  Each class is weighted by the inverse of its frequency in `target`
  (total number of samples / number of samples in the class), so that
  the less represented class weighs more.

  Args:
    target: one-dimensional collection of 0/1 labels exposing .sum()
      and .shape (e.g. a pandas Series or a numpy array).

  Returns:
    A dict {0: weight_for_0, 1: weight_for_1}.

  Raises:
    ValueError: if one of the two classes has no samples in `target`.
  """
  #counting the number of instances for each class. The counts must come
  #from the `target` argument (and not from a global variable), otherwise
  #the weights are wrong whenever the function is called on a different
  #split, e.g. on crossvalidation folds
  n_samples = target.shape[0]
  class_1_cnt = target.sum()
  class_0_cnt = n_samples - class_1_cnt

  #without samples in both classes the inverse frequency is undefined
  if class_0_cnt <= 0 or class_1_cnt <= 0:
    raise ValueError('both classes must be present to compute class weights')

  #we get weights as inverse of class ratios
  weight_for_0 = n_samples / class_0_cnt
  weight_for_1 = n_samples / class_1_cnt

  #and we are done
  return {0: weight_for_0, 1: weight_for_1}


#let's invoke the function right away, on the train set
class_weight = get_weights(target_train)
print('Computed weights:' + str(class_weight))

# Improvement: better model

In [None]:
#let's keep it simple: adding a Dense ReLU layer

#a small neural network: one hidden layer with 10 ReLU units, followed
#by a single sigmoid output unit (the same output used by the logistic
#regression above)
model2 = Sequential()
model2.add(Dense(10, activation='relu', input_dim=features_train.shape[1]))
model2.add(Dense(1, activation='sigmoid'))

#the model is declared, but we still need to compile it to actually
#build all the data structures. The AUC metric is explicitly named 'auc',
#as done for the other models in this notebook: without a name Keras
#picks one automatically, and the keys in the training history may then
#differ from 'auc'/'val_auc' (e.g. carry a numeric suffix)
model2.compile(optimizer='rmsprop', loss='binary_crossentropy',
    metrics=[
      keras.metrics.BinaryAccuracy(),
      keras.metrics.AUC(name='auc'),
    ])

# Train again!

In [None]:
#train with normalized data, class weights, improved model, same epochs
#(verbose=0 keeps the output clean, we will plot the history afterwards)
history_m2 = model2.fit(
    features_train, target_train, 
    epochs=EPOCHS, 
    validation_data=(features_val, target_val), 
    class_weight = class_weight,
    verbose=0)

In [None]:
#take a look at our results: one figure per recorded loss/metric
plot_loss_history(history_m2, 'Improved NN')

In [None]:
#a direct comparison of loss functions, two panels side by side:
#train set loss on the left, validation set loss on the right
panels = [
  ('loss', 'Train set loss'),
  ('val_loss', 'Validation set loss'),
]

for position, (loss_key, panel_title) in enumerate(panels, start=1):
  plt.subplot(1, 2, position)
  plt.plot(history.history[loss_key], label='Model 1')
  plt.plot(history_m2.history[loss_key], label='Model 2')
  plt.title(panel_title)
  plt.legend()
  plt.xlabel('Epochs')

plt.show()

In [None]:
#final (last epoch) metrics on the validation set: for each metric
#the first number is for model 1, the second for model 2
print('Binary accuracy on validation set')
for run in (history, history_m2):
  print(run.history['val_binary_accuracy'][-1])

print('\nLoss on validation set')
for run in (history, history_m2):
  print(run.history['val_loss'][-1])

# Improvement: fine tuning number of units

## Train, validation and TEST sets!

<img src="https://drive.google.com/uc?id=1-9uP7NfHGUx-TtZzKcil3W6ehx28YRCI" width=600/>

In [None]:
#support function to have a leaner code down below:
#input: train set, test set, number of units in the hidden layer
#output: train history object

def train_NN(feat_tr, feat_val, tar_tr, tar_val, n_units):
    """Build and train a network with one hidden ReLU layer.

    The model has a hidden Dense layer with n_units ReLU units followed
    by a single sigmoid output unit. It is trained for EPOCHS epochs on
    (feat_tr, tar_tr), using the class weights returned by
    get_weights(tar_tr), and evaluated on (feat_val, tar_val).

    Args:
        feat_tr: features used for training (columns are features).
        feat_val: features used for evaluation.
        tar_tr: target used for training.
        tar_val: target used for evaluation.
        n_units: number of units in the hidden layer.

    Returns:
        The training history object returned by fit().
    """
    #the network expects one input per feature column
    n_features = feat_tr.shape[1]

    #hidden layer first, then the output unit
    net = Sequential()
    net.add(Dense(units = n_units, activation='relu', input_dim=n_features))
    net.add(Dense(1, activation='sigmoid'))

    #compiling is required to actually build the model; we track
    #accuracy and AUC on top of the loss
    tracked_metrics = [
        keras.metrics.BinaryAccuracy(),
        keras.metrics.AUC(name='auc'),
    ]
    net.compile(optimizer='rmsprop', loss='binary_crossentropy',
                metrics=tracked_metrics)

    #class weights are computed on the training target of this split
    fit_options = {
        'epochs': EPOCHS,
        'validation_data': (feat_val, tar_val),
        'class_weight': get_weights(tar_tr),
        'verbose': 0,
    }

    #silent training, the caller gets the history
    return net.fit(feat_tr, tar_tr, **fit_options)

In [None]:
#the train set is now further split to do a proper crossvalidation. We use
#scikit-learn again, this time through a StratifiedKFold object, which gives
#us the indexes of the samples for each fold
#https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html#sklearn.model_selection.StratifiedKFold
from sklearn.model_selection import StratifiedKFold

skf = StratifiedKFold(n_splits = 5)

#the number of hidden units we want to compare
layer_units = [2, 4, 8, 16, 32]

#all the training histories end up here, indexed by fold and then by
#number of units
all_histories = {}

#as done above, .split() returns (an iterable over) pairs of index lists,
#one pair per fold; folds are numbered starting from 1
for fold, (train_index_cv, test_index_cv) in enumerate(
        skf.split(features_train, target_train), start=1):
    #train and test data for this fold
    features_train_cv = features_train.iloc[train_index_cv, :]
    features_test_cv  = features_train.iloc[test_index_cv, :]
    target_train_cv   = target_train.iloc[train_index_cv]
    target_test_cv    = target_train.iloc[test_index_cv]

    #user interface
    print(f'Doing fold {fold}')

    #histories for this fold, one per considered number of units
    all_histories[fold] = {}
    for lu in layer_units:
      #user interface
      print(f' - training with lu={lu}')

      #training the network, storing the training history
      all_histories[fold][lu] = train_NN(
          features_train_cv, features_test_cv,
          target_train_cv, target_test_cv, lu)


In [None]:
#all_histories is a bit messy, order of indexing is:
#fold -> number of units -> actual history object
#let's take a look at all these indexes: the folds, the numbers of units
#(for the first fold) and the recorded metrics (first fold, 16 units)
print(all_histories.keys())
print(all_histories[1].keys())
print(all_histories[1][16].history.keys())

In [None]:
#for each considered number of units we have one execution per fold
#let's take a look at one
lu = 16

#one curve per fold: looping over the stored folds (instead of listing
#them one by one) keeps the plot correct if the number of folds changes
for fold_id, fold_histories in all_histories.items():
  plt.plot(fold_histories[lu].history['val_loss'], label = 'Fold ' + str(fold_id))

plt.xlabel('Epochs')
plt.title('Test set Loss for ' + str(lu) + ' units')
plt.legend()
plt.show()

In [None]:
#we are no longer interested in the whole history: for each training
#we keep the metrics of the last epoch only, and then average over folds

#one row per (number of units, fold) pair
rows = []
for lu in layer_units:
  for fold in all_histories.keys():
    run = all_histories[fold][lu].history
    rows.append([
      lu,
      run['val_loss'][-1],
      run['val_binary_accuracy'][-1],
      run['val_auc'][-1],
    ])

#converting to pandas
df = pandas.DataFrame(rows, columns=["LU", "loss", "accuracy", "AUC"])

#average over folds: one row per number of units
by_units = df.groupby(df['LU'])
df = by_units.aggregate('mean')

#and take a look at the numbers
print(df)

In [None]:
#the winner is the number of units with the lowest average loss
#(df is indexed by number of units)
LU_selected = df['loss'].idxmin()
print(f'Best performance with {LU_selected} units in the hidden layer')

In [None]:
#last training: the selected number of units, the full train set, and
#the validation set used for evaluation
history_m3 = train_NN(features_train, features_val,
                      target_train, target_val,
                      LU_selected)

#final validation loss for the three models, in order: logistic
#regression, first neural network, neural network with selected units
for run in (history, history_m2, history_m3):
  print(run.history['val_loss'][-1])