# Death Event recognizer using SMOTE

## Importing Project Dependencies

In [1]:
import tensorflow as tf
from tensorflow.keras import layers

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib import style

import sklearn
from sklearn.metrics import confusion_matrix


import os
import tempfile
import seaborn as sns

## Data pre-processing

In [2]:
# Load the clinical-records CSV into a DataFrame (path is relative to the notebook).
data = pd.read_csv('../input/heart-failure-clinical-data/heart_failure_clinical_records_dataset.csv')
# Show the first rows as the cell output.
data.head()

In [3]:
# For every column, print how many entries are null (True) and non-null (False).
for column_name in data.columns:
    null_flags = data[column_name].isnull()
    print(null_flags.value_counts())

In [4]:
# Per-column summary statistics, shown as the cell output.
data.describe()

### Pre-processing steps
1. Remove unwanted attributes
2. Take logarithmic value of some attributes to reduce the variance
3. Standardize the data for better model performance

In [5]:
def normalize_continuous_vals(data1, exclude, logrized_vals, pop, thresh=0.001):
    """Drop, log-transform and standardize columns of a DataFrame.

    Args:
        data1: source pandas DataFrame. It is never modified; all work is
            done on a copy.
        exclude: column names that skip the standardization step (a column
            listed both here and in ``logrized_vals`` keeps its logged value).
        logrized_vals: column names replaced by ``np.log(column) + thresh``.
        pop: column names removed before any other step.
        thresh: constant added to the logarithm.
            NOTE(review): it is added *after* the log, so it does not guard
            against ``log(0)`` and cancels out once the column is
            standardized -- confirm whether ``np.log(column + thresh)`` was
            intended.

    Returns:
        A new DataFrame holding the transformed columns.
    """
    # Work on a copy: without it, an empty `pop` left `data1` aliased to the
    # caller's frame and the assignments below overwrote the caller's data.
    result = data1.copy()

    for column in pop:
        result = result.drop(column, axis=1)

    for column in logrized_vals:
        result[column] = np.log(result[column]) + thresh

    for column in result.columns:
        if column in exclude:
            continue
        # z-score: zero mean, unit (sample) standard deviation
        result[column] = (result[column] - result[column].mean()) / result[column].std()

    return result

# Columns left un-standardized: the 0/1 indicator features and the target.
UNSCALED_COLUMNS = ['anaemia', 'diabetes', 'high_blood_pressure', 'sex', 'smoking', 'DEATH_EVENT']
# Columns that are log-transformed before standardization.
LOG_COLUMNS = ['creatinine_phosphokinase', 'platelets', 'serum_sodium']
# Columns removed from the feature set altogether.
DROPPED_COLUMNS = ['time']

n_data = normalize_continuous_vals(data, UNSCALED_COLUMNS, LOG_COLUMNS, DROPPED_COLUMNS)

In [6]:
# Recompute the standardized log of serum_sodium by hand. The result of this
# expression is not the last statement of the cell, so it is discarded.
log_sodium = np.log(data['serum_sodium']) + 0.001
(log_sodium - log_sodium.mean()) / log_sodium.std()
# Preview of the pre-processed frame (this is what the cell displays).
n_data.head()

In [7]:
# Features: every pre-processed column except the target, as float32.
feature_frame = n_data.drop(['DEATH_EVENT'], axis=1)
X = tf.cast(np.array(feature_frame), tf.float32)
# Target: the DEATH_EVENT column as int32.
target_values = np.array(n_data['DEATH_EVENT'])
y = tf.cast(target_values, tf.int32)
X.shape

In [8]:
def input_pipeline(X, y, batch_size, shuffle_buffer, split_ratio):
    """
    Build batched, prefetched training and validation datasets.

    args:-
        X: Feature values -> [tf.Tensor, np.array]
        y: Corresponding labels -> [tf.Tensor, np.array]
        batch_size: batch size used for both datasets
        shuffle_buffer: buffer to consider while shuffling
        split_ratio: fraction (0-1) of the samples held out for validation

    return:-
        train_ds: prefetched training dataset
        eval_ds: prefetched validation dataset
    """

    split_index = len(X) - int(len(X) * split_ratio)

    ds = tf.data.Dataset.from_tensor_slices((X, y))
    # The shuffle that decides the split must give the same order on every
    # pass. With the default reshuffle_each_iteration=True, take()/skip() cut
    # a *different* train/validation split each time the datasets are
    # iterated, leaking validation rows into training and misaligning labels
    # with predictions gathered in separate passes.
    ds = ds.shuffle(shuffle_buffer, reshuffle_each_iteration=False)

    # Re-shuffle only the training part so batches still vary between epochs.
    train_ds = ds.take(split_index).shuffle(shuffle_buffer).batch(batch_size)
    eval_ds = ds.skip(split_index).batch(batch_size)

    return (train_ds.prefetch(1), eval_ds.prefetch(1))

Now let us consider the distribution of the classes

In [9]:
style.use('ggplot')
# value_counts() orders by frequency, not by label; sort by label and take
# the tick names from the index so each bar shows the count of its own class.
class_counts = data['DEATH_EVENT'].value_counts().sort_index()
plt.bar([str(label) for label in class_counts.index], list(class_counts), color='c')
plt.grid(False)
plt.show()

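As seen in the graph, the class distribution is skewed. To resolve this issue we will use an oversampling method known as the `Synthetic Minority Oversampling Technique (SMOTE)`. Note that the resampling implemented further below balances the classes by repeatedly drawing existing minority-class rows (`sample_from_datasets`); it does not interpolate new synthetic samples as canonical SMOTE does.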

In [10]:
# Look the counts up by class label. value_counts() is ordered by frequency,
# so positional unpacking only works while class 0 happens to be the majority.
class_counts = data['DEATH_EVENT'].value_counts()
neg_samples = int(class_counts.loc[0])
pos_samples = int(class_counts.loc[1])
# Log-odds of the positive class, used later as the output-layer bias.
initial_bias = tf.math.log(pos_samples / neg_samples)
initial_bias

In [11]:
def build_model(metrics, name, out_bias=None, input_dim=11):
    """Build and compile the dense binary classifier.

    Args:
        metrics: list of Keras metrics passed to ``compile``.
        name: name given to the ``tf.keras.Model``.
        out_bias: optional constant used to initialize the bias of the
            sigmoid output unit; when omitted the bias starts at zero.
        input_dim: number of input features (defaults to 11, the width used
            throughout this notebook).

    Returns:
        A compiled ``tf.keras.Model`` with binary cross-entropy loss and the
        Adam optimizer.
    """
    if out_bias is not None:
        bias_initializer = tf.keras.initializers.Constant(out_bias)
    else:
        # Name the Dense layer's documented default explicitly instead of
        # forwarding None as the initializer.
        bias_initializer = 'zeros'

    inputs = layers.Input(shape=(input_dim,))
    x = layers.Dense(2048, activation='relu')(inputs)
    x = layers.Dense(1024, activation='relu')(x)
    x = layers.Dropout(0.3)(x)
    x = layers.Dense(512, activation='relu')(x)
    x = layers.Dropout(0.2)(x)

    outputs = layers.Dense(1, activation='sigmoid', bias_initializer=bias_initializer)(x)

    model = tf.keras.Model(inputs=[inputs], outputs=[outputs], name=name)

    model.compile(metrics=metrics, loss=tf.keras.losses.BinaryCrossentropy(),
                  optimizer=tf.keras.optimizers.Adam())

    return model

# Metric objects handed to every model built in this notebook.
METRICS = [
    # raw confusion-matrix counts
    tf.keras.metrics.TruePositives(name='tp'),
    tf.keras.metrics.FalsePositives(name='fp'),
    tf.keras.metrics.TrueNegatives(name='tn'),
    tf.keras.metrics.FalseNegatives(name='fn'),
    # threshold-based scores
    tf.keras.metrics.BinaryAccuracy(name='accuracy'),
    tf.keras.metrics.Precision(name='precision'),
    tf.keras.metrics.Recall(name='recall'),
    # area under the ROC curve and under the precision-recall curve
    tf.keras.metrics.AUC(name='auc'),
    tf.keras.metrics.AUC(name='prc', curve='PR'),
]

### To see the effect of SMOTE, we first establish a baseline that we will then try to improve on!

This is a raw model that does not account for the data skew; let's analyze its performance.

In [12]:
# Model built without an explicit output bias.
raw_model = build_model(METRICS, 'raw_model')
raw_model.summary()

This model has the bias of its output layer initialized according to the skew ratio (the log of positive over negative sample counts).

In [13]:
# Same architecture, with the output bias initialized to `initial_bias`.
baseline_model = build_model(METRICS, 'baseline_model', initial_bias)
baseline_model.summary()

In [14]:
# Build the training/validation datasets, holding out 20% for validation.
train_ds, eval_ds = input_pipeline(
    X,
    y,
    batch_size=2048,
    shuffle_buffer=299,
    split_ratio=0.2,
)

In [15]:
# Loss and metrics of the not-yet-fitted raw model on the training dataset.
raw_model.evaluate(train_ds)

In [16]:
# Same evaluation for the not-yet-fitted model whose output bias was preset.
baseline_model.evaluate(train_ds)

As can be seen from the two cases above, merely initializing the model's output bias according to the skew ratio reduces the loss to almost half of its original value.

In [17]:
# Store the bias-initialized weights in a fresh temporary directory so that
# later models can be loaded from the same starting point.
weights_dir = tempfile.mkdtemp()
initial_weights = os.path.join(weights_dir, 'initial_weights')
baseline_model.save_weights(initial_weights)

### Performance of a baseline model

In [18]:
# Rebuild the baseline and load the saved initial weights into it.
baseline_model = build_model(METRICS, 'baseline_model')
baseline_model.load_weights(initial_weights)

# Stop once validation recall has not improved for 10 epochs and roll back
# to the best weights seen.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_recall',
    verbose=1,
    patience=10,
    mode='max',
    restore_best_weights=True,
)

baseline_history = baseline_model.fit(
    train_ds,
    validation_data=eval_ds,
    epochs=100,
    callbacks=[early_stopping],
)

In [19]:
def plot_metrics(history):
    """Plot training and validation curves of selected metrics.

    Draws a 2x2 grid with one subplot each for 'loss', 'prc', 'precision'
    and 'recall'; the training curve is solid and the validation curve
    ('val_' + metric) is dashed.

    Args:
        history: object returned by ``model.fit``; its ``epoch`` list and
            ``history`` dict must contain the metrics above and their
            ``val_`` counterparts.
    """
    plt.figure(figsize=(15, 10))
    metrics = ['loss', 'prc', 'precision', 'recall']
    colors = plt.rcParams['axes.prop_cycle'].by_key()['color']
    for n, metric in enumerate(metrics):
        name = metric.replace("_", " ").capitalize()
        plt.subplot(2, 2, n + 1)
        plt.plot(history.epoch, history.history[metric], color=colors[0], label='Train')
        plt.plot(history.epoch, history.history['val_' + metric],
                 color=colors[0], linestyle="--", label='Val')
        plt.xlabel('Epoch')
        plt.ylabel(name)
        if metric == 'loss':
            # keep the automatic upper limit, anchor the axis at zero
            plt.ylim([0, plt.ylim()[1]])
        elif metric == 'auc':
            plt.ylim([0.8, 1])
        else:
            plt.ylim([0, 1])

        # Inside the loop so that every subplot gets its own legend; placed
        # after the loop it only labelled the last subplot.
        plt.legend()

In [20]:
# Training curves of the baseline model.
plot_metrics(baseline_history)

In [21]:
def plot_confusion_matrix(labels, predictions, p=0.5):
    """Draw the confusion matrix of thresholded predictions and print its cells.

    Args:
        labels: iterable of ``(features, labels)`` batches (e.g. the
            validation dataset). The true labels of *all* batches are used.
            It is iterated here, separately from whatever produced
            ``predictions``, so it must yield the samples in the same order
            on every pass.
        predictions: predicted probabilities, one per sample, in the same
            order as the labels.
        p: decision threshold; a prediction above it counts as class 1.
    """
    # Concatenate the labels of every batch. Keeping only the last batch
    # breaks as soon as the dataset has more than one batch.
    y_true = np.concatenate([np.asarray(batch[1]).ravel() for batch in labels])
    y_pred = (np.asarray(predictions).ravel() > p).astype(int)

    # labels=[0, 1] keeps the matrix 2x2 even when a class is absent, so the
    # indexing below cannot go out of range.
    cm = confusion_matrix(y_true, y_pred, labels=[0, 1])
    plt.figure(figsize=(5, 5))
    sns.heatmap(cm, annot=True, fmt="d")
    plt.title('Confusion matrix @{:.2f}'.format(p))
    plt.ylabel('Actual label')
    plt.xlabel('Predicted label')

    # Messages describe death events (this notebook's target) rather than the
    # fraud-detection wording they were copied with.
    print('Survivors Correctly Predicted (True Negatives): ', cm[0][0])
    print('Survivors Incorrectly Flagged (False Positives): ', cm[0][1])
    print('Death Events Missed (False Negatives): ', cm[1][0])
    print('Death Events Detected (True Positives): ', cm[1][1])
    print('Total Death Events: ', np.sum(cm[1]))

In [22]:
# Score the trained baseline on the validation dataset and list each metric.
baseline_results = baseline_model.evaluate(eval_ds)
baseline_report = zip(baseline_model.metrics_names, baseline_results)
for metric_name, metric_value in baseline_report:
    print(metric_name, ': ', metric_value)
print()

# Confusion matrix of the baseline's validation predictions.
baseline_predictions = baseline_model.predict(eval_ds)
plot_confusion_matrix(eval_ds, baseline_predictions)

# Performance of a weighted model

In [23]:
# Weight each class inversely to its frequency; scaling by total / 2 keeps
# the weights around 1.
total = neg_samples + pos_samples
half_total = total / 2.0
weight_for_0 = (1 / neg_samples) * half_total
weight_for_1 = (1 / pos_samples) * half_total

class_weight = {0: weight_for_0, 1: weight_for_1}

weight_messages = (
    ('Weight for class 0: {:.2f}', weight_for_0),
    ('Weight for class 1: {:.2f}', weight_for_1),
)
for message, weight in weight_messages:
    print(message.format(weight))

In [24]:
# Fresh model starting from the shared initial weights; named after what it
# is (it was previously labelled 'baseline_model' by copy-paste).
weighted_model = build_model(METRICS, 'weighted_model')
weighted_model.load_weights(initial_weights)

# Validation loss is a quantity to minimize, so the callback must use
# mode='min'. With mode='max' a rising loss counted as improvement and
# restore_best_weights brought back the weights with the highest loss.
early_stopping = tf.keras.callbacks.EarlyStopping(
    monitor='val_loss',
    verbose=1,
    patience=20,
    mode='min',
    restore_best_weights=True)

# class_weight makes errors on the minority class count more in the loss.
weighted_history = weighted_model.fit(train_ds, validation_data=eval_ds, epochs=100, callbacks=[early_stopping],
                                      class_weight=class_weight)

In [25]:
# Training curves of the class-weighted model.
plot_metrics(weighted_history)

In [26]:
# Score the class-weighted model on the validation dataset and list each metric.
weighted_results = weighted_model.evaluate(eval_ds)
weighted_report = zip(weighted_model.metrics_names, weighted_results)
for metric_name, metric_value in weighted_report:
    print(metric_name, ': ', metric_value)
print()

# Confusion matrix of the weighted model's validation predictions.
weighted_predictions = weighted_model.predict(eval_ds)
plot_confusion_matrix(eval_ds, weighted_predictions)

## SMOTE

In [27]:
# Gather the training features and labels from *every* batch. The following
# cells read only li[0] (features) and li[1] (labels), so the batches are
# concatenated into exactly those two entries; otherwise everything after
# the first batch would be silently ignored.
feature_batches = []
label_batches = []
for batch_features, batch_labels in train_ds:
    feature_batches.append(batch_features)
    label_batches.append(batch_labels)

li = [tf.concat(feature_batches, axis=0), tf.concat(label_batches, axis=0)]
li

In [28]:
# Boolean mask over li[1] (the labels): True where the label is non-zero.
bool_train_labels = li[1] != 0

In [29]:
# Number of entries whose mask is False, i.e. labels equal to 0.
len(bool_train_labels) - np.sum(bool_train_labels)

In [30]:
# Print the feature tensor of every training batch.
for batch in train_ds:
    batch_features = batch[0]
    print(batch_features)

In [31]:
# Split the collected features (li[0]) and labels (li[1]) by class using the
# boolean mask and its negation.
train_features, train_labels = li[0], li[1]

pos_features = train_features[bool_train_labels]
neg_features = train_features[~bool_train_labels]

pos_labels = train_labels[bool_train_labels]
neg_labels = train_labels[~bool_train_labels]

In [32]:
# Shuffle buffer size shared by the per-class datasets.
BUFFER_SIZE = 100000


def make_ds(features, labels):
    """Return an endlessly repeating, shuffled dataset of (feature, label) pairs."""
    pairs = tf.data.Dataset.from_tensor_slices((features, labels))
    shuffled = pairs.shuffle(BUFFER_SIZE)
    return shuffled.repeat()


# One infinite stream per class.
pos_ds = make_ds(pos_features, pos_labels)
neg_ds = make_ds(neg_features, neg_labels)

In [33]:
# Peek at one element of the positive stream.
for features, label in pos_ds.take(1):
    print("Features:\n", features.numpy())
    print()
    print("Label: ", label.numpy())

# Draw from the positive and negative streams with equal probability.
resampled_ds = tf.data.experimental.sample_from_datasets([pos_ds, neg_ds], weights=[0.5, 0.5])
# The former extra shuffle wrote its result to a misspelled name
# ('resmapled_ds') and therefore never took effect. It is dropped rather
# than "fixed": both source streams are already shuffled and the sampling
# above is random.
resampled_ds = resampled_ds.batch(1024).prefetch(2)

In [34]:
# Fraction of positive labels in one resampled batch.
for features, label in resampled_ds.take(1):
    batch_labels = label.numpy()
    print(batch_labels.mean())

In [35]:
# Steps needed to see roughly twice the number of negative training samples
# per epoch. The negative count is taken from the data instead of the
# hard-coded 164, which only matched one particular shuffle, and the result
# is an int because it is used as a step count.
# NOTE(review): the divisor 64 does not match the batch size of 1024 applied
# to resampled_ds -- confirm which batch size is intended.
resampled_steps_per_epoch = int(np.ceil(2.0 * len(neg_labels) / 64))
resampled_steps_per_epoch

In [36]:
# Fresh model starting from the shared initial weights.
resampled_model = build_model(METRICS, 'resampled_model')
resampled_model.load_weights(initial_weights)

# Overwrite the loaded output-layer bias with zero.
output_layer = resampled_model.layers[-1]
output_layer.bias.assign([0])

# resampled_ds repeats forever, so the epoch length is given explicitly.
resampled_history = resampled_model.fit(
    resampled_ds,
    validation_data=eval_ds,
    epochs=1000,
    callbacks=[early_stopping],  # callback object created in an earlier cell
    steps_per_epoch=resampled_steps_per_epoch,
)

In [37]:
# Loss and metrics of the resampled model on the validation dataset.
resampled_model.evaluate(eval_ds)

In [38]:
# Training curves of the resampled model.
plot_metrics(resampled_history)

In [39]:
# Score the resampled model on the validation dataset and list each metric.
resampled_results = resampled_model.evaluate(eval_ds)
resampled_report = zip(resampled_model.metrics_names, resampled_results)
for metric_name, metric_value in resampled_report:
    print(metric_name, ': ', metric_value)
print()

# Confusion matrix of the resampled model's validation predictions.
resampled_predictions = resampled_model.predict(eval_ds)
plot_confusion_matrix(eval_ds, resampled_predictions)

# SMOTE gives the best results seen so far, hence by using this technique optimizing recall score is possible even for a small dataset!