<a href="https://colab.research.google.com/github/noo-rashbass/synthetic-data-service/blob/master/Evaluation/discriminative_model_NEW_Lulu.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [31]:
import tensorflow as tf
from tensorflow.keras.models import model_from_json

import numpy as np
import pandas as pd

from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import train_test_split

Understanding the classification report: https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html#sklearn.metrics.classification_report

# Functions for Loading Data

Lulu's Notes:

* separated data loading into functions
* `InputSize()` now also returns `hidden_dim`
* replaced `mix_divide()` with `train_val_test_split()`

In [13]:
def reshape_removena_stack(ori_data, seq_len=10):
  """Group consecutive rows into fixed-length sequences and drop sequences with NaN.

  Args:
    - ori_data: array-like whose first axis is cut into consecutive chunks
    - seq_len: number of consecutive rows per sequence (default 10)

  Returns:
    - list of arrays, each holding `seq_len` consecutive rows of `ori_data`
      and containing no NaN value (empty list for empty input)

  Raises:
    - ValueError: if `seq_len` is not positive, or the number of rows is not
      a multiple of `seq_len`
  """
  ori_data = np.asarray(ori_data)
  n_rows = np.shape(ori_data)[0]

  if seq_len <= 0:
    raise ValueError('seq_len must be a positive integer, got %r' % (seq_len,))

  n_sequences, leftover = divmod(n_rows, seq_len)
  if leftover:
    raise ValueError(
        'number of rows (%d) is not a multiple of seq_len (%d)' % (n_rows, seq_len))
  if n_sequences == 0:
    return []

  # Pass an integer section count: a float count (rows / 10) can slip past
  # np.split's divisibility check and silently produce uneven chunks.
  sequences = np.split(ori_data, int(n_sequences), axis=0)
  return [seq for seq in sequences if not np.isnan(seq).any()]

def load_DoppelGANger():
  """Load the original and generated feature arrays used for the DoppelGANger evaluation.

  Both files are read with `np.load` from paths relative to the current
  working directory.

  Returns:
    - ori_data: contents of 'ori_features_prism.npy'
    - gen_data: contents of 'features_600.npy'
  """
  real_path, synthetic_path = 'ori_features_prism.npy', 'features_600.npy'
  return np.load(real_path), np.load(synthetic_path)

def load_tGAN():
  """Load the original and generated tGAN datasets, truncated to the same number of sequences.

  The original data is read from CSV as a flat table of visits, grouped into
  sequences by `reshape_removena_stack` (which also drops sequences with NaN),
  and the generated array is cut to the same number of sequences.

  Returns:
    - ori_data: list of per-sequence arrays built from 'cat_time_10visits_all_noid.csv'
    - gen_data: leading slice of 'gen_cat_time_10visits_wl_5000it.npy'
  """
  # Author's note: the raw table is (12390 patient visits, 10 features).
  visits = pd.read_csv('cat_time_10visits_all_noid.csv').values
  # Author's note: after grouping/filtering this is (841 patients, 10 visits, 10 features).
  ori_data = reshape_removena_stack(visits)

  # Keep only as many generated sequences as there are original ones.
  n_original = np.shape(ori_data)[0]
  gen_data = np.load('gen_cat_time_10visits_wl_5000it.npy')[:n_original]

  return ori_data, gen_data

In [3]:
def MinMaxScaler(data):
  """Rescale data to roughly [0, 1] using the min and max taken along axis 0.

  Normalisation method copied from the TGAN code. Lulu: not used in this notebook.

  Args:
    - data: original data

  Returns:
    - norm_data: (data - min) / (max - min + 1e-7), with min/max computed along axis 0
  """
  lowest = np.min(data, 0)
  spread = np.max(data, 0) - lowest
  # The small constant keeps the division defined when max equals min.
  return (data - lowest) / (spread + 1e-7)


def InputSize(ori_data): # Set the input size to the model
    """Derive the model input shape and a default hidden width from 3-D data.

    Args:
      - ori_data: array-like with exactly three axes; the size of the last
        axis is used as the feature dimension

    Returns:
      - input_dim: `[None, dim]`, i.e. an unspecified first entry followed by the feature dimension
      - hidden_dim: half the feature dimension (rounded down), but never less than 1

    Raises:
      - ValueError: if `ori_data` does not have exactly three axes
    """
    _, _, dim = np.asarray(ori_data).shape
    # Clamp to 1 so single-feature data does not yield a zero-width layer.
    hidden_dim = max(1, dim // 2)
    input_dim = [None, dim]
    return input_dim, hidden_dim # Lulu: added hidden_dim and renamed input_size because of later conflict

In [46]:
def train_val_test_split(ori_data, gen_data, rate=(0.65, 0.2, 0.15), random_state=None): # Lulu: using sklearn, replaces mix_divide
  """Label original/generated samples and split them into train, validation and test sets.

  Original samples are labelled 1 and generated samples 0; both are
  concatenated and then split twice with sklearn's `train_test_split`.

  Args:
    - ori_data: original samples (concatenated along axis 0 with `gen_data`)
    - gen_data: generated samples
    - rate: (train, val, test) fractions; three positive numbers summing to one
    - random_state: forwarded to both `train_test_split` calls; pass an int
      for a reproducible split (default None keeps the split random)

  Returns:
    - train_data, val_data, test_data, train_labels, val_labels, test_labels

  Raises:
    - ValueError: if `rate` is not three positive fractions summing to one
  """
  if len(rate) != 3 or any(r <= 0 for r in rate) or not np.isclose(sum(rate), 1.0):
    raise ValueError(
        'rate must be three positive fractions (train, val, test) summing to one, got %r' % (rate,))
  train_rate, val_rate, test_rate = rate

  data = np.concatenate([ori_data, gen_data], axis=0)
  labels = np.concatenate([np.ones(len(ori_data)), np.zeros(len(gen_data))], axis=0)

  # First carve off the test set, then divide the remainder into train and validation.
  train_data, test_data, train_labels, test_labels = train_test_split(
      data, labels, test_size=test_rate, random_state=random_state)
  # The train fraction is re-expressed relative to what is left after removing the test set.
  train_data, val_data, train_labels, val_labels = train_test_split(
      train_data, train_labels, train_size=train_rate / (train_rate + val_rate),
      random_state=random_state)
  return train_data, val_data, test_data, train_labels, val_labels, test_labels

# Define Model

Lulu's Notes:

* No normalisation used
* Added an intermediate dense layer, which improved the score. This also makes the class much more flexible between "more features, shorter sequences" and "fewer features, longer sequences"
* Changed loss to `BinaryCrossentropy`

In [38]:
def discriminative_model(input_size, hidden_dim):
    """Build and compile a GRU-based binary classifier.

    Args:
      - input_size: per-sample shape passed to `tf.keras.Input` (this notebook passes `[None, dim]`)
      - hidden_dim: number of units in the GRU layer and in the intermediate dense layer

    Returns:
      - model: compiled `tf.keras.Model` with one sigmoid output, the Adam optimizer
        and binary cross-entropy loss
    """
    layers = tf.keras.layers

    sequence_in = tf.keras.Input(shape=input_size)
    # No normalisation is applied to the inputs.

    # The GRU returns both the full output sequence and its final state; only the final state is used.
    recurrent = layers.GRU(hidden_dim, return_sequences=True, return_state=True)
    _, final_state = recurrent(sequence_in)

    # Lulu: added intermediate dense layer with increased dimension, scores much better
    hidden = layers.Dense(hidden_dim)(final_state)
    # Single-unit, pre-sigmoid output (earlier author's note: this is the role of y_hat_logit in the original code).
    logit = layers.Dense(1)(hidden)

    # Earlier author's note: the original code compared the logit with the 0/1 label while using the
    # sigmoid output (y_hat) as the prediction. Lulu: that is OK, there are losses of that type.
    # Here the sigmoid output is used as the prediction result.
    probability = layers.Activation(tf.keras.activations.sigmoid)(logit)  # Lulu: might not need separate activation layer

    classifier = tf.keras.Model(inputs=sequence_in, outputs=[probability])
    # Lulu: I think this is a better choice of loss for us
    classifier.compile(optimizer="adam", loss=tf.keras.losses.BinaryCrossentropy())

    return classifier
                         


# tGAN

## Train

In [52]:
# Load the tGAN data and split it into train / validation / test sets
# (label 1 = original data, 0 = generated data).
ori_data_tgan, gen_data_tgan = load_tGAN()
splits_tgan = train_val_test_split(ori_data=ori_data_tgan, gen_data=gen_data_tgan)
(train_data_tgan, val_data_tgan, test_data_tgan,
 train_labels_tgan, val_labels_tgan, test_labels_tgan) = splits_tgan

# Check shapes:
for array in splits_tgan:
  print(np.shape(array))


# Size the model from the data, then train it while monitoring the validation split.
input_dim, hidden_dim = InputSize(ori_data_tgan)
model_tgan = discriminative_model(input_size=input_dim, hidden_dim=hidden_dim)

history_model_tgan = model_tgan.fit(
    train_data_tgan,
    train_labels_tgan,
    batch_size=128,
    epochs=200,
    validation_data=(val_data_tgan, val_labels_tgan),
)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

## Evaluate

In [48]:
# Keras built-in evaluation: the model is compiled with a loss only, so this shows the test loss.
model_tgan.evaluate(x=test_data_tgan, y=test_labels_tgan)



0.007770351134240627

In [53]:
# Sigmoid outputs on the test split, rounded to hard 0/1 predictions.
test_raw_pred_tgan = model_tgan.predict(test_data_tgan)
test_pred_tgan = test_raw_pred_tgan.round()

# More detailed classification report using sklearn (label 1 = original data, 0 = generated data).
report_tgan = classification_report(test_labels_tgan, test_pred_tgan, digits=5)
print(report_tgan)

              precision    recall  f1-score   support

         0.0    0.98561   1.00000   0.99275       137
         1.0    1.00000   0.98276   0.99130       116

    accuracy                        0.99209       253
   macro avg    0.99281   0.99138   0.99203       253
weighted avg    0.99221   0.99209   0.99209       253



In [70]:
# Fraction of test samples labelled 1 (original data), i.e. the accuracy of a classifier
# that always predicts "original".
# NOTE(review): this is not the majority-class baseline when label 0 is more frequent — confirm intent.
exp_acc_tgan = np.sum(test_labels_tgan) / len(test_labels_tgan)
final_acc_tgan = accuracy_score(test_labels_tgan, test_pred_tgan)

print('Expected accuracy for an untrained discriminative model = ', exp_acc_tgan)
print('Final accuracy of trained discriminative model = ', final_acc_tgan)

Expected accuracy for an untrained discriminative model =  0.45849802371541504
Final accuracy of trained discriminative model =  0.9920948616600791


# DoppelGANger

## Train

Lulu: I chose to increase the hidden dimension to 64 because there are only 5 features. This allows the additional dense layer to train from the longer sequences of 130 (compared to length 10 in the tGAN output). Accuracy improved significantly.

In [61]:
# Load the DoppelGANger data and split it into train / validation / test sets
# (label 1 = original data, 0 = generated data).
ori_data_dop, gen_data_dop = load_DoppelGANger()
splits_dop = train_val_test_split(ori_data=ori_data_dop, gen_data=gen_data_dop)
(train_data_dop, val_data_dop, test_data_dop,
 train_labels_dop, val_labels_dop, test_labels_dop) = splits_dop

# Check shapes
for array in splits_dop:
  print(np.shape(array))


# Only input_dim is taken from the data here: the hidden width returned by InputSize is
# deliberately overridden with 64 when building the model.
input_dim, hidden_dim = InputSize(ori_data_dop)
model_dop = discriminative_model(input_size=input_dim, hidden_dim=64)

history_model_dop = model_dop.fit(
    train_data_dop,
    train_labels_dop,
    batch_size=128,
    epochs=100,
    validation_data=(val_data_dop, val_labels_dop),
)


(1750, 130, 5)
(539, 130, 5)
(405, 130, 5)
(1750,)
(539,)
(405,)
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73

## Evaluate

In [62]:
# Keras built-in evaluation: the model is compiled with a loss only, so this shows the test loss.
model_dop.evaluate(x=test_data_dop, y=test_labels_dop)



0.11682559549808502

In [63]:
# Sigmoid outputs on the test split, rounded to hard 0/1 predictions.
test_raw_pred_dop = model_dop.predict(test_data_dop)
test_pred_dop = test_raw_pred_dop.round()

# More detailed classification report using sklearn (label 1 = original data, 0 = generated data).
report_dop = classification_report(test_labels_dop, test_pred_dop, digits=5)
print(report_dop)

              precision    recall  f1-score   support

         0.0    1.00000   0.90674   0.95109       193
         1.0    0.92174   1.00000   0.95928       212

    accuracy                        0.95556       405
   macro avg    0.96087   0.95337   0.95518       405
weighted avg    0.95903   0.95556   0.95537       405



In [69]:
# Fraction of test samples labelled 1 (original data), i.e. the accuracy of a classifier
# that always predicts "original".
# NOTE(review): this is not the majority-class baseline when label 0 is more frequent — confirm intent.
exp_acc_dop = np.sum(test_labels_dop) / len(test_labels_dop)
final_acc_dop = accuracy_score(test_labels_dop, test_pred_dop)

print('Expected accuracy for an untrained discriminative model = ', exp_acc_dop)
print('Final accuracy of trained discriminative model = ', final_acc_dop)

Expected accuracy for an untrained discriminative model =  0.5234567901234568
Final accuracy of trained discriminative model =  0.9555555555555556
