# GANN - CHECK SPELLING

## Setup

Load modules

In [1]:
# Load the TensorBoard notebook extension
%load_ext tensorboard

# Clear any logs from previous runs
!rm -rf ./logs/ 

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import time
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler

Setup paths

In [3]:
data_path = '../data'
feature_path = data_path + '/tpm_combined.csv'
gene_name_path = data_path + '/tpm_combined_rows.csv'
cell_name_path = data_path + '/tpm_combined_cols.csv'

## Load data

Load datasets into frames and check all the shapes match up

In [4]:
df_gene_names = pd.read_csv(gene_name_path, header=None)
df_cell_names = pd.read_csv(cell_name_path, header=None)
df_training_data = pd.read_csv(feature_path, header=None)

print(df_gene_names.shape)
print(df_cell_names.shape)
print(df_training_data.shape)

(19248, 1)
(771, 1)
(19248, 771)


The number of genes in the input dataset determines the generator output as well as the dicriminator inputs

In [5]:
num_genes = df_gene_names.shape[0]
df_gene_names.shape

(19248, 1)

Take a look at the training data

In [6]:
df_training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 19248 entries, 0 to 19247
Columns: 771 entries, 0 to 770
dtypes: float64(771)
memory usage: 113.2 MB


In [7]:
df_training_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,761,762,763,764,765,766,767,768,769,770
0,4.217231,3.003602,4.209453,0.000000,5.296824,5.300856,5.587965,3.826803,3.414136,4.888013,...,4.441616,3.732269,3.347666,3.945795,3.503349,1.992768,0.739848,4.901108,4.623516,4.512859
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,3.547203,0.000000,0.000000,0.000000,0.000000,3.368768,0.000000,0.000000,1.269033,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6.821838,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.835924,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
4,3.109361,5.938286,5.093391,0.000000,0.000000,2.693766,4.627607,6.537141,3.842979,2.786596,...,0.000000,2.545968,0.641546,0.000000,3.670161,3.389567,5.522307,3.014355,6.703627,2.608809
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19243,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
19244,0.000000,5.440952,2.097611,0.000000,0.000000,3.385431,3.339137,0.000000,5.599318,0.000000,...,6.193378,6.386294,2.114367,6.742680,0.000000,5.936402,0.000000,0.000000,6.265287,0.000000
19245,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
19246,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Check max values

In [8]:
training_data_max = df_training_data.max()
training_data_max = training_data_max.max()
print(training_data_max)

19.161062384674267


## Pre-process training data

Normalise input data

In [9]:
np_training_data = df_training_data.T.values
scaler = MinMaxScaler()
print(scaler.fit(np_training_data))

# Check which dimension we are fitting to - if we are fitting to gene expression then should be equal to number of genes
print(scaler.data_max_.shape)

MinMaxScaler(copy=True, feature_range=(0, 1))
(19248,)


In [10]:
np_training_data_norm = np.transpose(scaler.transform(np_training_data))
np_training_data_norm.shape

(19248, 771)

Get max values for noise generation

In [11]:
training_data_max = np_training_data_norm.max()
training_data_max = training_data_max.max()
print(training_data_max)

1.0000000000000002


## Define model variables - COMMENT ON EACH ONE TO DESCRIBE

In [12]:
# Model params
LATENT_VARIABLE_SIZE = 100
GEN_L1_DENSE_SIZE = 600
GEN_L2_DENSE_SIZE = 600
GEN_L3_DENSE_SIZE = num_genes

DIS_INPUT_SIZE = num_genes
DIS_L1_DENSE_SIZE = 200
DIS_L2_DENSE_SIZE = 200

NOISE_STDEV = training_data_max / 10
POISSON_LAM = 1

# Training params
TRAIN_BATCH_SIZE = 10
GEN_BATCH_SIZE = 10
BUFFER_SIZE = 10000
EPOCHS = 200
LEARNING_RATE = 0.001

In [13]:
print(NOISE_STDEV)

0.10000000000000002


## Create training dataset

Create tensors from training data - Convert to Int32 for better work on GPU with batch and shuffle

In [14]:
train_dataset = tf.data.Dataset.from_tensor_slices(df_training_data.T.values.astype('float32')).shuffle(BUFFER_SIZE).batch(TRAIN_BATCH_SIZE)
print(train_dataset)

<BatchDataset shapes: (None, 19248), types: tf.float32>


## Define GANN model

Define function for contructing the generator

In [15]:
def create_generator():
    model = tf.keras.Sequential()
    
    #L1
    model.add(layers.Dense(GEN_L1_DENSE_SIZE, use_bias=False, input_shape=(LATENT_VARIABLE_SIZE,)))
    #model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    #assert model.output_shape == (None, GEN_L1_DENSE_SIZE, 1)  # Note: None is the batch size
    
    #L2
    model.add(layers.Dense(GEN_L2_DENSE_SIZE, use_bias=False))
    #model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    #assert model.output_shape == (None, GEN_L2_DENSE_SIZE, 1)
    
    #L3
    model.add(layers.Dense(GEN_L3_DENSE_SIZE, use_bias=False))
    #model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    #assert model.output_shape == (None, GEN_L3_DENSE_SIZE, 1)
    
    return model

Define function for constructing discriminator

In [16]:
def create_discriminator():
    model = tf.keras.Sequential()
    
    #L1
    model.add(layers.Dense(DIS_L1_DENSE_SIZE, use_bias=False, input_shape=(DIS_INPUT_SIZE,)))
    model.add(layers.LeakyReLU())
    #model.add(layers.Dropout(0.3))
    
    #L2
    model.add(layers.Dense(DIS_L2_DENSE_SIZE, use_bias=False))
    model.add(layers.LeakyReLU())
    #model.add(layers.Dropout(0.3))
    
    #L3
    model.add(layers.Flatten())
    model.add(layers.Dense(1))
    
    return model

Define the noise generation function

In [17]:
def gen_noise():
    # Create some random noise for the generator
    n_noise = tf.random.normal([GEN_BATCH_SIZE, LATENT_VARIABLE_SIZE], mean=0.0, stddev=NOISE_STDEV)
    p_noise = tf.random.poisson([GEN_BATCH_SIZE, LATENT_VARIABLE_SIZE], lam=POISSON_LAM)
    noise = tf.abs(n_noise + p_noise)
    return noise

Define the loss functions

In [18]:
# This method returns a helper function to compute cross entropy loss
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [19]:
def discriminator_loss(real_output, fake_output):
    real_loss = cross_entropy(tf.ones_like(real_output), real_output)
    fake_loss = cross_entropy(tf.zeros_like(fake_output), fake_output)
    total_loss = real_loss + fake_loss
    return total_loss
    
    #total_loss = tf.reduce_mean(real_output) - tf.reduce_mean(fake_output)
    #return total_loss

In [20]:
def generator_loss(fake_output):
    return cross_entropy(tf.ones_like(fake_output), fake_output)
    #total_loss = -tf.reduce_mean(fake_output)
    #return total_loss

## Define the training loops

In [21]:
# Input is a batch of real cell profiles from the training set
# @tf.function
def train_step(cell_profiles):
    noise = gen_noise()
    
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        generated_profiles = generator(noise, training=True)
        
        real_output = discriminator(cell_profiles, training=True)
        fake_output = discriminator(generated_profiles, training=True)

        gen_loss = generator_loss(fake_output)
        disc_loss = discriminator_loss(real_output, fake_output)
        
        met_gen_loss(gen_loss)
        met_disc_loss(disc_loss)
        
    gradients_of_generator = gen_tape.gradient(gen_loss, generator.trainable_variables)
    gradients_of_discriminator = disc_tape.gradient(disc_loss, discriminator.trainable_variables)

    generator_optimizer.apply_gradients(zip(gradients_of_generator, generator.trainable_variables))
    discriminator_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
    
    return

## Create GANN model

Create generator and discriminator

In [22]:
generator = create_generator()
discriminator = create_discriminator()

Define optimizer

In [23]:
generator_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

## Generate from test data to check network

In [24]:
noise = gen_noise()
generated_profile = generator(noise, training=False)
print(generated_profile.shape)
print(generated_profile.numpy().min())
print(generated_profile.numpy().max())

decision = discriminator(generated_profile)
print(decision.shape)
#print(decision.numpy())

(10, 19248)
-0.14763017
0.4243678
(10, 1)


## Train the GANN

Define tensorboard metrics

In [25]:
met_gen_loss = tf.keras.metrics.Mean('gen_loss', dtype=tf.float32)
met_disc_loss = tf.keras.metrics.Mean('disc_loss', dtype=tf.float32)

Create log directories

In [26]:
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
train_log_dir = 'logs/gradient_tape/' + current_time + '/train'
train_summary_writer = tf.summary.create_file_writer(train_log_dir)

Run the training model

In [None]:
print('Running...')

for epoch in range(EPOCHS):
    start = time.time()
    
    #Train the epoch
    for data_batch in train_dataset:
        train_step(data_batch)
    
    #Log metrics
    with train_summary_writer.as_default():
        tf.summary.scalar('gen_loss', met_gen_loss.result(), step=epoch)
        tf.summary.scalar('disc_loss', met_disc_loss.result(), step=epoch)
    
    #Do some basic time logging
    if (epoch + 1) % 10 == 0:
        print ('Time for epoch {} is {} sec.'.format(epoch + 1, time.time()-start))
    else:
        time.time()
    
    #Log stats
    template = 'Epoch {}, Gen_loss: {}, Disc_loss: {}'
    print (template.format(epoch+1,
                           met_gen_loss.result(), 
                           met_disc_loss.result()))
    
    # Reset metrics every epoch
    met_gen_loss.reset_states()
    met_disc_loss.reset_states()

Running...
Epoch 1, Gen_loss: 26.995113372802734, Disc_loss: 13.918001174926758
Epoch 2, Gen_loss: 139.33612060546875, Disc_loss: 25.888126373291016
Epoch 3, Gen_loss: 8845.865234375, Disc_loss: 977.6671752929688
Epoch 4, Gen_loss: 5823.51025390625, Disc_loss: 391.9749450683594
Epoch 5, Gen_loss: 186.37045288085938, Disc_loss: 11.700782775878906
Epoch 6, Gen_loss: 325.5460510253906, Disc_loss: 23.696300506591797
Epoch 7, Gen_loss: 48.48832321166992, Disc_loss: 4.544403076171875
Epoch 8, Gen_loss: 142.6480712890625, Disc_loss: 4.089254856109619
Epoch 9, Gen_loss: 61.97325134277344, Disc_loss: 3.2251739501953125
Time for epoch 10 is 10.437034845352173 sec.
Epoch 10, Gen_loss: 43.58432388305664, Disc_loss: 2.6628577709198
Epoch 11, Gen_loss: 22.50831413269043, Disc_loss: 3.883500337600708
Epoch 12, Gen_loss: 33.06168746948242, Disc_loss: 2.6860525608062744
Epoch 13, Gen_loss: 110.39103698730469, Disc_loss: 7.737926959991455
Epoch 14, Gen_loss: 55.52409362792969, Disc_loss: 5.7330121994018

Start tensorboard

In [None]:
#%tensorboard --logdir {train_log_dir} --host localhost --port 6006