# GAN (Generative Adversarial Network)

## Setup

Load modules

In [1]:
# Load the TensorBoard notebook extension (registers the %tensorboard magic)
%load_ext tensorboard

# Clear any logs from previous runs. This shells out to `rm`, so it removes the
# whole ./logs/ tree, which is where the summary writers in this notebook write.
!rm -rf ./logs/ 

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import time
import os
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

Setup paths

In [3]:
# Directory holding all CSV inputs, relative to this notebook.
data_path = '../data'

# Training set: expression matrix plus the files labelling its rows (genes)
# and its columns (cells).
train_feature_path = '{}/tpm_combined.csv'.format(data_path)
train_gene_name_path = '{}/tpm_combined_rows.csv'.format(data_path)
train_cell_name_path = '{}/tpm_combined_cols.csv'.format(data_path)

# Held-out test set, laid out the same way as the training files.
test_feature_path = '{}/tpm_combined_test.csv'.format(data_path)
test_gene_name_path = '{}/tpm_combined_rows_test.csv'.format(data_path)
test_cell_name_path = '{}/tpm_combined_cols_test.csv'.format(data_path)

## Load data

Load datasets into frames and check all the shapes match up

In [4]:
# Training inputs. None of the CSV files has a header row.
#   df_gene_names    - one label per row of the expression matrix
#   df_cell_names    - one label per column of the expression matrix
#   df_training_data - the expression matrix itself (genes x cells)
df_gene_names = pd.read_csv(train_gene_name_path, header=None)
df_cell_names = pd.read_csv(train_cell_name_path, header=None)
df_training_data = pd.read_csv(train_feature_path, header=None)

print(df_gene_names.shape)
print(df_cell_names.shape)
print(df_training_data.shape)

# Enforce the shape check instead of relying on reading the printed output:
# matrix rows must line up with the gene labels and columns with the cell labels.
assert df_training_data.shape == (df_gene_names.shape[0], df_cell_names.shape[0]), (
    'Training matrix shape {} does not match {} gene labels x {} cell labels'.format(
        df_training_data.shape, df_gene_names.shape[0], df_cell_names.shape[0]))

(6807, 1)
(1798, 1)
(6807, 1798)


Load test data

In [5]:
# Test inputs, laid out like the training files (no header rows).
df_gene_names_test = pd.read_csv(test_gene_name_path, header=None)
df_cell_names_test = pd.read_csv(test_cell_name_path, header=None)
df_test_data = pd.read_csv(test_feature_path, header=None)

print(df_gene_names_test.shape)
print(df_cell_names_test.shape)
print(df_test_data.shape)

# Matrix rows must line up with the gene labels and columns with the cell labels.
assert df_test_data.shape == (df_gene_names_test.shape[0], df_cell_names_test.shape[0]), (
    'Test matrix shape {} does not match {} gene labels x {} cell labels'.format(
        df_test_data.shape, df_gene_names_test.shape[0], df_cell_names_test.shape[0]))

# The discriminator input width is derived from the training gene count, so the
# test matrix must have exactly the same number of gene rows.
assert df_test_data.shape[0] == df_training_data.shape[0], (
    'Test data has {} genes but training data has {}'.format(
        df_test_data.shape[0], df_training_data.shape[0]))

(6807, 1)
(500, 1)
(6807, 500)


The number of genes in the input dataset determines the size of the generator output as well as the size of the discriminator input

In [6]:
# Number of gene labels (rows of the label frame); used below to size the
# generator output layer and the discriminator input.
num_genes = len(df_gene_names)
df_gene_names.shape

(6807, 1)

Take a look at the training data

In [7]:
# Display the training matrix (rows are genes, columns are cells); the notebook
# renders a truncated view of the frame.
df_training_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1788,1789,1790,1791,1792,1793,1794,1795,1796,1797
0,0.437635,0.422952,0.500639,0.460063,0.000000,0.625194,0.636415,0.000000,0.000000,0.621385,...,0.000000,0.806046,0.461961,0.533844,0.691345,0.000000,0.479120,0.373748,0.779602,0.652176
1,0.000000,0.330974,0.599975,0.538749,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.806895,0.000000,0.000000,0.765089,0.000000,0.501173,0.315232,0.255972,0.528174
2,0.423527,0.412909,0.488703,0.000000,0.602726,0.522253,0.000000,0.541224,0.000000,0.000000,...,0.440998,0.837320,0.353001,0.536904,0.983997,0.000000,0.000000,0.614253,0.823296,0.000000
3,0.000000,0.379869,0.381094,0.137787,0.309987,0.571913,0.548388,0.384074,0.455198,0.469107,...,0.428848,0.816352,0.531461,0.244717,0.813174,0.000000,0.327914,0.048955,0.729328,0.346599
4,0.000000,0.666779,0.565507,0.350199,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.171824,0.000000,0.561451,0.339109,0.000000,0.000000,0.652654,0.036969,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6802,0.000000,0.000000,0.158096,0.404081,0.000000,0.000000,0.000000,0.720074,0.000000,0.490125,...,0.000000,0.609758,0.521244,0.000000,0.297598,0.000000,0.000000,0.525897,0.672412,0.000000
6803,0.000000,0.411294,0.481663,0.346585,0.315634,0.000000,0.307813,0.399935,0.519273,0.468670,...,0.541795,0.634956,0.370401,0.348183,0.443963,0.396523,0.646611,0.477193,0.585492,0.497864
6804,0.307292,0.582062,0.466408,0.000000,0.328192,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.574837,0.000000,0.592147,0.534678,0.000000,0.000000,0.584783,0.670761,0.000000
6805,0.000000,0.659579,0.540393,0.564257,0.000000,0.000000,0.000000,0.666009,0.000000,0.000000,...,0.000000,0.642897,0.555258,0.136405,0.577916,0.000000,0.459978,0.665896,0.659554,0.000000


## Define model variables - COMMENT ON EACH ONE TO DESCRIBE

In [26]:
# Model params
# Length of each noise vector fed to the generator
LATENT_VARIABLE_SIZE = 100
# Units in the generator's first and second dense layers
GEN_L1_DENSE_SIZE = 600
GEN_L2_DENSE_SIZE = 600
# Units in the generator's final dense layer: one output value per gene
GEN_L3_DENSE_SIZE = num_genes

# Width of a profile fed to the discriminator: one input value per gene
DIS_INPUT_SIZE = num_genes
# Units in the discriminator's first and second dense layers
DIS_L1_DENSE_SIZE = 200
DIS_L2_DENSE_SIZE = 200

# Standard deviation of the zero-mean normal component of the generator noise
NOISE_STDEV = 0.1
# Rate (`lam`) of the Poisson component of the generator noise
POISSON_LAM = 1

# Training params
# Number of real cell profiles per training batch
TRAIN_BATCH_SIZE = 10
# Shuffle buffer size for the training dataset
TRAIN_BUFFER_SIZE = 10000
# Number of cell profiles per test batch
TEST_BATCH_SIZE = 500
# Shuffle buffer size for the test dataset
TEST_BUFFER_SIZE = 500
# Number of noise vectors (and therefore generated profiles) per gen_noise() call
GEN_BATCH_SIZE = 10
# Number of full passes over the training dataset
EPOCHS = 30
# Learning rate used by both the generator and the discriminator Adam optimizers
LEARNING_RATE = 0.001

## Create training and test datasets

Create tensors from the training data - transpose so that each row is one cell, convert to float32, then shuffle and batch

In [9]:
# Transpose so each dataset element is one cell's profile across all genes,
# cast to float32, then shuffle and group into training batches.
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices(df_training_data.T.values.astype('float32'))
    .shuffle(TRAIN_BUFFER_SIZE)
    .batch(TRAIN_BATCH_SIZE)
)
print(train_dataset)

<BatchDataset shapes: (None, 6807), types: tf.float32>


In [10]:
# Same pipeline as the training set: one element per cell, float32 values,
# shuffled and grouped into test batches.
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(df_test_data.T.values.astype('float32'))
    .shuffle(TEST_BUFFER_SIZE)
    .batch(TEST_BATCH_SIZE)
)
print(test_dataset)

<BatchDataset shapes: (None, 6807), types: tf.float32>


## Define GAN model

Define function for constructing the generator

In [11]:
def create_generator():
    """Build the generator network.

    The model is a stack of three bias-free dense layers, each followed by a
    LeakyReLU activation (including the output layer). It takes a noise vector
    of length LATENT_VARIABLE_SIZE and produces GEN_L3_DENSE_SIZE values.

    Returns:
        An uncompiled tf.keras.Sequential model.
    """
    model = tf.keras.Sequential()

    # First layer declares the input shape so the model is built immediately.
    model.add(layers.Dense(GEN_L1_DENSE_SIZE, use_bias=False, input_shape=(LATENT_VARIABLE_SIZE,)))
    model.add(layers.LeakyReLU())

    # Second hidden layer and output layer share the same dense + LeakyReLU shape.
    for layer_width in (GEN_L2_DENSE_SIZE, GEN_L3_DENSE_SIZE):
        model.add(layers.Dense(layer_width, use_bias=False))
        model.add(layers.LeakyReLU())

    return model

Define function for constructing discriminator

In [12]:
def create_discriminator():
    """Build the discriminator network.

    Two bias-free dense layers with LeakyReLU activations are followed by a
    Flatten layer and a single-unit dense layer with no activation, so the
    model emits one raw score per input profile of width DIS_INPUT_SIZE.

    Returns:
        An uncompiled tf.keras.Sequential model.
    """
    return tf.keras.Sequential([
        # Hidden layer 1
        layers.Dense(DIS_L1_DENSE_SIZE, use_bias=False, input_shape=(DIS_INPUT_SIZE,)),
        layers.LeakyReLU(),
        # Hidden layer 2
        layers.Dense(DIS_L2_DENSE_SIZE, use_bias=False),
        layers.LeakyReLU(),
        # Output: one unbounded score per profile
        layers.Flatten(),
        layers.Dense(1),
    ])

Define the noise generation function

In [13]:
def gen_noise():
    """Draw one batch of latent noise for the generator.

    Each entry is the absolute value of the sum of a normal sample
    (mean 0, standard deviation NOISE_STDEV) and a Poisson sample
    (rate POISSON_LAM).

    Returns:
        A tensor of shape [GEN_BATCH_SIZE, LATENT_VARIABLE_SIZE].
    """
    noise_shape = [GEN_BATCH_SIZE, LATENT_VARIABLE_SIZE]
    gaussian_part = tf.random.normal(noise_shape, mean=0.0, stddev=NOISE_STDEV)
    poisson_part = tf.random.poisson(noise_shape, lam=POISSON_LAM)
    return tf.abs(gaussian_part + poisson_part)

Define the loss functions

In [14]:
# Binary cross-entropy loss object shared by the generator loss, the
# discriminator loss and the test loss. from_logits=True because it is applied
# to the discriminator's final Dense(1) output, which has no activation.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [15]:
def discriminator_loss(real_output, fake_output):
    """Discriminator loss: real profiles should score 1, generated ones 0.

    Args:
        real_output: Discriminator scores for a batch of real profiles.
        fake_output: Discriminator scores for a batch of generated profiles.

    Returns:
        The sum of the cross-entropy on the real batch (target all ones) and
        the cross-entropy on the generated batch (target all zeros).
    """
    real_targets = tf.ones_like(real_output)
    fake_targets = tf.zeros_like(fake_output)
    return cross_entropy(real_targets, real_output) + cross_entropy(fake_targets, fake_output)

In [16]:
def generator_loss(fake_output):
    """Generator loss: generated profiles should be scored as real.

    Args:
        fake_output: Discriminator scores for a batch of generated profiles.

    Returns:
        The cross-entropy of those scores against a target of all ones.
    """
    wanted_labels = tf.ones_like(fake_output)
    return cross_entropy(wanted_labels, fake_output)

## Define the training loops

In [17]:
# Input is a batch of real cell profiles from the training set
# @tf.function
def train_step(cell_profiles):
    """Apply one gradient update to both the generator and the discriminator.

    A fresh noise batch is turned into generated profiles, the discriminator
    scores the real and the generated batch, and both losses are recorded in
    the running metrics before each network's optimizer applies its gradients.

    Args:
        cell_profiles: Batch of real cell profiles fed to the discriminator.

    Returns:
        None.
    """
    latent_batch = gen_noise()

    # One tape per network, so each loss is differentiated independently.
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        fake_profiles = generator(latent_batch, training=True)

        real_scores = discriminator(cell_profiles, training=True)
        fake_scores = discriminator(fake_profiles, training=True)

        gen_loss = generator_loss(fake_scores)
        disc_loss = discriminator_loss(real_scores, fake_scores)

    # Fold this batch's losses into the running means.
    met_gen_loss(gen_loss)
    met_disc_loss(disc_loss)

    # Compute both gradient sets before either network is modified.
    gen_vars = generator.trainable_variables
    disc_vars = discriminator.trainable_variables
    gen_grads = gen_tape.gradient(gen_loss, gen_vars)
    disc_grads = disc_tape.gradient(disc_loss, disc_vars)

    generator_optimizer.apply_gradients(zip(gen_grads, gen_vars))
    discriminator_optimizer.apply_gradients(zip(disc_grads, disc_vars))

## Create GAN model

Create generator and discriminator

In [18]:
# Instantiate the two networks. These module-level names are the ones
# train_step() and the training loop refer to.
generator = create_generator()
discriminator = create_discriminator()

Define optimizer

In [19]:
# One Adam optimizer per network, configured identically (same LEARNING_RATE,
# betas and epsilon) but kept as separate instances.
generator_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

## Create checkpoints

In [20]:
# Directory and file prefix passed to checkpoint.save() in the training loop
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# A single checkpoint object tracking both networks and both optimizers
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 discriminator_optimizer=discriminator_optimizer,
                                 generator=generator,
                                 discriminator=discriminator)

## Generate from random noise to check the network

In [21]:
# Smoke test: push one batch of noise through the generator and report the
# shape and value range of what comes out.
noise = gen_noise()
generated_profile = generator(noise, training=False)
generated_values = generated_profile.numpy()
print(generated_profile.shape)
print(generated_values.min())
print(generated_values.max())

# Score the generated batch with the discriminator (one raw score per profile).
decision = discriminator(generated_profile)
print(decision.numpy())

(10, 6807)
-0.22056821
0.6908798
[[0.01891878]
 [0.06468181]
 [0.3347569 ]
 [0.08462244]
 [0.16778599]
 [0.04435993]
 [0.0873422 ]
 [0.12802176]
 [0.04316174]
 [0.16580933]]


## Train the GAN

Define tensorboard metrics

In [22]:
# Running means of the losses. met_gen_loss and met_disc_loss are updated once
# per batch inside train_step(); met_test_loss is updated in the training loop.
# All three are logged and then reset at the end of every epoch.
met_gen_loss = tf.keras.metrics.Mean('gen_loss', dtype=tf.float32)
met_disc_loss = tf.keras.metrics.Mean('disc_loss', dtype=tf.float32)
met_test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)

Create log directories

In [23]:
# Timestamp the run so each execution logs into its own directory.
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")
log_root = 'logs/gradient_tape/' + current_time

# One sub-directory per series, plus 'all' for the combined view.
gen_log_dir = log_root + '/gen_train'
disc_log_dir = log_root + '/disc_train'
test_log_dir = log_root + '/disc_test'
all_log_dir = log_root + '/all'

# Summary writers used by the training loop.
all_summary_writer = tf.summary.create_file_writer(all_log_dir)
gen_summary_writer = tf.summary.create_file_writer(gen_log_dir)
disc_summary_writer = tf.summary.create_file_writer(disc_log_dir)
test_summary_writer = tf.summary.create_file_writer(test_log_dir)

Run the training model

In [24]:
print('Running...')

for epoch in range(EPOCHS):
    start = time.time()

    # Train the epoch: one generator + discriminator update per batch of real profiles
    for data_batch in train_dataset:
        train_step(data_batch)

    # Run test data through the discriminator. Real test profiles are scored
    # against a target of all ones. The loss is recorded inside the loop so that
    # every test batch contributes to the epoch mean, not only the last one.
    for data_batch in test_dataset:
        test_decision = discriminator(data_batch, training=False)
        test_loss = cross_entropy(tf.ones_like(test_decision), test_decision)
        met_test_loss(test_loss)

    # Log metrics: the 'all' writer gets the three series side by side, and each
    # dedicated writer gets its own series under the shared tag '1_loss'.
    with all_summary_writer.as_default():
        tf.summary.scalar('2_gen_loss', met_gen_loss.result(), step=epoch)
        tf.summary.scalar('3_disc_loss', met_disc_loss.result(), step=epoch)
        tf.summary.scalar('3_test_loss', met_test_loss.result(), step=epoch)

    with gen_summary_writer.as_default():
        tf.summary.scalar('1_loss', met_gen_loss.result(), step=epoch)

    with disc_summary_writer.as_default():
        tf.summary.scalar('1_loss', met_disc_loss.result(), step=epoch)

    with test_summary_writer.as_default():
        tf.summary.scalar('1_loss', met_test_loss.result(), step=epoch)

    # Every 10th epoch: report how long that epoch took and save a checkpoint
    if (epoch + 1) % 10 == 0:
        print ('Time for epoch {} is {} sec.'.format(epoch + 1, time.time()-start))
        checkpoint.save(file_prefix = checkpoint_prefix)

    # Log stats
    template = 'Epoch {}, Gen_loss: {}, Disc_loss: {}, Test_loss: {}'
    print (template.format(epoch+1,
                           met_gen_loss.result(),
                           met_disc_loss.result(),
                           met_test_loss.result()))

    # Reset metrics every epoch so each epoch reports its own means
    met_gen_loss.reset_states()
    met_disc_loss.reset_states()
    met_test_loss.reset_states()

Running...
Epoch 1, Gen_loss: 1192.5345458984375, Disc_loss: 111.31814575195312, Test_loss: 26.741315841674805
Epoch 2, Gen_loss: 8.587278366088867, Disc_loss: 13.908161163330078, Test_loss: 0.8916416764259338
Epoch 3, Gen_loss: 11.624980926513672, Disc_loss: 4.125429630279541, Test_loss: 1.1865520477294922
Epoch 4, Gen_loss: 30.56155014038086, Disc_loss: 2.6224136352539062, Test_loss: 1.1014779806137085
Epoch 5, Gen_loss: 32.0594482421875, Disc_loss: 2.7788422107696533, Test_loss: 0.3355412185192108
Epoch 6, Gen_loss: 19.79607582092285, Disc_loss: 0.7746967673301697, Test_loss: 0.27555131912231445
Epoch 7, Gen_loss: 16.76173973083496, Disc_loss: 0.762294590473175, Test_loss: 0.41003280878067017
Epoch 8, Gen_loss: 7.910008430480957, Disc_loss: 0.6947462558746338, Test_loss: 0.8840394020080566
Epoch 9, Gen_loss: 19.745668411254883, Disc_loss: 0.8389370441436768, Test_loss: 0.7124609351158142
Time for epoch 10 is 11.10207462310791 sec.
Epoch 10, Gen_loss: 11.881868362426758, Disc_loss: 0

Start tensorboard

In [25]:
# TensorBoard launch is left commented out.
# NOTE(review): `train_log_dir` is not defined anywhere in this notebook. The
# summary writers log under 'logs/gradient_tape/', so point --logdir there
# before enabling this line.
#%tensorboard --logdir {train_log_dir} --host localhost --port 6006