# GANN - Generative Adversarial Neural Network

## Setup

Load modules

In [1]:
# Load the TensorBoard notebook extension (IPython magic, only valid inside a notebook)
%load_ext tensorboard

# Clear any logs from previous runs: recursively deletes ./logs/ relative to the
# notebook's working directory, so summaries from older runs are not kept
!rm -rf ./logs/ 

In [2]:
from __future__ import absolute_import, division, print_function, unicode_literals

import time
import os
import datetime
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

Setup paths

In [3]:
# Root folder for every CSV this notebook reads or writes
data_path = '../data'

# Training set: expression matrix plus its row (gene) and column (cell) label files
train_feature_path = f'{data_path}/tpm_combined.csv'
train_gene_name_path = f'{data_path}/tpm_combined_rows.csv'
train_cell_name_path = f'{data_path}/tpm_combined_cols.csv'

# Test set: same three-file layout as the training set
test_feature_path = f'{data_path}/tpm_combined_test.csv'
test_gene_name_path = f'{data_path}/tpm_combined_rows_test.csv'
test_cell_name_path = f'{data_path}/tpm_combined_cols_test.csv'

# Un-normalised training data (only its value range is used later on)
train_nonorm_path = f'{data_path}/tpm_combined_train_nonorm.csv'

## Load data

Load datasets into frames and check all the shapes match up

In [4]:
# Read the three training CSVs; header=None means the first row is data, not column names
df_gene_names, df_cell_names, df_training_data = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (train_gene_name_path, train_cell_name_path, train_feature_path)
)

# Name the single gene-label column so later cells can refer to it as 'gene_name'
df_gene_names.columns = ['gene_name']

# Shape check: expect (genes, 1), (cells, 1), (genes, cells)
for loaded_frame in (df_gene_names, df_cell_names, df_training_data):
    print(loaded_frame.shape)

(6807, 1)
(1798, 1)
(6807, 1798)


In [5]:
# Load the un-normalised training matrix (without its gene-name column) only to
# record its overall value range
df_training_data_nonorm = pd.read_csv(train_nonorm_path).drop(columns=['gene_name'])

# Per-column extremes first, then the extreme across those
nonorm_max = df_training_data_nonorm.max().max()
nonorm_min = df_training_data_nonorm.min().min()

# The frame itself is not used again, so drop the reference
del df_training_data_nonorm

for bound in (nonorm_max, nonorm_min):
    print(bound)

17.828987016884007
0.0


Load test data

In [6]:
# Read the three test CSVs the same way as the training ones (no header row)
df_gene_names_test, df_cell_names_test, df_test_data = (
    pd.read_csv(csv_path, header=None)
    for csv_path in (test_gene_name_path, test_cell_name_path, test_feature_path)
)

# Shape check: expect (genes, 1), (cells, 1), (genes, cells)
for loaded_frame in (df_gene_names_test, df_cell_names_test, df_test_data):
    print(loaded_frame.shape)

(6807, 1)
(500, 1)
(6807, 500)


The number of genes in the input dataset determines the generator output size as well as the discriminator input size.

In [7]:
# One row per gene in the gene-name frame; this count sizes the generator output
# and the discriminator input
num_genes = len(df_gene_names)
df_gene_names.shape

(6807, 1)

Take a look at the training data

In [8]:
# Display the training matrix (rows are genes, columns are cells, matching the shapes printed above)
df_training_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1788,1789,1790,1791,1792,1793,1794,1795,1796,1797
0,0.593519,0.724994,0.259323,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.397072,0.679657,0.541898,0.000000,0.000000,0.567361,0.442711,0.000000,0.000000
1,0.529515,0.730789,0.000000,0.291240,0.356175,0.000000,0.000000,0.252119,0.065884,0.734733,...,0.378425,0.347020,0.812647,0.487845,0.297422,0.516562,0.304545,0.488694,0.266055,0.000000
2,0.805992,0.015907,0.351573,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.551268,...,0.000000,0.661508,0.000000,0.605841,0.802402,0.681187,0.000000,0.000000,0.000000,0.000000
3,0.632745,0.442018,0.112515,0.506412,0.657094,0.628180,0.554082,0.738973,0.764555,0.726907,...,0.608894,0.586006,0.696437,0.427761,0.681078,0.672670,0.536335,0.451429,0.675192,0.527422
4,0.681206,0.236664,0.428852,0.420482,0.000000,0.000000,0.000000,0.000000,0.267296,0.625472,...,0.423746,0.111500,0.000000,0.622586,0.000000,0.611686,0.000000,0.502279,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6802,0.395655,0.749851,0.385655,0.000000,0.483893,0.697602,0.000000,0.000000,0.000000,0.747306,...,0.000000,0.455800,0.000000,0.420597,0.000000,0.407697,0.830038,0.741090,0.606057,0.000000
6803,0.000000,0.384173,0.265556,0.000000,0.000000,0.000000,0.000000,0.000000,0.444041,0.748770,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.557804,0.859781,0.868549,0.625063,0.571023
6804,0.430313,0.090124,0.605318,0.000000,0.000000,0.000000,0.000000,0.000000,0.520718,0.162545,...,0.000000,0.680744,0.128438,0.432505,0.656399,0.424025,0.275454,0.148875,0.000000,0.000000
6805,0.000000,0.000000,0.000000,0.621492,0.000000,0.649158,0.808326,0.000000,0.589434,0.000000,...,0.625170,0.000000,0.000000,0.000000,0.000000,0.491346,0.146496,0.000000,0.000000,0.000000


## Define model variables - COMMENT ON EACH ONE TO DESCRIBE

In [9]:
# Model params
# Length of each noise vector fed to the generator (generator input size)
LATENT_VARIABLE_SIZE = 100
# Units in the generator's first and second dense layers
GEN_L1_DENSE_SIZE = 600
GEN_L2_DENSE_SIZE = 600
# Units in the generator's last dense layer: one output value per gene
GEN_L3_DENSE_SIZE = num_genes

# Length of each discriminator input vector: one value per gene
DIS_INPUT_SIZE = num_genes
# Units in the discriminator's first and second dense layers
DIS_L1_DENSE_SIZE = 200
DIS_L2_DENSE_SIZE = 200

# Latent noise is |normal(mean=0, stddev=NOISE_STDEV) + poisson(lam=POISSON_LAM)|
NOISE_STDEV = 0.1
POISSON_LAM = 1

# Training params
# Real cell profiles per training batch
TRAIN_BATCH_SIZE = 10
# Shuffle buffer size for the training dataset
TRAIN_BUFFER_SIZE = 10000
# Batch size and shuffle buffer size for the test dataset
TEST_BATCH_SIZE = 500
TEST_BUFFER_SIZE = 500
# Noise vectors (i.e. generated profiles) drawn in each training step
GEN_BATCH_SIZE = 10
# Number of passes over the training dataset
EPOCHS = 1000

# Number of profiles generated for each example export
EX_GEN_BATCH_SIZE = 500

# Learning rate passed to both Adam optimizers (alternative value kept for reference)
#LEARNING_RATE = 0.001
LEARNING_RATE = 1e-5

## Create training and test datasets

Create tensors from the training data - convert to float32 so it works well on the GPU, then shuffle and batch

In [10]:
# Transpose so each dataset element is one cell (a vector with one value per gene),
# cast to float32, then shuffle and batch
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices(df_training_data.T.values.astype('float32'))
    .shuffle(TRAIN_BUFFER_SIZE)
    .batch(TRAIN_BATCH_SIZE)
)
print(train_dataset)

<BatchDataset shapes: (None, 6807), types: tf.float32>


In [11]:
# Same pipeline as the training dataset, using the test matrix and test sizes
test_dataset = (
    tf.data.Dataset
    .from_tensor_slices(df_test_data.T.values.astype('float32'))
    .shuffle(TEST_BUFFER_SIZE)
    .batch(TEST_BATCH_SIZE)
)
print(test_dataset)

<BatchDataset shapes: (None, 6807), types: tf.float32>


## Define GANN model

Define function for constructing the generator

In [12]:
def create_generator():
    """Build the generator network.

    The model takes a noise vector of length LATENT_VARIABLE_SIZE and passes
    it through three bias-free Dense layers (GEN_L1_DENSE_SIZE,
    GEN_L2_DENSE_SIZE and GEN_L3_DENSE_SIZE units), each followed by a
    LeakyReLU activation. No batch normalisation is applied.

    Returns:
        tf.keras.Sequential: the (uncompiled) generator model.
    """
    return tf.keras.Sequential([
        # L1: latent vector -> first hidden layer
        layers.Dense(GEN_L1_DENSE_SIZE, use_bias=False, input_shape=(LATENT_VARIABLE_SIZE,)),
        layers.LeakyReLU(),

        # L2: second hidden layer
        layers.Dense(GEN_L2_DENSE_SIZE, use_bias=False),
        layers.LeakyReLU(),

        # L3: output layer, GEN_L3_DENSE_SIZE values per generated profile
        layers.Dense(GEN_L3_DENSE_SIZE, use_bias=False),
        layers.LeakyReLU(),
    ])

Define function for constructing discriminator

In [13]:
def create_discriminator():
    """Build the discriminator network.

    The model takes a vector of length DIS_INPUT_SIZE and passes it through
    two bias-free Dense layers (DIS_L1_DENSE_SIZE and DIS_L2_DENSE_SIZE
    units), each followed by LeakyReLU, then a Flatten and a single-unit
    Dense layer with no activation. No dropout is applied.

    Returns:
        tf.keras.Sequential: the (uncompiled) discriminator model, producing
        one un-activated score per input row.
    """
    return tf.keras.Sequential([
        # L1
        layers.Dense(DIS_L1_DENSE_SIZE, use_bias=False, input_shape=(DIS_INPUT_SIZE,)),
        layers.LeakyReLU(),

        # L2
        layers.Dense(DIS_L2_DENSE_SIZE, use_bias=False),
        layers.LeakyReLU(),

        # L3: single output score
        layers.Flatten(),
        layers.Dense(1),
    ])

Define the noise generation function

In [14]:
def gen_noise(batch_size):
    """Draw a batch of latent vectors for the generator.

    Each entry is the absolute value of a normal sample (mean 0, standard
    deviation NOISE_STDEV) plus a Poisson sample (rate POISSON_LAM), so the
    result is non-negative.

    Args:
        batch_size: number of latent vectors to draw.

    Returns:
        Tensor of shape [batch_size, LATENT_VARIABLE_SIZE].
    """
    latent_shape = [batch_size, LATENT_VARIABLE_SIZE]
    gaussian_part = tf.random.normal(latent_shape, mean=0.0, stddev=NOISE_STDEV)
    poisson_part = tf.random.poisson(latent_shape, lam=POISSON_LAM)
    return tf.abs(gaussian_part + poisson_part)

Define the loss functions

In [15]:
# Shared binary cross-entropy loss object, called as cross_entropy(labels, predictions)
# by the generator/discriminator losses and by the test-loss computation.
# from_logits=True: predictions are treated as raw logits rather than probabilities.
cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

In [16]:
def discriminator_loss(real_output, fake_output):
    """Discriminator loss: real samples should score as 1, generated ones as 0.

    Args:
        real_output: discriminator outputs for real cell profiles.
        fake_output: discriminator outputs for generated profiles.

    Returns:
        Sum of the cross-entropy on the real outputs (against all-ones labels)
        and on the fake outputs (against all-zeros labels).
    """
    labels_for_real = tf.ones_like(real_output)
    labels_for_fake = tf.zeros_like(fake_output)
    return (cross_entropy(labels_for_real, real_output)
            + cross_entropy(labels_for_fake, fake_output))

In [17]:
def generator_loss(fake_output):
    """Generator loss: generated samples should be scored as real (label 1).

    Args:
        fake_output: discriminator outputs for generated profiles.

    Returns:
        Cross-entropy of fake_output against all-ones labels.
    """
    wanted_labels = tf.ones_like(fake_output)
    return cross_entropy(wanted_labels, fake_output)

Util functions

In [18]:
def data_frame_from_gen(profile, label, gene_names=None, target_max=None):
    """Format a batch of generated profiles as a gene-indexed DataFrame.

    The batch is transposed to genes x cells, labelled with gene names,
    shifted so its smallest value is 0 and scaled so its largest value equals
    ``target_max``. The minimum and maximum are taken over the whole batch,
    not per cell.

    Args:
        profile: generated batch of shape (cells, genes); either an object
            exposing ``.numpy()`` (e.g. a tensor) or a plain array-like.
        label: prefix for the output column names; columns are named
            ``label + str(cell_index)``.
        gene_names: DataFrame with a ``gene_name`` column, one row per gene,
            joined to the profiles on the row index. Defaults to the
            notebook-level ``df_gene_names``.
        target_max: value the batch maximum is rescaled to. Defaults to the
            notebook-level ``nonorm_max``.

    Returns:
        DataFrame indexed by gene name with one column per generated cell.
        If every value in the batch is identical the result is all zeros
        (there is no range to rescale).
    """
    if gene_names is None:
        gene_names = df_gene_names
    if target_max is None:
        target_max = nonorm_max

    # Use the argument that was passed in (not a notebook global), and accept
    # both tensors and plain arrays
    values = profile.numpy() if hasattr(profile, 'numpy') else np.asarray(profile)

    # genes x cells, with gene names attached by row index and used as the index
    df_gen_prof = gene_names.join(pd.DataFrame(values).T, how='inner')
    df_gen_prof = df_gen_prof.set_index('gene_name')
    df_gen_prof = df_gen_prof.add_prefix(label)

    # Shift so the smallest value in the batch becomes 0
    df_gen_prof = df_gen_prof - df_gen_prof.min().min()

    # Scale so the largest value in the batch becomes target_max; skip when the
    # range is zero to avoid a 0/0 division that would fill the frame with NaN
    gen_max = df_gen_prof.max().max()
    if gen_max > 0:
        df_gen_prof = df_gen_prof / gen_max * target_max

    return df_gen_prof

## Define the training loops

In [19]:
# Input is a batch of real cell profiles from the training set
# @tf.function
def train_step(cell_profiles):
    """Run one adversarial update of both networks on a batch of real profiles.

    GEN_BATCH_SIZE noise vectors are drawn regardless of how many real
    profiles are in ``cell_profiles``. Both losses are computed from the same
    forward pass, recorded in the running-mean metrics, and then each
    network's variables are updated by its own optimizer.

    Args:
        cell_profiles: batch of real cell profiles for the discriminator.

    Returns:
        None.
    """
    latent_batch = gen_noise(GEN_BATCH_SIZE)

    # One tape per network so each loss is differentiated independently
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        fake_profiles = generator(latent_batch, training=True)

        verdict_on_real = discriminator(cell_profiles, training=True)
        verdict_on_fake = discriminator(fake_profiles, training=True)

        gen_loss = generator_loss(verdict_on_fake)
        disc_loss = discriminator_loss(verdict_on_real, verdict_on_fake)

    # Accumulate this step's losses into the per-epoch means
    met_gen_loss(gen_loss)
    met_disc_loss(disc_loss)

    gen_vars = generator.trainable_variables
    disc_vars = discriminator.trainable_variables

    # Compute both gradient sets before applying either update
    gen_grads = gen_tape.gradient(gen_loss, gen_vars)
    disc_grads = disc_tape.gradient(disc_loss, disc_vars)

    generator_optimizer.apply_gradients(zip(gen_grads, gen_vars))
    discriminator_optimizer.apply_gradients(zip(disc_grads, disc_vars))

## Create GANN model

Create generator and discriminator

In [20]:
# Instantiate both networks; train_step and the cells below refer to these
# notebook-level names directly
generator = create_generator()
discriminator = create_discriminator()

Define optimizer

In [21]:
# One Adam optimizer per network, configured identically (same LEARNING_RATE,
# betas and epsilon); train_step applies each to its own network's variables
generator_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-07)
discriminator_optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE, beta_1=0.9, beta_2=0.999, epsilon=1e-07)

## Create checkpoints

In [22]:
# Checkpoints are written under ./training_checkpoints with the file prefix "ckpt"
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
# Track both models and both optimizers so they are saved together
checkpoint = tf.train.Checkpoint(generator_optimizer=generator_optimizer,
                                 discriminator_optimizer=discriminator_optimizer,
                                 generator=generator,
                                 discriminator=discriminator)

## Generate from noise before training to check the network

In [23]:
# Generate a single test set
noise = gen_noise(EX_GEN_BATCH_SIZE)
generated_profile = generator(noise, training=False)
df_gen_prof_1 = data_frame_from_gen(generated_profile, 'gencell_ep0_')

# Vis
df_gen_prof_1

# Save to file
#df_gen_prof_1.to_csv(data_path + '/gen_prof_pre.csv')

Unnamed: 0_level_0,gencell_ep0_0,gencell_ep0_1,gencell_ep0_2,gencell_ep0_3,gencell_ep0_4,gencell_ep0_5,gencell_ep0_6,gencell_ep0_7,gencell_ep0_8,gencell_ep0_9,...,gencell_ep0_490,gencell_ep0_491,gencell_ep0_492,gencell_ep0_493,gencell_ep0_494,gencell_ep0_495,gencell_ep0_496,gencell_ep0_497,gencell_ep0_498,gencell_ep0_499
gene_name,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Rfc3,7.200466,5.575809,12.312920,6.961092,5.385093,4.401385,3.806179,3.407456,4.484842,6.373935,...,4.687148,5.135033,4.271653,4.040664,4.002931,6.974885,7.373641,6.313165,4.716501,6.181706
Cd47,8.052140,7.339159,8.263481,6.745177,5.466942,6.188019,7.103613,9.825594,8.034950,6.630358,...,7.174108,8.182070,9.142796,8.125615,7.295401,7.466935,7.011714,7.556933,7.607570,5.677191
Elmo2,3.237847,3.383047,3.637261,3.051248,5.128806,2.605473,3.961656,4.540101,2.977164,5.189309,...,3.335412,3.036548,4.590815,4.364906,3.108225,3.692504,2.766385,3.138972,3.358127,4.026422
Crip2,2.770261,2.115549,3.748849,3.183826,3.041842,2.651016,3.118462,2.995350,2.939400,3.137265,...,3.100469,2.801795,2.937439,3.132435,2.637694,2.486814,2.349024,3.635562,3.221894,3.532440
Pprc1,5.994354,5.729683,5.641799,4.600624,2.712652,4.404141,5.381550,5.337330,4.834723,3.129318,...,2.956130,8.722386,3.236373,5.171160,4.491530,4.921404,4.775207,4.837671,3.328127,3.128874
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Hprt,4.051755,4.328464,2.921821,2.985272,3.123294,3.141180,3.251162,5.207484,3.314775,3.151452,...,3.210772,2.763375,4.190492,2.946907,2.913033,2.883234,2.293713,2.466670,2.934083,3.647069
Atraid,5.513589,7.004598,3.636117,5.362860,7.248859,4.891181,3.454343,3.712314,5.088841,3.947650,...,3.895094,6.636585,5.725394,5.100912,3.328512,3.011475,5.242088,5.681060,5.035285,5.009675
Chek2,4.779669,3.237050,4.097553,3.271571,3.290487,2.927680,2.616612,2.800702,4.551406,3.382791,...,2.780929,3.065887,2.927438,3.475044,2.579170,2.624908,3.021105,2.682960,2.880622,3.159295
Pink1,3.337079,5.231534,3.082321,4.477653,5.481628,2.865719,3.179046,3.526146,6.183165,3.112204,...,4.816295,3.476117,6.467189,2.936959,3.918249,3.364490,5.114668,3.158574,3.159046,4.901445


## Train the GANN

Define tensorboard metrics

In [24]:
# Running means of the losses; updated during training/testing, then read and
# reset once per epoch by the training loop
met_gen_loss = tf.keras.metrics.Mean('gen_loss', dtype=tf.float32)
met_disc_loss = tf.keras.metrics.Mean('disc_loss', dtype=tf.float32)
met_test_loss = tf.keras.metrics.Mean('test_loss', dtype=tf.float32)

Create log directories

In [25]:
# Timestamp in the path gives each run of this cell its own log sub-directory
current_time = datetime.datetime.now().strftime("%Y%m%d-%H%M%S")

# One directory per logged series, plus 'all' for the combined view
gen_log_dir = f'logs/gradient_tape/{current_time}/gen_train'
disc_log_dir = f'logs/gradient_tape/{current_time}/disc_train'
test_log_dir = f'logs/gradient_tape/{current_time}/disc_test'
all_log_dir = f'logs/gradient_tape/{current_time}/all'

# Summary writers used by the training loop
all_summary_writer = tf.summary.create_file_writer(all_log_dir)
gen_summary_writer = tf.summary.create_file_writer(gen_log_dir)
disc_summary_writer = tf.summary.create_file_writer(disc_log_dir)
test_summary_writer = tf.summary.create_file_writer(test_log_dir)

Run the training model

In [26]:
print('Running...')

for epoch in range(EPOCHS):

    # Every 10th epoch (including epoch 0, before any update in this loop):
    # save a checkpoint and export a batch of generated example profiles
    if epoch % 10 == 0:
        checkpoint.save(file_prefix = checkpoint_prefix)

        # Generate a profile set
        noise = gen_noise(EX_GEN_BATCH_SIZE)
        generated_profile = generator(noise, training=False)
        df_gen_prof = data_frame_from_gen(generated_profile, 'gencell_ep' + str(epoch) + '_')
        df_gen_prof.to_csv(data_path + '/gen_prof_' + str(epoch) + '.csv')

    # Train the epoch: one adversarial update per batch of real profiles
    for data_batch in train_dataset:
        train_step(data_batch)

    # Run test data through the discriminator. The loss is accumulated inside
    # the loop so that every test batch contributes to the epoch's test loss,
    # not only the last one.
    for data_batch in test_dataset:
        test_decision = discriminator(data_batch, training=False)
        test_loss = cross_entropy(tf.ones_like(test_decision), test_decision)
        met_test_loss(test_loss)

    # Read each epoch mean once and reuse it for every log target
    epoch_gen_loss = met_gen_loss.result()
    epoch_disc_loss = met_disc_loss.result()
    epoch_test_loss = met_test_loss.result()

    # Log metrics
    with all_summary_writer.as_default():
        tf.summary.scalar('2_gen_loss', epoch_gen_loss, step=epoch)
        tf.summary.scalar('3_disc_loss', epoch_disc_loss, step=epoch)
        tf.summary.scalar('3_test_loss', epoch_test_loss, step=epoch)

    with gen_summary_writer.as_default():
        tf.summary.scalar('1_loss', epoch_gen_loss, step=epoch)

    with disc_summary_writer.as_default():
        tf.summary.scalar('1_loss', epoch_disc_loss, step=epoch)

    with test_summary_writer.as_default():
        tf.summary.scalar('1_loss', epoch_test_loss, step=epoch)

    # Log stats
    template = 'Epoch {}, Gen_loss: {}, Disc_loss: {}, Test_loss: {}'
    print (template.format(epoch+1,
                           epoch_gen_loss,
                           epoch_disc_loss,
                           epoch_test_loss))

    # Reset metrics every epoch so each logged value is a per-epoch mean
    met_gen_loss.reset_states()
    met_disc_loss.reset_states()
    met_test_loss.reset_states()

# Generate a final profile set once all epochs have run
noise = gen_noise(EX_GEN_BATCH_SIZE)
generated_profile = generator(noise, training=False)
df_gen_prof = data_frame_from_gen(generated_profile, 'gencell_ep' + str(EPOCHS) + '_')
df_gen_prof.to_csv(data_path + '/gen_prof_' + str(EPOCHS) + '.csv')

Running...
Epoch 1, Gen_loss: 0.23448997735977173, Disc_loss: 1.8714780807495117, Test_loss: 0.17600056529045105
Epoch 2, Gen_loss: 0.24080656468868256, Disc_loss: 1.8279056549072266, Test_loss: 0.48494765162467957
Epoch 3, Gen_loss: 0.32508137822151184, Disc_loss: 2.1076934337615967, Test_loss: 0.941792905330658
Epoch 4, Gen_loss: 0.4572354257106781, Disc_loss: 2.0163793563842773, Test_loss: 0.9000186324119568
Epoch 5, Gen_loss: 0.5963371992111206, Disc_loss: 1.6074954271316528, Test_loss: 0.7591283321380615
Epoch 6, Gen_loss: 0.5932952761650085, Disc_loss: 1.4658998250961304, Test_loss: 0.5436370372772217
Epoch 7, Gen_loss: 0.7057238221168518, Disc_loss: 1.2379900217056274, Test_loss: 0.576341986656189
Epoch 8, Gen_loss: 0.8157938718795776, Disc_loss: 1.1121573448181152, Test_loss: 0.511326789855957
Epoch 9, Gen_loss: 0.8733988404273987, Disc_loss: 1.1379355192184448, Test_loss: 0.6946274638175964
Epoch 10, Gen_loss: 0.8835011124610901, Disc_loss: 1.3008828163146973, Test_loss: 0.785

KeyboardInterrupt: 

Start tensorboard

In [None]:
#%tensorboard --logdir logs/gradient_tape --host localhost --port 6006