# GAN (Generative Adversarial Network)

## Setup

Load modules

In [1]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from sklearn.preprocessing import MinMaxScaler
import time

Setup paths

In [2]:
# Root folder for the input files; each file path is the root joined with the
# file name using a forward slash.
data_path = '../data'
feature_path = '/'.join([data_path, 'tpm_combined.csv'])
gene_name_path = '/'.join([data_path, 'tpm_combined_rows.csv'])
cell_name_path = '/'.join([data_path, 'tpm_combined_cols.csv'])

## Load data

Load datasets into frames and check all the shapes match up

In [None]:
# None of the three CSV files has a header row, so every line is read as data.
df_training_data = pd.read_csv(feature_path, header=None)
df_gene_names = pd.read_csv(gene_name_path, header=None)
df_cell_names = pd.read_csv(cell_name_path, header=None)

# Print the three shapes, one per line, so they can be compared by eye.
print(
    df_gene_names.shape,
    df_cell_names.shape,
    df_training_data.shape,
    sep='\n',
)

The number of genes in the input dataset determines the size of the generator output as well as the size of the discriminator input

In [None]:
# The row count of the gene-name frame is taken as the number of genes.
num_genes = len(df_gene_names)
df_gene_names.shape

Take a look at the training data

In [None]:
# Print pandas' summary of the training frame.
df_training_data.info()

In [None]:
# Bare expression: the notebook renders the frame as the cell output.
df_training_data

Check max values

In [None]:
# Per-column maxima first, then the maximum of those: the largest raw value.
training_data_max = df_training_data.max().max()
print(training_data_max)

## Pre-process training data

Normalise input data

In [None]:
# Work on the transposed frame as a plain NumPy array.
np_training_data = df_training_data.transpose().values

# fit() returns the scaler itself, so building and fitting can be chained;
# printing it shows the fitted estimator.
scaler = MinMaxScaler().fit(np_training_data)
print(scaler)

# The scaler keeps one maximum per column of the array it was fitted on.
# If that axis is gene expression, this shape should equal the number of genes.
print(scaler.data_max_.shape)

In [None]:
# Scale with the fitted scaler, then flip back to the orientation of the
# original frame.
np_training_data_norm = scaler.transform(np_training_data).T
np_training_data_norm.shape

Get max values for noise generation

In [None]:
# Largest value in the normalised matrix; NOISE_STDEV is derived from it.
training_data_max = np.max(np_training_data_norm)
print(training_data_max)

## Define model variables - COMMENT ON EACH ONE TO DESCRIBE

In [None]:
# Model params
# Length of the latent (noise) vector fed to the generator.
LATENT_VARIABLE_SIZE = 100
# Units in the generator's first and second Dense layers.
GEN_L1_DENSE_SIZE = 600
GEN_L2_DENSE_SIZE = 600
# Units in the generator's last Dense layer: one output per gene.
GEN_L3_DENSE_SIZE = num_genes

# Length of each vector the discriminator receives: one input per gene.
DIS_INPUT_SIZE = num_genes
# Units in the discriminator's first and second Dense layers.
DIS_L1_DENSE_SIZE = 200
DIS_L2_DENSE_SIZE = 200

# Standard deviation of the Gaussian part of the latent noise,
# set to a tenth of the largest normalised training value.
NOISE_STDEV = training_data_max / 10
# Rate (lam) of the Poisson part of the latent noise.
POISSON_LAM = 1

# Training params
# Number of real profiles per batch of the training dataset.
TRAIN_BATCH_SIZE = 10
# Number of latent vectors (and so generated profiles) per gen_noise() call.
GEN_BATCH_SIZE = 20
# Size of the shuffle buffer used when building the training dataset.
BUFFER_SIZE = 10000
# Number of passes over the training dataset.
EPOCHS = 50
# Learning rate handed to both Adam optimizers.
LEARNING_RATE = 0.001

In [None]:
# Show the standard deviation that gen_noise() passes to tf.random.normal.
print(NOISE_STDEV)

## Create training dataset

Create tensors from the training data - convert to float32 (which works better on the GPU), then shuffle and batch

In [None]:
# Train on the min-max normalised profiles instead of the raw frame: the
# normalised matrix was computed above but never used, while NOISE_STDEV is
# derived from the normalised maximum, so the two scales did not match.
# np_training_data_norm was transposed back after scaling, so transpose it
# again to get the same row layout the raw `df_training_data.T.values` had
# (from_tensor_slices yields one dataset element per row).
train_dataset = (
    tf.data.Dataset
    .from_tensor_slices(np_training_data_norm.T.astype('float32'))
    .shuffle(BUFFER_SIZE)
    .batch(TRAIN_BATCH_SIZE)
)
print(train_dataset)

## Define GAN model

Define function for constructing the generator

In [None]:
def create_generator():
    """Build the generator network.

    The model is three bias-free Dense layers, each followed by a LeakyReLU.
    It takes a latent vector of length LATENT_VARIABLE_SIZE and its last
    Dense layer has GEN_L3_DENSE_SIZE units.

    Returns:
        An uncompiled tf.keras.Sequential model.
    """
    # No BatchNormalization is placed between a Dense layer and its
    # activation (it is left disabled).
    return tf.keras.Sequential([
        # Block 1: also declares the latent input shape.
        layers.Dense(
            GEN_L1_DENSE_SIZE,
            use_bias=False,
            input_shape=(LATENT_VARIABLE_SIZE,),
        ),
        layers.LeakyReLU(),
        # Block 2
        layers.Dense(GEN_L2_DENSE_SIZE, use_bias=False),
        layers.LeakyReLU(),
        # Block 3: output layer.
        layers.Dense(GEN_L3_DENSE_SIZE, use_bias=False),
        layers.LeakyReLU(),
    ])

Define function for constructing discriminator

In [None]:
def create_discriminator():
    """Build the discriminator network.

    The model is two bias-free Dense layers, each followed by a LeakyReLU,
    then a Flatten and a single-unit Dense layer with no activation. It takes
    vectors of length DIS_INPUT_SIZE.

    Returns:
        An uncompiled tf.keras.Sequential model producing one raw score per
        input row.
    """
    # No Dropout is applied after the activations (it is left disabled).
    return tf.keras.Sequential([
        # Block 1: also declares the input shape.
        layers.Dense(
            DIS_L1_DENSE_SIZE,
            use_bias=False,
            input_shape=(DIS_INPUT_SIZE,),
        ),
        layers.LeakyReLU(),
        # Block 2
        layers.Dense(DIS_L2_DENSE_SIZE, use_bias=False),
        layers.LeakyReLU(),
        # Block 3: flatten, then a single unbounded score.
        layers.Flatten(),
        layers.Dense(1),
    ])

Define the noise generation function

In [None]:
def gen_noise():
    """Sample one batch of latent vectors for the generator.

    Gaussian noise (mean 0, standard deviation NOISE_STDEV) is added
    element-wise to Poisson noise (rate POISSON_LAM), and the absolute value
    of the sum is returned, so no entry is negative.

    Returns:
        A tensor of shape [GEN_BATCH_SIZE, LATENT_VARIABLE_SIZE].
    """
    shape = [GEN_BATCH_SIZE, LATENT_VARIABLE_SIZE]
    gaussian_part = tf.random.normal(shape, mean=0.0, stddev=NOISE_STDEV)
    poisson_part = tf.random.poisson(shape, lam=POISSON_LAM)
    return tf.abs(gaussian_part + poisson_part)

Define the loss functions

In [None]:
def discriminator_loss(real_output, fake_output):
    """Wasserstein-style loss for the discriminator (critic).

    Minimising this value raises the mean score of real profiles and lowers
    the mean score of generated ones. That opposes generator_loss, which
    minimises ``-mean(fake_output)`` and so tries to raise the scores of
    generated profiles.

    The previous form, ``mean(real) - mean(fake)``, had the sign flipped: it
    made the discriminator raise the fake scores as well, so the two networks
    were pushing in the same direction instead of competing.

    Args:
        real_output: Discriminator scores for a batch of real profiles.
        fake_output: Discriminator scores for a batch of generated profiles.

    Returns:
        Scalar tensor: mean(fake_output) - mean(real_output).
    """
    # NOTE(review): Wasserstein training normally also constrains the critic
    # (weight clipping or a gradient penalty); nothing of the kind is visible
    # in this file - confirm whether that is intended.
    total_loss = tf.reduce_mean(fake_output) - tf.reduce_mean(real_output)
    return total_loss

In [None]:
def generator_loss(fake_output):
    """Loss for the generator.

    Args:
        fake_output: Discriminator scores for a batch of generated profiles.

    Returns:
        Scalar tensor: the negated mean of ``fake_output``, so minimising it
        raises the scores the discriminator gives to generated profiles.
    """
    mean_fake_score = tf.reduce_mean(fake_output)
    return -mean_fake_score

## Define the training loops

In [None]:
# Runs one optimisation step for both networks on one batch of real cell
# profiles from the training set.
# (The @tf.function decorator is currently switched off.)
def train_step(cell_profiles):
    """Update the generator and the discriminator once.

    A fresh batch of latent noise is turned into generated profiles; the
    discriminator scores both the real and the generated batch; each network's
    loss is differentiated with its own gradient tape and applied through its
    own optimizer.

    Args:
        cell_profiles: Batch of real cell profiles from the training dataset.

    Returns:
        None.
    """
    latent_batch = gen_noise()

    # Two tapes record the same forward pass so each loss can be
    # differentiated independently afterwards.
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        fake_profiles = generator(latent_batch, training=True)

        real_scores = discriminator(cell_profiles, training=True)
        fake_scores = discriminator(fake_profiles, training=True)

        g_loss = generator_loss(fake_scores)
        d_loss = discriminator_loss(real_scores, fake_scores)

    gen_vars = generator.trainable_variables
    disc_vars = discriminator.trainable_variables

    # Compute both gradient sets before either network's weights are changed.
    gen_grads = gen_tape.gradient(g_loss, gen_vars)
    disc_grads = disc_tape.gradient(d_loss, disc_vars)

    generator_optimizer.apply_gradients(zip(gen_grads, gen_vars))
    discriminator_optimizer.apply_gradients(zip(disc_grads, disc_vars))

## Create GAN model

Create generator and discriminator

In [None]:
# Build both networks (generator first, then discriminator).
generator, discriminator = create_generator(), create_discriminator()

Define optimizer

In [None]:
# Each network gets its own Adam optimizer with identical settings.
generator_optimizer = tf.keras.optimizers.Adam(
    learning_rate=LEARNING_RATE,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
)
discriminator_optimizer = tf.keras.optimizers.Adam(
    learning_rate=LEARNING_RATE,
    beta_1=0.9,
    beta_2=0.999,
    epsilon=1e-07,
)

## Generate from random noise to check the network

In [None]:
# Push one batch of noise through the untrained generator and report the
# output shape and value range, one item per line.
noise = gen_noise()
generated_profile = generator(noise, training=False)
print(
    generated_profile.shape,
    generated_profile.numpy().min(),
    generated_profile.numpy().max(),
    sep='\n',
)

# Score the generated batch with the discriminator and report the shape.
decision = discriminator(generated_profile)
print(decision.shape)
# To inspect the raw scores as well: print(decision.numpy())

## Train the GAN

In [None]:
# Train for EPOCHS passes over the training dataset, timing each pass.
for epoch in range(EPOCHS):
    start = time.time()

    print('Running...')

    # One pass over every batch of the training dataset.
    for data_batch in train_dataset:
        train_step(data_batch)

    # Progress marker on every 10th epoch. It is formatted rather than
    # concatenated because `'epoch - ' + epoch` raises TypeError (str + int),
    # and it is emitted once per epoch instead of once per batch.
    # train_step() returns nothing, so there is no loss value to report here.
    if (epoch + 1) % 10 == 0:
        print('epoch - {}'.format(epoch + 1))

    print ('Time for epoch {} is {} sec'.format(epoch + 1, time.time()-start))