# GANN - CHECK SPELLING

## Setup

Load modules

In [53]:
from __future__ import absolute_import, division, print_function, unicode_literals

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import MinMaxScaler

Setup paths

In [13]:
data_path = '../data'
feature_path = data_path + '/tpm_combined.csv'
gene_name_path = data_path + '/tpm_combined_rows.csv'
cell_name_path = data_path + '/tpm_combined_cols.csv'

## Load data

Load datasets into frames

In [30]:
df_gene_names = pd.read_csv(gene_name_path)
df_cell_names = pd.read_csv(cell_name_path)
df_training_data = pd.read_csv(feature_path, header=None)

The number of genes in the input dataset determines the generator output as well as the dicriminator inputs

In [29]:
num_genes = df_gene_names.shape[0]
num_genes

17138

Take a look at the training data

In [37]:
df_training_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17139 entries, 0 to 17138
Columns: 380 entries, 0 to 379
dtypes: float64(380)
memory usage: 49.7 MB


In [78]:
df_training_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,370,371,372,373,374,375,376,377,378,379
0,4.217231,3.003602,4.209453,0.000000,5.296824,5.300856,5.587965,3.826803,3.414136,4.888013,...,5.532005,2.744161,3.887525,4.078097,4.351911,2.073820,2.733354,4.414812,4.147307,5.111449
1,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,0.000000,3.181103,0.000000,0.000000,0.000000,1.827819,0.000000
2,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,6.821838,0.000000,0.000000,...,0.000000,2.963474,0.000000,2.361768,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000
3,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.835924,...,1.304511,0.000000,0.000000,0.000000,0.000000,0.000000,0.485427,0.000000,1.214125,0.000000
4,3.109361,5.938286,5.093391,0.000000,0.000000,2.693766,4.627607,6.537141,3.842979,2.786596,...,3.587365,4.416840,4.006298,4.531069,4.925999,0.000000,4.777683,6.796753,4.678635,3.206331
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17134,2.769772,6.381975,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,3.638074,3.882643,...,0.000000,0.000000,3.676944,0.000000,0.000000,4.687061,2.220330,0.000000,0.000000,0.000000
17135,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,4.723012,0.000000,0.000000,0.000000,...,0.000000,4.912650,0.000000,0.000000,0.000000,0.000000,3.895303,0.000000,0.000000,0.000000
17136,0.000000,5.440952,2.097611,0.000000,0.000000,3.385431,3.339137,0.000000,5.599318,0.000000,...,2.680324,1.464668,0.000000,0.000000,3.822730,5.358959,3.378512,0.000000,4.251719,0.000000
17137,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.000000,0.000000,0.000000,1.636915,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000


Check max values

In [79]:
training_data_max = df_training_data.max()
training_data_max = training_data_max.max()
print(training_data_max)

19.11425046922983


## Pre-process training data

Normalise input data

In [76]:
np_training_data = df_training_data.T.values
scaler = MinMaxScaler()
print(scaler.fit(np_training_data))

# Check which dimension we are fitting to - if we are fitting to gene expression then should be equal to number of genes
print(scaler.data_max_.shape)

MinMaxScaler(copy=True, feature_range=(0, 1))
(17139,)


In [74]:
np_training_data_norm = np.transpose(scaler.transform(np_training_data))
np_training_data_norm.shape

(17139, 380)

Get max values for noise generation

In [80]:
training_data_max = np_training_data_norm.max()
training_data_max = training_data_max.max()
print(training_data_max)

1.0000000000000002


## Define model variables - COMMENT ON EACH ONE TO DESCRIBE

In [81]:
# Model params
LATENT_VARIABLE_SIZE = 100
GEN_L1_DENSE_SIZE = 600
GEN_L2_DENSE_SIZE = 600
GEN_L3_DENSE_SIZE = num_genes

DIS_INPUT_SIZE = num_genes
DIS_L1_DENSE_SIZE = 200
DIS_L2_DENSE_SIZE = 200

NOISE_STDEV = training_data_max / 10

# Training params
TRAIN_BATCH_SIZE = 1
GEN_BATCH_SIZE = 1
BUFFER_SIZE = 1
EPOCHS = 50

In [82]:
print(NOISE_STDEV)

0.10000000000000002


## Create training dataset

Create tensors from training data - Convert to Int32 for better work on GPU with batch and shuffle

In [54]:
train_dataset = tf.data.Dataset.from_tensor_slices(df_training_data.values.astype('float32')).shuffle(BUFFER_SIZE).batch(TRAIN_BATCH_SIZE)
print(train_dataset)

<BatchDataset shapes: (None, 380), types: tf.float32>


## Define GANN model

Define function for contructing the generator

In [20]:
def create_generator():
    model = tf.keras.Sequential()
    
    #L1
    model.add(layers.Dense(GEN_L1_DENSE_SIZE, use_bias=False, input_shape=(LATENT_VARIABLE_SIZE,)))
    #model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    assert model.output_shape == (None, GEN_L1_DENSE_SIZE, 1)  # Note: None is the batch size
    
    #L2
    model.add(layers.Dense(GEN_L2_DENSE_SIZE, use_bias=False))
    #model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    assert model.output_shape == (None, GEN_L2_DENSE_SIZE, 1)
    
    #L3
    model.add(layers.Dense(GEN_L3_DENSE_SIZE, use_bias=False))
    #model.add(layers.BatchNormalization())
    model.add(layers.LeakyReLU())
    assert model.output_shape == (None, GEN_L3_DENSE_SIZE, 1)
    
    return model

Define function for constructing discriminator

In [22]:
def create_discriminator():
    model = tf.keras.Sequential()
    
    #L1
    model.add(layers.Dense(DIS_L1_DENSE_SIZE, use_bias=False, input_shape=(DIS_INPUT_SIZE,)))
    model.add(layers.LeakyReLU())
    #model.add(layers.Dropout(0.3))
    
    #L2
    model.add(layers.Dense(DIS_L2_DENSE_SIZE, use_bias=False))
    model.add(layers.LeakyReLU())
    #model.add(layers.Dropout(0.3))
    
    #L3
    model.add(layers.Flatten())
    model.add(layers.Dense(1))
    
    return model

Define the noise generation function

In [88]:
def gen_noise():
    # Create some random noise for the generator
    n_noise = tf.random.normal([GEN_BATCH_SIZE, LATENT_VARIABLE_SIZE], mean=0.0, stddev=NOISE_STDEV)
    p_noise = tf.random.poisson([GEN_BATCH_SIZE, LATENT_VARIABLE_SIZE])
    return tf.abs(n_noise + p_noise)

Define the loss functions

In [89]:
def discriminator_loss(real_output, fake_output):
    

SyntaxError: unexpected EOF while parsing (<ipython-input-89-bf8b101fe703>, line 2)

In [90]:
def generator_loss(fake_output):

SyntaxError: unexpected EOF while parsing (<ipython-input-90-c5f045404d98>, line 1)

## Define the training loop

In [87]:
# Input is a batch of real cell profiles from the training set
# @tf.function
def train_step(cell_profiles):
    noise = gen_noise()
    
    with tf.GradientTape() as gen_tape, tf.GradientTape() as disc_tape:
        

SyntaxError: unexpected EOF while parsing (<ipython-input-87-6f8021bb3d9a>, line 7)

## Create GANN model

Create generator and discriminator

In [None]:
generator = create_generator()
discriminator = create_discriminator()

## Generate from test data to check network

## Train the GANN