In [None]:
import keras
from keras.models import Sequential, Model
from keras.layers import Input, Dense, Activation, Lambda
from keras.callbacks import EarlyStopping
from keras import backend as K
from keras import metrics
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from collections import Counter

In [None]:
# Load the precomputed flux array from disk.
data = np.load('../data/fluxes_ecoli_biomass.npy')
# Work on a cleaned copy: NaNs become 0 (np.nan_to_num also maps +/-inf to
# large finite values); `data` itself is left untouched.
data_no_nan = np.nan_to_num(data)
data.shape

In [None]:
# Merge the first two axes so that each row is one vector of length data.shape[2].
flat_data = data_no_nan.reshape((data.shape[0] * data.shape[1], data.shape[2]))

In [None]:
# Label every flattened row with its position along the second axis of `data`.
# The reshape above flattens in C order, so the labels cycle through
# 0 .. data.shape[1] - 1 once per entry of the first axis.
# np.tile replaces `range(41) * n`, which only works on Python 2 lists and
# hard-coded the length of that axis.
y = np.tile(np.arange(data.shape[1]), data.shape[0])
y.shape

In [None]:
# Reproducible 90/10 split over the rows of flat_data.
np.random.seed(seed=42)
train_ind = np.random.choice(flat_data.shape[0], size=int(0.9 * flat_data.shape[0]), replace=False)
# Held-out rows are every index not drawn for training. np.setdiff1d returns
# them as a sorted array, so their order no longer depends on set iteration.
test_ind = np.setdiff1d(np.arange(flat_data.shape[0]), train_ind)

In [None]:
# Materialise the training and held-out subsets of the features and labels.
X_train = flat_data[train_ind]
y_train = y[train_ind]
X_test = flat_data[test_ind]
y_test = y[test_ind]

In [None]:
def build_ae(X_shape):
    """Build a dense autoencoder plus stand-alone encoder and decoder models.

    The network is X_shape -> 200 -> 100 -> 200 -> X_shape with ReLU hidden
    layers and a sigmoid output, so reconstructions lie in (0, 1).

    Parameters
    ----------
    X_shape : int
        Number of input features.

    Returns
    -------
    ae : Model
        End-to-end autoencoder, compiled with Adadelta and mean squared error.
    encoder : Model
        Maps an input to its `encoding_sz`-dimensional code.
    decoder : Model
        Maps a code back to a reconstruction; shares its layers with `ae`.
    """
    encoding_sz = 100

    input_lay = Input(shape=(X_shape,))
    encoded = Dense(2 * encoding_sz, activation='relu')(input_lay)
    encoded = Dense(encoding_sz, activation='relu')(encoded)

    # Keep handles on the decoding layers so the stand-alone decoder reuses
    # exactly the same (trained) weights as the autoencoder.
    decoder_hidden = Dense(2 * encoding_sz, activation='relu')
    decoder_output = Dense(X_shape, activation='sigmoid')
    decoded = decoder_output(decoder_hidden(encoded))

    ae = Model(input_lay, decoded)
    encoder = Model(input_lay, encoded)

    # The decoder must accept what the encoder emits (encoding_sz values) and
    # run both decoding layers. Feeding a 2 * encoding_sz input to the last
    # layer only made decoder.predict(encoder.predict(X)) fail on a shape
    # mismatch.
    encoded_input = Input(shape=(encoding_sz,))
    decoder = Model(encoded_input, decoder_output(decoder_hidden(encoded_input)))

    ae.compile(optimizer='adadelta', loss='mean_squared_error')
    return ae, encoder, decoder

In [None]:
# Train the plain autoencoder to reconstruct its own input; the held-out rows
# are only used to report a validation loss after each epoch.
autoencoder, encoder, decoder = build_ae(X_train.shape[1])
autoencoder.fit(
    X_train,
    X_train,
    validation_data=(X_test, X_test),
    batch_size=256,
    epochs=30,
    shuffle=True,
)

# Round-trip the held-out rows through the two halves of the network.
encoded_fluxes = encoder.predict(X_test)
decoded_fluxes = decoder.predict(encoded_fluxes)

In [None]:
# The extremes come from the training rows only, so the held-out rows are
# scaled with exactly the same transform.
min_val = np.min(X_train)
max_val = np.max(X_train)


def scale(x):
    """Min-max scale `x` using the training-set extremes.

    Training values map onto [0, 1], which matches the range of the sigmoid
    output layers used by the models in this notebook. The previous formula,
    (x + min_val) / max_val, neither subtracted the minimum nor divided by the
    range.
    """
    value_range = max_val - min_val
    if value_range == 0:
        # Constant training data: avoid dividing by zero, map everything to 0.
        return x - min_val
    return (x - min_val) / value_range

In [None]:
# Apply the training-set scaling to both splits.
X_scale, X_test_scale = scale(X_train), scale(X_test)

In [None]:
encoded_dim1 = 512
encoded_sz = 256
latent_dim = 2
epsilon_std = 1.0
# Take the input width from the scaled training data instead of hard-coding 2500.
X_shape = X_scale.shape[1]


def _sample_latent(args):
    """Reparameterisation trick: mean + exp(log_var / 2) * eps.

    `eps` is Gaussian noise with standard deviation `epsilon_std`. Defined in
    this cell because the notebook's `sampling` helper is only created in a
    later cell, so referencing it from here fails on a fresh kernel.
    """
    mean, log_var = args
    eps = K.random_normal(shape=(K.shape(mean)[0], latent_dim), mean=0.,
                          stddev=epsilon_std)
    return mean + K.exp(log_var / 2) * eps


# Encoder network
x = Input(shape=(X_shape,))
h = Dense(encoded_dim1, activation='relu')(x)
h = Dense(encoded_sz, activation='relu')(h)
z_mean = Dense(latent_dim)(h)
z_log_var = Dense(latent_dim)(h)

# Sample points from latent space
z = Lambda(_sample_latent, output_shape=(latent_dim,))([z_mean, z_log_var])

# Decoder network (layers are kept in variables so they can be re-applied)
decoder_h = Dense(encoded_sz, activation='relu')
decoder_h2 = Dense(encoded_dim1, activation='relu')
decoder_mean = Dense(X_shape, activation='sigmoid')
h_decoded = decoder_h(z)
h_decoded2 = decoder_h2(h_decoded)
x_decoded_mean = decoder_mean(h_decoded2)

# end-to-end autoencoder
vae = Model(x, x_decoded_mean)

# Reconstruction term: despite its name this is a mean squared error scaled by
# the input width (the name is kept because a later cell prints it).
xent_loss = X_shape * metrics.mean_squared_error(x, x_decoded_mean)
# KL divergence between the encoded Gaussian and a standard normal prior.
kl_loss = - 0.5 * K.sum(1 + z_log_var - K.square(z_mean) - K.exp(z_log_var), axis=-1)
vae_loss = K.mean(xent_loss + kl_loss)
# The loss is attached to the model itself, so compile() takes no `loss`.
vae.add_loss(vae_loss)
vae.compile(optimizer='rmsprop')

In [None]:
# Show the symbolic loss tensors. print(...) with a single argument behaves the
# same on Python 2 and Python 3, unlike the bare print statement.
print(xent_loss)
print(kl_loss)
print(vae_loss)

In [None]:
def sampling(args):
    """Draw latent samples with the reparameterisation trick.

    Parameters
    ----------
    args : sequence of two tensors
        `(z_mean, z_log_var)`; presumably both are `(batch, latent_dim)`, since
        the callers in this notebook produce them with Dense(latent_dim).

    Returns
    -------
    tensor
        `z_mean + exp(z_log_var / 2) * epsilon`, where epsilon is Gaussian
        noise whose standard deviation is the module-level `epsilon_std`.
    """
    z_mean, z_log_var = args
    # Size the noise from z_mean itself rather than from the global latent_dim,
    # so the helper cannot drift out of sync with the layer that feeds it.
    batch = K.shape(z_mean)[0]
    dim = K.int_shape(z_mean)[1]
    epsilon = K.random_normal(shape=(batch, dim), mean=0., stddev=epsilon_std)
    return z_mean + K.exp(z_log_var / 2) * epsilon


def build_vae(X_shape, batch_size=100):
    """Build a dense variational autoencoder and its encoder / generator halves.

    The encoder is X_shape -> 512 -> 256 -> (z_mean, z_log_var) and the decoder
    mirrors it back to X_shape with a sigmoid output. The latent width is read
    from the module-level `latent_dim`, and latent samples are drawn by the
    module-level `sampling` function.

    Parameters
    ----------
    X_shape : int
        Number of input features.
    batch_size : int, optional
        Accepted for the caller's convenience; not used inside this function.

    Returns
    -------
    vae : Model
        End-to-end model compiled with RMSprop; its loss (scaled MSE plus KL
        term) is attached through `add_loss`.
    encoder : Model
        Maps inputs to `z_mean`.
    generator : Model
        Maps latent points to reconstructions, sharing the decoder layers.
    """
    hidden_outer = 512
    hidden_inner = 256

    # --- encoder -----------------------------------------------------------
    inputs = Input(shape=(X_shape,))
    enc = Dense(hidden_outer, activation='relu')(inputs)
    enc = Dense(hidden_inner, activation='relu')(enc)
    mu = Dense(latent_dim)(enc)
    log_var = Dense(latent_dim)(enc)

    # --- stochastic latent layer --------------------------------------------
    latent = Lambda(sampling, output_shape=(latent_dim,))([mu, log_var])

    # --- decoder: layer objects are shared by `vae` and `generator` ----------
    dec_layers = [
        Dense(hidden_inner, activation='relu'),
        Dense(hidden_outer, activation='relu'),
        Dense(X_shape, activation='sigmoid'),
    ]

    def decode(tensor):
        # Run a tensor through the three decoder layers in order.
        for layer in dec_layers:
            tensor = layer(tensor)
        return tensor

    reconstruction = decode(latent)
    vae = Model(inputs, reconstruction)

    # Reconstruction (MSE scaled by input width) plus KL regulariser.
    recon_term = X_shape * metrics.mean_squared_error(inputs, reconstruction)
    kl_term = -0.5 * K.sum(1 + log_var - K.square(mu) - K.exp(log_var), axis=-1)
    vae.add_loss(K.mean(recon_term + kl_term))
    vae.compile(optimizer='rmsprop')

    # Inputs -> latent mean.
    encoder = Model(inputs, mu)

    # Latent point -> reconstruction.
    latent_input = Input(shape=(latent_dim,))
    generator = Model(latent_input, decode(latent_input))
    return vae, encoder, generator

In [None]:
# Hyper-parameters; latent_dim and epsilon_std are read as module-level globals
# by build_vae / sampling. (The leftover %%debug cell magic was removed: it ran
# the whole cell under the debugger and blocked non-interactive runs.)
latent_dim = 2
batch_size = 256
epsilon_std = 1.0
vae, encoder, generator = build_vae(X_scale.shape[1], batch_size)
# Stop once the monitored validation loss has not improved for 2 epochs.
es = EarlyStopping(patience=2)
# No targets are passed: the loss was attached with add_loss inside build_vae.
vae.fit(X_scale,
        shuffle=True,
        epochs=10,
        batch_size=batch_size,
        validation_data=(X_test_scale, None),
        callbacks=[es])

In [None]:
# Project the scaled held-out rows through the VAE encoder (it outputs z_mean).
x_test_encoded = encoder.predict(
    X_test_scale,
    batch_size=batch_size,
)

In [None]:
# Display the encoded held-out rows for a quick visual check.
x_test_encoded

In [None]:
def get_rct(df, rct, y_test):
    """Look up column `rct` of `df` for every label in `y_test`.

    Returns a list with one entry per element of `y_test`, in the same order;
    each entry is `df[rct][label]`.
    """
    return [df[rct][ind] for ind in y_test]
# `df` is used here and in the plotting cell below, but the notebook only reads
# the CSV in a later cell, so a fresh Restart & Run All stopped with a
# NameError. Load it before first use (same file and same dropped columns as
# that later cell).
import pandas as pd
df = pd.read_csv('../data/Karim_MetEng_2018_Figure2_Data.csv').drop(
    columns=['Area_1', 'Area_2', 'Conc_1', 'Conc_2'])
get_rct(df, 'Glucose', y_test)

In [None]:
# Discrete 'plasma' palette with 41 levels, one per variant label in y.
# plt.get_cmap is used because matplotlib.cm.get_cmap was deprecated and has
# been removed from recent matplotlib releases.
cmap = plt.get_cmap('plasma', 41)

# Shared axis limits: pad the observed latent range by half its width on each
# side so that every figure uses the same frame.
xmin, xmax = np.amin(x_test_encoded[:, 0]), np.amax(x_test_encoded[:, 0])
ymin, ymax = np.amin(x_test_encoded[:, 1]), np.amax(x_test_encoded[:, 1])
x_diff = (xmax - xmin) / 2.0
y_diff = (ymax - ymin) / 2.0


def plot_latent(colors, title):
    """Scatter the encoded held-out rows in a 10x10 figure, coloured by `colors`."""
    plt.figure(figsize=(10, 10))
    plt.scatter(x_test_encoded[:, 0], x_test_encoded[:, 1], c=colors, cmap=cmap)
    plt.xlim((xmin - x_diff, xmax + x_diff))
    plt.ylim((ymin - y_diff, ymax + y_diff))
    plt.xlabel('latent dimension 1')
    plt.ylabel('latent dimension 2')
    plt.title(title)
    plt.colorbar()
    plt.show()


# One figure per column of df from the fifth onward, coloured by that column's
# value for the variant label of each held-out row.
for col in df.columns[4:]:
    plot_latent(get_rct(df, col, y_test), col)

# Finally colour the same embedding by the variant label itself.
plot_latent(y_test, 'Variant')

In [None]:
import pandas as pd

# Read the figure data and discard the four Area_* / Conc_* columns in one
# chained expression.
df = pd.read_csv('../data/Karim_MetEng_2018_Figure2_Data.csv').drop(
    columns=['Area_1', 'Area_2', 'Conc_1', 'Conc_2'])
df.head()

In [None]:
# List the column names from the fourth column onward.
# NOTE(review): the plotting cell iterates df.columns[4:], not [3:] — confirm
# which offset is intended.
df.columns[3:]