In [1]:
import numpy as np
import tensorflow as tf
from keras.layers import (Dense, LSTM, Activation, Bidirectional, Dropout, Input,
                          RepeatVector, TimeDistributed, concatenate)
from keras.models import Sequential
from keras import Model
import keras.backend as K
import rouge
from nltk.tokenize.treebank import TreebankWordDetokenizer

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
Using TensorFlow backend.


In [None]:
def squareError(xTrue, xPred):
    """Return the element-wise squared difference between xTrue and xPred."""
    residual = xTrue - xPred
    return K.square(residual)


In [None]:
def reconstructionLoss(sample, encoder, decoder, f_w, weight): # (L_1 from the paper)
    """Reconstruction loss plus a weighted discriminator term, averaged over axis 0.

    Args:
        sample: batch fed to the encoder and used as the reconstruction target.
        encoder: callable returning the latent pair [z, n] for `sample`.
        decoder: callable taking that latent pair (as one list argument) and
            returning the reconstruction.
        f_w: discriminator, called with the same latent pair.
        weight: multiplier applied to log(f_w(...)).

    Returns:
        K.mean over axis 0 of squared error + weight * log(f_w(encoded)).
    """
    # Encode once: both terms must score the same latent code, and a second
    # forward pass through the encoder is wasted work.
    encoded = encoder(sample)

    # Multi-input Keras models take their inputs as ONE list argument;
    # splatting the pair would pass n as a second positional argument.
    reconstruction_error = squareError(sample, decoder(encoded))
    discriminator_term = weight * K.log(f_w(encoded))

    # NOTE(review): if the decoder output is (batch, time, dim) and f_w is
    # (batch, 1), this sum depends on broadcasting that may not line up - confirm.
    return K.mean(reconstruction_error + discriminator_term, axis=0)


In [None]:
def divergenceLoss(f_w, encoder, sample, z_j, n_j): # Mean of log f_w(E_theta_i(x_j)) + log (1-f_w(z_j, n_j)) from the paper (L_2).
    """Discriminator objective: mean of log f_w(E(x)) + log(1 - f_w(z_j, n_j)).

    Args:
        f_w: discriminator, called with a latent pair passed as one list.
        encoder: callable returning the latent pair [z, n] for `sample`.
        sample: batch that is encoded and scored by the discriminator.
        z_j: z codes for the second discriminator call.
        n_j: n codes for the second discriminator call.

    Returns:
        K.mean over axis 0 of the summed log terms.
    """
    # Multi-input Keras models take their inputs as ONE list argument, so the
    # pair is passed as a list rather than as two positional arguments.
    encoded_score = f_w(encoder(sample))
    reference_score = f_w([z_j, n_j])
    return K.mean(K.log(encoded_score) + K.log(1 - reference_score), axis=0)


In [None]:
# pi_Z from the paper: projects a latent pair (z, n) onto z.
# Currently just a restriction to the z variables; might want to do a matrix multiplication?
def projectZ(encoded):
    """Return the first element (the zs) of an encoder output."""
    z_part = encoded[0]
    return z_part

In [None]:
def projectN(encoded):
    """Return the second element (the Ns) of an encoder output."""
    n_part = encoded[1]
    return n_part

In [None]:
# Takes in two inputs, z and n, and outputs a reconstructed sequence.
def createDecoder(z_dims, n_dims, time_steps, output_dims):
    """Build the decoder mapping a latent pair (z, n) to a sequence.

    Args:
        z_dims: length of the z input vector.
        n_dims: length of the n input vector.
        time_steps: number of timesteps in the generated sequence.
        output_dims: number of features produced at each timestep.

    Returns:
        An uncompiled Keras Model with inputs [z, n] and an output of shape
        (batch, time_steps, output_dims).
    """
    # TODO MAYBE: Add in more regularization or different than dropout?

    # `shape` must be a tuple; `(z_dims)` without the comma is just an int.
    z_inputs = Input(shape=(z_dims,))
    n_inputs = Input(shape=(n_dims,))
    inputs = concatenate([z_inputs, n_inputs])
    # 150 is arbitrary rn...
    dense = Dense(150)(inputs)
    # LSTMs need (batch, time, features): repeat the code once per timestep.
    repeated = RepeatVector(time_steps)(dense)
    bilstm = Bidirectional(LSTM(32, activation='tanh', return_sequences=True))(repeated)
    bilstm = Dropout(0.2)(bilstm)
    # Chain from the previous layer (the original fed `inputs` here, which
    # bypassed the dense layer and the first BiLSTM entirely).
    bilstm = Bidirectional(LSTM(32, activation='tanh', dropout=0.2, return_sequences=True))(bilstm)
    # Project every timestep to output_dims so the reconstruction has the same
    # feature width as the sequences it is compared against.
    outputs = TimeDistributed(Dense(output_dims))(bilstm)

    # No compile(): training uses explicit GradientTape steps, and a bare
    # compile() fails on Keras versions where `optimizer` is required.
    return Model(inputs=[z_inputs, n_inputs], outputs=outputs)

In [None]:
def createEncoder(time_steps, input_num, z_dims, n_dims):
    """Build the encoder mapping a sequence to the latent pair [z, n].

    Args:
        time_steps: number of timesteps per input sequence.
        input_num: number of features at each timestep.
        z_dims: length of the z output vector.
        n_dims: length of the n output vector.

    Returns:
        An uncompiled Keras Model with one input of shape
        (batch, time_steps, input_num) and outputs [z_output, n_output].
    """
    # TODO MAYBE: Add in more regularization or different than dropout?
    inputs = Input(shape=(time_steps, input_num))
    # The first recurrent layer reads the model input (the original referenced
    # `bilstm` here before it was assigned).
    bilstm = Bidirectional(LSTM(32, activation='tanh', return_sequences=True))(inputs)
    bilstm = Dropout(0.2)(bilstm)
    # return_sequences=False collapses the sequence into a single vector.
    dense = Bidirectional(LSTM(32, activation='tanh', dropout=0.2, return_sequences=False))(bilstm)
    dense = Dropout(0.5)(dense)
    z_output = Dense(z_dims)(dense)
    n_output = Dense(n_dims)(dense)

    # No compile(): training uses explicit GradientTape steps, and a bare
    # compile() fails on Keras versions where `optimizer` is required.
    return Model(inputs=inputs, outputs=[z_output, n_output])

In [None]:
def createDiscriminator(z_dims, n_dims):
    """Build the discriminator f_w that scores a latent pair (z, n).

    Args:
        z_dims: length of the z input vector.
        n_dims: length of the n input vector.

    Returns:
        An uncompiled Keras Model with inputs [z, n] and one sigmoid output
        per sample.
    """
    # `shape` must be a tuple; `(z_dims)` without the comma is just an int.
    z_inputs = Input(shape=(z_dims,))
    n_inputs = Input(shape=(n_dims,))
    inputs = concatenate([z_inputs, n_inputs])

    # 150, 100 is arbitrary rn...
    # NOTE(review): these hidden layers have no activation, so stacked they
    # amount to a single linear map - consider adding a nonlinearity.
    dense = Dense(150)(inputs)
    dense = Dense(100)(dense)
    # Apply the layer to the tensor (the original left it as a bare layer), and
    # squash to (0, 1) because the losses take log(f_w) and log(1 - f_w).
    output = Dense(1, activation='sigmoid')(dense)

    # No compile(): training uses explicit GradientTape steps, and a bare
    # compile() fails on Keras versions where `optimizer` is required.
    return Model(inputs=[z_inputs, n_inputs], outputs=output)

In [None]:
# One independent Adam optimizer each for the encoder, decoder and discriminator,
# all at the same learning rate.
enc_optimizer, dec_optimizer, disc_optimizer = (
    tf.keras.optimizers.Adam(5e-4) for _ in range(3)
)

In [None]:
def sampleZFrom(*args, **kwargs):
    """Unfinished placeholder for a z-sampling helper.

    The original cell held only the bare header `def sampleZFrom`, which is a
    SyntaxError and stops a Restart & Run All. The intended arguments and
    sampling source were never written down, so this stub accepts anything and
    fails loudly if it is ever called.

    Raises:
        NotImplementedError: always.
    """
    # TODO: implement once the intended signature is decided.
    raise NotImplementedError("sampleZFrom was left unimplemented in the original notebook")

## When $P_Z$ is known... 

In [3]:

# k is num of domains.
# encoders is a list of encoders.
# decoders is a list of decoders.
# samples is a K x N x Timesteps x dim array of samples, where index 0 is the domain,
# index 1 is the sample within that domain, index 2 is the timestep within the sequence
# and index 3 is the feature dimension at each timestep.
# original_domains is a list of the original domains P_z was derived from.

# Currently assuming P_Z is known. Must approximate P_Z first.
def trainAutoencodersWithPz(k, encoders, decoders, samples, discriminator, original_domains, weight=1.0, num_samples=100):
    """Train the autoencoder of every domain that is not in `original_domains`.

    For each such domain, z codes are taken from the encoder of a randomly
    chosen original domain, and the encoder, decoder and discriminator are
    stepped until isConverged(encoder, decoder) is true.

    Args:
        k: number of domains; domains are indexed 0..k-1.
        encoders: list of encoders, indexed by domain.
        decoders: list of decoders, indexed by domain.
        samples: array indexed as [domain, sample, timestep, feature].
        discriminator: model used as f_w in both losses.
        original_domains: domains that P_Z was derived from; these are skipped.
        weight: multiplier of the discriminator term in the reconstruction loss.
        num_samples: samples drawn (without replacement) per step. The original
            body read a module-level `num_samples` that is only assigned in a
            later cell; the default mirrors that configuration value.
    """
    N = samples.shape[1]

    def draw_batch(domain):
        # Fresh random subset (without replacement) of one domain's samples.
        picked = np.random.choice(N, num_samples, replace=False)
        return samples[domain, picked, :, :]

    for i in range(k):
        if i in original_domains:
            continue

        original_domain = np.random.choice(original_domains)
        encoder = encoders[i]
        decoder = decoders[i]
        original_encoder = encoders[original_domain]

        # NOTE(review): isConverged is not defined in the cells shown here; it
        # must exist before this function is called.
        while not isConverged(encoder, decoder):
            p_Xi_samples = draw_batch(i)
            p_Z_samples = projectZ(original_encoder(draw_batch(original_domain)))
            p_Ni_samples = projectN(encoder(draw_batch(i)))

            with tf.GradientTape() as enc_tape, tf.GradientTape() as dec_tape, tf.GradientTape() as disc_tape:
                # The discriminator argument is f_w (the original referenced an
                # undefined global `f_w` here).
                reconstruction_loss = reconstructionLoss(p_Xi_samples, encoder, decoder, discriminator, weight)

                # negative b/c gradient ascent.
                divergence_loss = -divergenceLoss(discriminator, encoder, p_Xi_samples, p_Z_samples, p_Ni_samples)

            gradients_of_encoder = enc_tape.gradient(reconstruction_loss, encoder.trainable_variables)
            gradients_of_decoder = dec_tape.gradient(reconstruction_loss, decoder.trainable_variables)
            gradients_of_discriminator = disc_tape.gradient(divergence_loss, discriminator.trainable_variables)

            enc_optimizer.apply_gradients(zip(gradients_of_encoder, encoder.trainable_variables))
            dec_optimizer.apply_gradients(zip(gradients_of_decoder, decoder.trainable_variables))
            disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
            

## When $P_Z$ is unknown...
"A straight-forward approach for learning the latent distribution PZ is to train a regularized autoencoder on data from a
single representative domain. However, such a representation could potentially capture variability that is specific to
that one domain. To learn a more invariant latent representation, we propose the following extension of our autoencoder
framework. The basic idea is to alternate between training
multiple autoencoders until they agree on a latent representation that is effective for their respective domains. This is
particularly relevant for applications to biology; for example, often one is interested in learning a latent representation
that integrates all of the data modalities."

In [4]:

# encoders is a list of encoders.
# decoders is a list of decoders.
# samples is a K x N x Timesteps x dim array of samples, where index 0 is the domain,
# index 1 is the sample within that domain, index 2 is the timestep within the sequence
# and index 3 is the feature dimension at each timestep.
# domains is a list of the domains we are currently training over.

def trainAutoencodersInitial(samples, encoders, decoders, discriminator, num_samples, domains, weight=1.0):
    """Train each domain's autoencoder against z codes from every other domain.

    For every ordered pair (i, j) of distinct domains, domain i's encoder and
    decoder and the discriminator are stepped until
    isConverged(encoder, decoder) is true, using z codes produced by domain
    j's encoder.

    Args:
        samples: array indexed as [domain, sample, timestep, feature].
        encoders: list of encoders, indexed by domain.
        decoders: list of decoders, indexed by domain.
        discriminator: model used as f_w in both losses.
        num_samples: samples drawn (without replacement) per step.
        domains: domain indices to train over; it is iterated twice, so it
            must be re-iterable (a list or range).
        weight: multiplier of the discriminator term in the reconstruction loss.
    """
    N = samples.shape[1]

    def draw_batch(domain):
        # Fresh random subset (without replacement) of one domain's samples.
        picked = np.random.choice(N, num_samples, replace=False)
        return samples[domain, picked, :, :]

    for i in domains:
        encoder = encoders[i]
        decoder = decoders[i]
        for j in domains:
            if i == j:
                continue
            j_encoder = encoders[j]

            # NOTE(review): isConverged is not defined in the cells shown here;
            # it must exist before this function is called.
            while not isConverged(encoder, decoder):
                p_Xi_samples = draw_batch(i)
                p_Zj_samples = projectZ(j_encoder(draw_batch(j)))
                p_Ni_samples = projectN(encoder(draw_batch(i)))

                with tf.GradientTape() as enc_tape, tf.GradientTape() as dec_tape, tf.GradientTape() as disc_tape:
                    # The discriminator argument is f_w (the original referenced
                    # an undefined global `f_w` here).
                    reconstruction_loss = reconstructionLoss(p_Xi_samples, encoder, decoder, discriminator, weight)

                    # negative b/c gradient ascent.
                    divergence_loss = -divergenceLoss(discriminator, encoder, p_Xi_samples, p_Zj_samples, p_Ni_samples)

                gradients_of_encoder = enc_tape.gradient(reconstruction_loss, encoder.trainable_variables)
                gradients_of_decoder = dec_tape.gradient(reconstruction_loss, decoder.trainable_variables)
                gradients_of_discriminator = disc_tape.gradient(divergence_loss, discriminator.trainable_variables)

                enc_optimizer.apply_gradients(zip(gradients_of_encoder, encoder.trainable_variables))
                dec_optimizer.apply_gradients(zip(gradients_of_decoder, decoder.trainable_variables))
                disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
            

In [None]:
# samples is a K x N x Timesteps x dim array of samples, where index 0 is the domain,
# index 1 is the sample within that domain, index 2 is the timestep within the sequence
# and index 3 is the feature dimension at each timestep.

def initModel(samples, z_dims, n_dims):
    """Create one encoder/decoder pair per domain plus a single discriminator.

    Args:
        samples: array whose shape is (domains, samples, timesteps, features);
            only its shape is read here.
        z_dims: length of the z latent vector.
        n_dims: length of the n latent vector.

    Returns:
        (encoders, decoders, discriminator), where encoders and decoders are
        lists indexed by domain.
    """
    k = samples.shape[0]
    time_steps = samples.shape[2]
    dim = samples.shape[3]

    discriminator = createDiscriminator(z_dims, n_dims)

    encoders = []
    decoders = []

    for _ in range(k):
        # createEncoder takes (time_steps, input_num, z_dims, n_dims); the
        # original call left out time_steps.
        encoders.append(createEncoder(time_steps, dim, z_dims, n_dims))
        decoders.append(createDecoder(z_dims, n_dims, time_steps, dim))

    return encoders, decoders, discriminator

In [None]:
def translate(start_sequences, samples, encoders, decoders, start_domain, end_domain):
    """Translate sequences from `start_domain` into `end_domain`.

    The z codes come from encoding `start_sequences` with the start domain's
    encoder; the n codes come from encoding an equally sized random draw of
    end-domain samples with the end domain's encoder. The end domain's decoder
    combines the two.

    Args:
        start_sequences: batch of sequences to translate.
        samples: array indexed as [domain, sample, timestep, feature].
        encoders: list of encoders, indexed by domain.
        decoders: list of decoders, indexed by domain.
        start_domain: index of the source domain.
        end_domain: index of the target domain.

    Returns:
        The end decoder's output for the combined [z, n] codes.
    """
    N = samples.shape[1]
    num_samples = start_sequences.shape[0]

    start_encoder = encoders[start_domain]
    end_encoder = encoders[end_domain]
    end_decoder = decoders[end_domain]

    # Encode the sequences being translated (the original passed the whole
    # multi-domain `samples` array to the start encoder).
    z_codes = projectZ(start_encoder(start_sequences))

    end_batch = samples[end_domain, np.random.choice(N, num_samples, replace=False), :, :]
    n_codes = projectN(end_encoder(end_batch))

    # Multi-input Keras models take their inputs as ONE list argument.
    end_sequences = end_decoder([z_codes, n_codes])
    return end_sequences
    

In [None]:
# Sizes of the two latent vectors each encoder emits.
z_dims = 50 # len(Z)
n_dims = 50 # len(n)

# Training schedule: outer-loop passes and samples drawn per training step.
num_samples = 100
num_epochs = 1
# total_latent_dims = n_dims+z_dims # len(N) + len(Z)

In [None]:
# Build one encoder/decoder pair per domain and a single discriminator.
# NOTE(review): `samples` is not assigned in any cell shown here - it must be
# loaded (as a domains x samples x timesteps x features array) before this runs.
encoders, decoders, discriminator = initModel(samples, z_dims, n_dims)

In [None]:
# `trainAutoencoders` is not defined in this notebook; the routine whose
# arguments match this call is trainAutoencodersInitial, which also needs the
# list of domains to train over. Here that is every domain that has an encoder.
# NOTE(review): confirm this is the intended routine (vs. trainAutoencodersWithPz).
for _ in range(num_epochs):
    trainAutoencodersInitial(samples, encoders, decoders, discriminator, num_samples,
                             list(range(len(encoders))), weight=1.0)

## Evaluation with Rouge

In [None]:
# Detokenizer used to rebuild the reference sentences.
detok = TreebankWordDetokenizer()

# ROUGE scorer used by evaluateOnArticles.
evaluator = rouge.Rouge(
    metrics=['rouge-n', 'rouge-l', 'rouge-w'],
    max_n=4,
    limit_length=True,
    length_limit_type='words',
    length_limit=100,
    apply_best=True,
    apply_avg=False,
    stemming=True,
    weight_factor=1.2,
    alpha=0.5,  # Default F1_score
)

In [None]:
def evaluateOnArticles(articles, encoder, decoder):
    """Translate `articles` through encoder then decoder and print ROUGE scores.

    The references are `articles` run through detok.detokenize; the hypotheses
    are the decoder outputs run through vecSeqToSentence. One line is printed
    per metric, in sorted metric order, showing P / R / F1 as percentages.
    """
    translated = decoder(encoder(articles))

    original_sentences = list(map(detok.detokenize, articles))
    translated_sentences = list(map(vecSeqToSentence, translated))

    scores = evaluator.get_scores(translated_sentences, original_sentences)

    row_template = '\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'
    for metric in sorted(scores):
        results = scores[metric]
        precision = 100.0 * results['p']
        recall = 100.0 * results['r']
        f1 = 100.0 * results['f']
        print(row_template.format(metric, 'P', precision, 'R', recall, 'F1', f1))

In [None]:
def evaluate(articles_df, encoders, decoders):
    """Score every ordered pair of distinct publications.

    Publications are taken from articles_df.publication.unique(); the position
    of a publication in that array is used as its index into `encoders` and
    `decoders`. For each pair, the source publication's 'content' column is
    passed to evaluateOnArticles with the source encoder and target decoder.
    """
    publications = articles_df.publication.unique()
    for i, pub1 in enumerate(publications):
        for j, pub2 in enumerate(publications):
            if i == j:
                continue
            from_source = articles_df['publication'] == pub1
            source_articles = articles_df.loc[from_source]['content'].tolist()

            print(pub1,"to",pub2)
            evaluateOnArticles(source_articles, encoders[i], decoders[j])
            print()