### Imports

In [1]:

import pickle as pkl
import pandas as pd
import pandas
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize 
import gensim 
from gensim.models import Word2Vec 
import rouge
from nltk.tokenize.treebank import TreebankWordDetokenizer

import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Activation, Bidirectional, Dropout, Input, concatenate, Reshape, TimeDistributed, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model
import tensorflow.keras.backend as K


### Loading Data

In [2]:
# Load the tokenized article corpus and the separate evaluation frame.
# NOTE: unpickling can execute arbitrary code; only load pickle files from a trusted source.
data = pd.read_pickle('data/tokenized.pkl')
eval_df = pd.read_pickle('data/evaluation.pkl')
# Display the corpus frame (pandas truncates the repr of long frames).
data

Unnamed: 0.1,Unnamed: 0,publication,content
0,0,New York Times,"[washington, —, congressional, republicans, ha..."
1,1,New York Times,"[after, the, bullet, shells, get, the, south, ..."
2,2,New York Times,"[when, walt, disney, ’, s, but, what, they, di..."
3,3,New York Times,"[death, may, be, the, great, equalizer, ,, but..."
4,4,New York Times,"[seoul, ,, south, korea, —, although, north, k..."
...,...,...,...
47220,47220,BBC_tech,"[bt, is, introducing, two, initiatives, from, ..."
47221,47221,BBC_tech,"[computer, users, across, the, world, more, th..."
47222,47222,BBC_tech,"[a, new, european, directive, could, if, it, g..."
47223,47223,BBC_tech,"[the, man, making, sure, us, amit, yoran, was,..."


In [3]:
# Gather every token sequence (corpus + evaluation) for fitting the word embeddings.
# list(...) copies only the outer container: the inner token lists are the same objects
# stored in `data` / `eval_df`, so any later in-place change to those lists is visible here too.
all_sentences = list(data['content'])
all_sentences.extend(list(eval_df['content']))

### Getting Relevant publications

In [4]:
# Wider candidate list, currently disabled:
# selected_publications = [
#  'Breitbart',
#  'CNN',
#  'New York Times',
#  'NPR',
#  'Fox News',
#  'Reuters']
# Publications requested as domains for this run; list order determines the domain index used later.
selected_publications = [
 'Breitbart',
 'CNN',
 'New York Times']

In [5]:
# Distinct publication names present in the corpus (set iteration order is not guaranteed).
all_publications = list(set(data['publication']))
all_publications

['Atlantic',
 'New York Times',
 'Vox',
 'Reuters',
 'Buzzfeed News',
 'Business Insider',
 'Talking Points Memo',
 'New York Post',
 'Washington Post',
 'BBC_business',
 'BBC_politics',
 'BBC_tech',
 'BBC_sport',
 'Breitbart',
 'Fox News',
 'Guardian',
 'NPR',
 'CNN',
 'National Review',
 'BBC_entertainment']

In [6]:
# Keep each selected publication that occurs in the corpus with at least 3000 rows.
publications = list(filter(
    lambda pub: pub in all_publications and len(data[data['publication'] == pub]) >= 3000,
    selected_publications,
))
publications

['Breitbart', 'CNN', 'New York Times']

In [7]:
# One object array of token lists per retained publication, in `publications` order.
contents = []
for pub in publications:
    contents.append(np.asarray(data[data['publication'] == pub]['content']))

### Padding with a Special Character

In [8]:
# Length of the longest token sequence across all retained publications.
max_seq_length = max(len(seq) for content in contents for seq in content)
max_seq_length

50

In [9]:
# Padding token appended to short sequences; presumably chosen so it never collides with a real token.
end_token = '~?@_'

In [10]:
# Right-pad every sequence to max_seq_length with the end token.
# extend() mutates each token list in place, so the same list objects referenced from
# `data` and `all_sentences` become padded as well.
for content in contents:
    for seq in content:
        seq.extend([end_token] * (max_seq_length - len(seq)))

In [11]:
# Sanity check: recomputing after padding should leave the maximum length unchanged.
max_seq_length = max([len(seq) for content in contents for seq in content])
max_seq_length

50

### Vectorize Words

In [12]:
# Dimensionality of each word embedding vector.
word_dim = 100

In [13]:

# Fit word embeddings on every collected sequence; min_count=1 keeps all tokens in the vocabulary.
# NOTE(review): `size` is the gensim 3.x keyword (gensim 4 renamed it to `vector_size`) — confirm the installed version.
word2vec = gensim.models.Word2Vec(all_sentences, min_count = 1,  
                              size = word_dim, window = 5) 

In [14]:
# Spot-check the embeddings: related political terms should have high cosine similarity.
print("Cosine similarity between 'congress' " + 
               "and 'senate' - CBOW : ", 
    word2vec.wv.similarity('congress', 'senate')) 
      
print("Cosine similarity between 'congress' " +
                 "and 'house' - CBOW : ", 
    word2vec.wv.similarity('congress', 'house')) 

Cosine similarity between 'congress' and 'senate' - CBOW :  0.7559486
Cosine similarity between 'congress' and 'house' - CBOW :  0.6252873


In [15]:
# Inspect the learned embedding vector for a single word.
word2vec.wv['congressional']

array([-0.29592454, -0.8773146 ,  0.76711816, -0.05720106, -0.03461413,
       -0.40311787,  0.22946852,  1.0140978 ,  0.8975274 , -0.7267992 ,
        1.2324462 , -0.43481615,  0.8528818 , -0.7095517 , -0.07889917,
       -0.52104896,  0.2608844 , -0.58594555, -0.59863317, -0.31064552,
       -0.11377191, -0.50565964,  0.15798195,  0.26478073,  1.4427081 ,
        0.55394846,  0.17723931,  0.7052398 , -0.88249487, -0.5539005 ,
        0.2363067 ,  0.06038764, -0.86185366, -0.19701053,  0.14048052,
       -0.45728892,  0.2533323 , -0.26327336, -0.8601514 ,  1.0089282 ,
        0.14991663, -0.44243005, -1.3007863 , -1.164767  , -1.2460694 ,
        0.24592596, -0.501519  , -0.23789279,  0.24771175,  0.13797134,
       -0.20122194,  1.2545592 ,  1.3626865 , -0.5414245 ,  0.9415389 ,
       -0.27835897, -0.4958755 , -0.01836712,  0.8055421 , -0.56852084,
       -0.9491626 , -1.0129623 , -0.07664649, -0.00478025, -1.3793099 ,
        0.02242324, -0.07649595, -0.25851354,  0.30401504,  0.07

In [16]:
# Nearest vocabulary entries to the 'congressional' vector; this is the lookup later used to decode vectors into words.
word2vec.wv.similar_by_vector(word2vec.wv['congressional'])

[('congressional', 0.9999998807907104),
 ('senate', 0.8712809085845947),
 ('gop', 0.8363369703292847),
 ('committee', 0.8346565961837769),
 ('judiciary', 0.8318742513656616),
 ('lawmakers', 0.8250969052314758),
 ('chambers', 0.8131620287895203),
 ('voted', 0.8018012046813965),
 ('liberal', 0.7954562306404114),
 ('conservative', 0.7942031621932983)]

In [17]:
# Stack the per-publication arrays so they can be indexed as contents[domain, article].
# NOTE(review): the 2-D indexing used below presumably requires every publication to contribute the same number of articles — confirm.
contents = np.asarray(contents)

In [18]:
# Dense embedding tensor indexed as (domain, article, timestep, embedding dimension).
samples = np.zeros(shape=(contents.shape[0], contents.shape[1], max_seq_length, word_dim))

In [19]:
# Replace every token of every padded sequence with its word2vec embedding.
for i in range(contents.shape[0]):
    for j in range(contents.shape[1]):
        for k in range(max_seq_length):
            samples[i, j, k, :] = word2vec.wv[contents[i, j][k]]

In [20]:
# TODO use closest cosine distance to find output word.

## Function Definitions

In [21]:
def squareError(xTrue, xPred):
    """Return the element-wise squared difference between `xTrue` and `xPred`."""
    residual = xTrue - xPred
    return K.square(residual)


In [22]:
def reconstructionLoss(sample, encoder, decoder, f_w, weight): # (L_1 from the paper)
    """Autoencoder loss: mean squared reconstruction error plus a weighted discriminator term.

    Args:
        sample: Batch of input sequences fed to the encoder.
        encoder: Callable returning the latent representation of `sample`.
        decoder: Callable mapping that latent representation back to a sequence.
        f_w: Discriminator applied to the latent representation.
        weight: Multiplier on the log-discriminator term.

    Returns:
        Scalar tensor: mean((sample - decoder(encoder(sample)))^2)
        + mean(weight * log f_w(encoder(sample))). The second term is a log of
        the discriminator output, so the total can be negative.
    """
    # Encode once and reuse the result for both terms instead of running the encoder twice.
    encoded = encoder(sample)
    reconstruction_term = K.mean(squareError(sample, decoder(encoded)))
    discriminator_term = K.mean(weight * K.log(f_w(encoded)))
    return reconstruction_term + discriminator_term


In [23]:
def divergenceLoss(f_w, encoder, sample, z_j, n_j):
    """Discriminator objective (L_2 from the paper).

    Computes mean(log f_w(E(x))) + mean(log(1 - f_w([z_j, n_j]))), where E is
    `encoder`, x is `sample`, and (z_j, n_j) is the reference latent pair.
    """
    encoded_score = f_w(encoder(sample))
    reference_score = f_w([z_j, n_j])
    encoded_term = K.mean(K.log(encoded_score))
    reference_term = K.mean(K.log(1 - reference_score))
    return encoded_term + reference_term


In [24]:
def sample(data, domain, num_samples):
    """Draw `num_samples` sequences, with replacement, from one domain of `data`.

    `data` is indexed as data[domain, sample, :, :]; the selected slices are
    returned as a float32 tensor.
    """
    population_size = data.shape[1]
    chosen = np.random.choice(population_size, num_samples, replace=True)
    batch = data[domain, chosen, :, :]
    return tf.convert_to_tensor(batch, dtype=tf.float32)


In [25]:
# pi_Z from the paper: projects an encoded (z, n) pair onto its z component.
# Implemented as a plain selection of the first encoder output; a learned linear map is a possible alternative.
def projectZ(encoded):
    """Return the first element of `encoded` (the z output of an encoder)."""
    z_part = encoded[0]
    return z_part

In [26]:
def projectN(encoded):
    """Return the second element of `encoded` (the n output of an encoder)."""
    n_part = encoded[1]
    return n_part

In [27]:
# takes in two inputs, n and z, and outputs samples.
def createDecoder(z_dims, n_dims, time_steps, output_dims, lstm_units=64, dropout_rate=0.2):
    """Build the decoder that maps a latent pair (z, n) back to a sequence.

    The concatenated latent vector is expanded by a dense layer and reshaped to
    (time_steps, output_dims), passed through two bidirectional LSTM layers with
    dropout, then projected linearly and reshaped to the output sequence.

    Args:
        z_dims: Size of the z input vector.
        n_dims: Size of the n input vector.
        time_steps: Number of timesteps in the produced sequence.
        output_dims: Feature size at each timestep.
        lstm_units: Units per direction in each LSTM layer (default 64).
        dropout_rate: Dropout rate applied after each LSTM layer (default 0.2).

    Returns:
        A Keras Model taking [z, n] and producing an output reshaped to
        (time_steps, output_dims) per example.
    """
    # TODO MAYBE: Add in more regularization or different than dropout?
    z_inputs = Input(shape=(z_dims,))
    n_inputs = Input(shape=(n_dims,))
    latent = concatenate([z_inputs, n_inputs])

    # Expand the latent vector into a sequence so the recurrent layers have timesteps to process.
    expanded = Dense(time_steps * output_dims)(latent)
    sequence = Reshape((time_steps, output_dims))(expanded)

    hidden = Bidirectional(LSTM(lstm_units, activation='tanh', return_sequences=True))(sequence)
    hidden = Dropout(dropout_rate)(hidden)
    hidden = Bidirectional(LSTM(lstm_units, activation='tanh', return_sequences=False))(hidden)
    hidden = Dropout(dropout_rate)(hidden)

    # The second LSTM returns a single vector, so project it to the full flattened sequence.
    flat_output = Dense(time_steps * output_dims, activation='linear')(hidden)
    outputs = Reshape((time_steps, output_dims))(flat_output)

    return Model(inputs=[z_inputs, n_inputs], outputs=outputs)

In [28]:
def createEncoder(time_steps, input_num, z_dims, n_dims, lstm_units=64, dropout_rate=0.2):
    """Build the encoder that maps a sequence to a latent pair (z, n).

    Two bidirectional LSTM layers with dropout summarize the sequence into one
    vector, from which two separate linear heads produce z and n.

    Args:
        time_steps: Number of timesteps in each input sequence.
        input_num: Feature size at each timestep.
        z_dims: Size of the z output vector.
        n_dims: Size of the n output vector.
        lstm_units: Units per direction in each LSTM layer (default 64).
        dropout_rate: Dropout rate applied after each LSTM layer (default 0.2).

    Returns:
        A Keras Model taking a (time_steps, input_num) sequence per example and
        returning [z_output, n_output].
    """
    # TODO MAYBE: Add in more regularization or different than dropout?
    inputs = Input(shape=(time_steps, input_num,))
    hidden = Bidirectional(LSTM(lstm_units, activation='tanh', return_sequences=True))(inputs)
    hidden = Dropout(dropout_rate)(hidden)
    summary = Bidirectional(LSTM(lstm_units, activation='tanh', return_sequences=False))(hidden)
    summary = Dropout(dropout_rate)(summary)

    # Both heads read the same sequence summary.
    z_output = Dense(z_dims, activation='linear')(summary)
    n_output = Dense(n_dims, activation='linear')(summary)

    return Model(inputs=inputs, outputs=[z_output, n_output])

In [29]:
def createDiscriminator(z_dims, n_dims, hidden_units=(150, 100)):
    """Build the discriminator that scores a latent pair (z, n).

    Args:
        z_dims: Size of the z input vector.
        n_dims: Size of the n input vector.
        hidden_units: Widths of the successive ReLU hidden layers. The default
            (150, 100) was chosen arbitrarily.

    Returns:
        A Keras Model taking [z, n] and returning one sigmoid output per example.
    """
    z_inputs = Input(shape=(z_dims,))
    n_inputs = Input(shape=(n_dims,))
    hidden = concatenate([z_inputs, n_inputs])

    for units in hidden_units:
        hidden = Dense(units, activation='relu')(hidden)
    output = Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=[z_inputs, n_inputs], outputs=output)

In [30]:
# Shared learning rate; separate Adam optimizers are used for encoder, decoder and discriminator updates.
# These module-level optimizers are the ones the training functions apply gradients with.
lr = 5e-4
enc_optimizer = tf.keras.optimizers.Adam(lr)
dec_optimizer = tf.keras.optimizers.Adam(lr)
disc_optimizer = tf.keras.optimizers.Adam(lr)

### When $P_Z$ is known... 

In [31]:

# Currently assuming P_Z is known. Must approximate P_Z first.
def trainAutoencodersWithPz(samples, encoders, decoders, discriminator, num_samples, original_domains, epochs=10, weight=1.0):
    """Train the autoencoders of every domain not listed in `original_domains`.

    For each such domain, one original domain is picked at random and its
    encoder supplies the reference z samples. Each step draws fresh batches and
    updates the domain's encoder and decoder on the reconstruction loss and the
    discriminator on the negated divergence loss.

    Args:
        samples: K x N x timesteps x dim array; axis 0 is the domain, axis 1 the
            sample within the domain, axis 2 the timestep, axis 3 the feature.
        encoders: List of encoders, one per domain.
        decoders: List of decoders, one per domain.
        discriminator: Discriminator shared by all domains.
        num_samples: Batch size drawn at each step.
        original_domains: Domains that P_Z was derived from.
        epochs: Number of update steps per domain.
        weight: Multiplier on the discriminator term of the reconstruction loss.
    """
    num_domains = samples.shape[0]

    for i in range(num_domains):
        if i in original_domains:
            continue

        reference_domain = np.random.choice(original_domains)
        encoder, decoder = encoders[i], decoders[i]
        reference_encoder = encoders[reference_domain]

        epoch = 0
        while epoch < epochs:  # TODO: could also run until some convergence criterion.
            p_Xi_samples = sample(samples, i, num_samples)
            reference_batch = sample(samples, reference_domain, num_samples)
            p_Z_samples = projectZ(reference_encoder(reference_batch))
            noise_batch = sample(samples, i, num_samples)
            p_Ni_samples = projectN(encoder(noise_batch))

            with tf.GradientTape() as enc_tape, tf.GradientTape() as dec_tape, tf.GradientTape() as disc_tape:
                reconstruction_loss = reconstructionLoss(p_Xi_samples, encoder, decoder, discriminator, weight)
                # Negated because the discriminator performs gradient ascent on the divergence.
                divergence_loss = -1 * divergenceLoss(discriminator, encoder, p_Xi_samples, p_Z_samples, p_Ni_samples)

            encoder_grads = enc_tape.gradient(reconstruction_loss, encoder.trainable_variables)
            decoder_grads = dec_tape.gradient(reconstruction_loss, decoder.trainable_variables)
            discriminator_grads = disc_tape.gradient(divergence_loss, discriminator.trainable_variables)

            enc_optimizer.apply_gradients(zip(encoder_grads, encoder.trainable_variables))
            dec_optimizer.apply_gradients(zip(decoder_grads, decoder.trainable_variables))
            disc_optimizer.apply_gradients(zip(discriminator_grads, discriminator.trainable_variables))

            print('Domain {}, Epoch {}:\n\tReconstruction Loss: {}\n\tDivergence Loss: {}'.format(i, epoch+1, reconstruction_loss, divergence_loss))
            epoch += 1
            

### When $P_Z$ is unknown...
"A straight-forward approach for learning the latent distribution PZ is to train a regularized autoencoder on data from a
single representative domain. However, such a representation could potentially capture variability that is specific to
that one domain. To learn a more invariant latent representation, we propose the following extension of our autoencoder
framework. The basic idea is to alternate between training
multiple autoencoders until they agree on a latent representation that is effective for their respective domains. This is
particularly relevant for applications to biology; for example, often one is interested in learning a latent representation
that integrates all of the data modalities."

In [32]:

def trainAutoencodersInitial(samples, encoders, decoders, discriminator, num_samples, domains, epochs=10, weight=1.0):
    """Jointly train the autoencoders of `domains` when P_Z is not known.

    For every ordered pair (i, j) of distinct domains, runs `epochs` update
    steps in which encoder j supplies the reference z samples. Each step updates
    encoder i and decoder i on the reconstruction loss and the discriminator on
    the negated divergence loss; encoder j receives no update in that step.

    Args:
        samples: K x N x timesteps x dim array; axis 0 is the domain, axis 1 the
            sample within the domain, axis 2 the timestep, axis 3 the feature.
        encoders: List of encoders, one per domain.
        decoders: List of decoders, one per domain.
        discriminator: Discriminator shared by all domains.
        num_samples: Batch size drawn at each step.
        domains: Domains currently being trained over.
        epochs: Number of update steps per ordered pair of domains.
        weight: Multiplier on the discriminator term of the reconstruction loss.
    """
    for i in domains:
        encoder, decoder = encoders[i], decoders[i]

        for j in domains:
            if i == j:
                continue

            j_encoder = encoders[j]
            epoch = 0
            while epoch < epochs:  # TODO: could also run until some convergence criterion.
                p_Xi_samples = sample(samples, i, num_samples)
                reference_batch = sample(samples, j, num_samples)
                p_Zj_samples = projectZ(j_encoder(reference_batch))
                noise_batch = sample(samples, i, num_samples)
                p_Ni_samples = projectN(encoder(noise_batch))

                with tf.GradientTape() as enc_tape, tf.GradientTape() as dec_tape, tf.GradientTape() as disc_tape:
                    reconstruction_loss = reconstructionLoss(p_Xi_samples, encoder, decoder, discriminator, weight)
                    # Negated because the discriminator performs gradient ascent on the divergence.
                    divergence_loss = -1 * divergenceLoss(discriminator, encoder, p_Xi_samples, p_Zj_samples, p_Ni_samples)

                encoder_grads = enc_tape.gradient(reconstruction_loss, encoder.trainable_variables)
                decoder_grads = dec_tape.gradient(reconstruction_loss, decoder.trainable_variables)
                discriminator_grads = disc_tape.gradient(divergence_loss, discriminator.trainable_variables)

                enc_optimizer.apply_gradients(zip(encoder_grads, encoder.trainable_variables))
                dec_optimizer.apply_gradients(zip(decoder_grads, decoder.trainable_variables))
                disc_optimizer.apply_gradients(zip(discriminator_grads, discriminator.trainable_variables))

                print('Domain {}, Epoch {}:\n\tReconstruction Loss: {}\n\tDivergence Loss: {}'.format(i, epoch+1, reconstruction_loss, divergence_loss))
                epoch += 1
            

In [33]:
def initModel(samples, z_dims, n_dims):
    """Create one shared discriminator plus an encoder/decoder pair per domain.

    Args:
        samples: K x N x timesteps x dim array; axis 0 is the domain, axis 1 the
            sample within the domain, axis 2 the timestep, axis 3 the feature.
        z_dims: Size of the z latent vector.
        n_dims: Size of the n latent vector.

    Returns:
        (encoders, decoders, discriminator), where the two lists are indexed by domain.
    """
    num_domains = samples.shape[0]
    time_steps = samples.shape[2]
    dim = samples.shape[3]

    discriminator = createDiscriminator(z_dims, n_dims)

    encoders, decoders = [], []
    for _ in range(num_domains):
        encoders += [createEncoder(time_steps, dim, z_dims, n_dims)]
        decoders += [createDecoder(z_dims, n_dims, time_steps, dim)]

    return encoders, decoders, discriminator

In [34]:
def translate(start_sequences, samples, encoders, decoders, start_domain, end_domain):
    """Translate sequences from `start_domain` into `end_domain`.

    z comes from encoding `start_sequences` with the start-domain encoder; n
    comes from encoding an equally sized random batch of end-domain samples with
    the end-domain encoder. The end-domain decoder turns [z, n] into the output.
    The shape of `start_sequences` is printed as a side effect.
    """
    print(start_sequences.shape)
    batch_size = start_sequences.shape[0]

    z = projectZ(encoders[start_domain](start_sequences))
    target_batch = sample(samples, end_domain, batch_size)
    n = projectN(encoders[end_domain](target_batch))

    return decoders[end_domain]([z, n])
    

In [35]:
def vecSeqToSentence(sequence, verbose=True):
    """Decode a sequence of embedding vectors into the nearest vocabulary words.

    Args:
        sequence: 2-D tensor/array with one embedding vector per row.
        verbose: When True (the default, matching the previous behavior), also
            print the decoded token list.

    Returns:
        List of words, one per row: the top match from
        `word2vec.wv.similar_by_vector` for that row. Previously nothing was
        returned, so callers collecting the result received None.
    """
    sequence = K.eval(sequence)
    # Only the best match is used, so ask for a single neighbour.
    sentence = [
        word2vec.wv.similar_by_vector(sequence[i, :], topn=1)[0][0]
        for i in range(sequence.shape[0])
    ]
    if verbose:
        print(sentence)
    return sentence

In [55]:
# Latent sizes passed to initModel.
n_dims = 60 # len(n)
z_dims = 240 # len(Z)

# Update steps per training loop and batch size drawn at each step.
num_epochs = 100
num_samples = 128

# Multiplier on the log-discriminator term of the reconstruction loss.
weight = 1

# Domain indices trained jointly first; the remaining domains are trained against them afterwards.
original_domains = [0, 1]



In [56]:
# samples = tf.convert_to_tensor(samples)

In [57]:
# Build untrained encoders/decoders (one pair per domain) and the shared discriminator.
encoders, decoders, discriminator = initModel(samples, z_dims, n_dims)

### Original First Sentence from 0

In [58]:
# First padded token sequence of domain 0, joined into a readable string.
' '.join(contents[0, 0])

'on tuesday ’ s broadcast ” zeleny said , “ and she ’ s having a difficult time in federal prison , no question . ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_'

In [59]:
# Wrap the first domain-0 sequence in a batch of one and translate it with the still-untrained models.
seq = tf.convert_to_tensor(np.asarray([samples[0, 0, :, :]]), dtype=tf.float32)
translation = translate(seq, samples, encoders, decoders, original_domains[0], original_domains[1])

(1, 50, 100)


### Original First Sentence from 0 translated to 1 before Training (Random)

In [60]:

# Decode the single translated sequence into words.
vecSeqToSentence(translation[0,:,:])


['yasser', 'dominguez', 'fronds', "'can", 'fewest', 'insignia', '151', 'unyielding', 'affectless', 'corden', 'brawls', 'improvisational', 'sikhs', 'barracks', 'affronted', 'exposes', 'downstairs', 'guinean', 'munger', 'overdrive', 'ayer', 'throng', 'kokomo', 'prothero', 'obscurities', 'americas', 'tedious', 'transients', 'winship', 'autolink', 'artthrob', 'ruckus', 'helpfully', 'hunh', 'biographies', 'turkeys', 'viridiana', 'soapbox', 'baahubali', 'welty', 'maples', 'solstice', 'pelletz', 'dissolution', 'cornwell', 'punishments', 'epitaph', 'inhofe', 'chatterjee', 'aloha']


In [61]:
# Jointly train the autoencoders of the original domains (prints the losses at every step).
trainAutoencodersInitial(samples, encoders, decoders, discriminator, num_samples, original_domains, epochs=num_epochs, weight=weight)


Domain 0, Epoch 1:
	Reconstruction Loss: 0.18974536657333374
	Divergence Loss: 1.4089030027389526
Domain 0, Epoch 2:
	Reconstruction Loss: 0.13334870338439941
	Divergence Loss: 1.4225118160247803
Domain 0, Epoch 3:
	Reconstruction Loss: 0.07649147510528564
	Divergence Loss: 1.440427541732788
Domain 0, Epoch 4:
	Reconstruction Loss: 0.0489310622215271
	Divergence Loss: 1.4419782161712646
Domain 0, Epoch 5:
	Reconstruction Loss: -0.009554445743560791
	Divergence Loss: 1.462437391281128
Domain 0, Epoch 6:
	Reconstruction Loss: -0.10720270872116089
	Divergence Loss: 1.4892263412475586
Domain 0, Epoch 7:
	Reconstruction Loss: -0.17957699298858643
	Divergence Loss: 1.5167958736419678
Domain 0, Epoch 8:
	Reconstruction Loss: -0.249464213848114
	Divergence Loss: 1.5463027954101562
Domain 0, Epoch 9:
	Reconstruction Loss: -0.333532452583313
	Divergence Loss: 1.5867674350738525
Domain 0, Epoch 10:
	Reconstruction Loss: -0.41586029529571533
	Divergence Loss: 1.635570764541626
Domain 0, Epoch 11:


Domain 0, Epoch 85:
	Reconstruction Loss: 0.48573654890060425
	Divergence Loss: 0.22616234421730042
Domain 0, Epoch 86:
	Reconstruction Loss: 0.45798158645629883
	Divergence Loss: 0.21308661997318268
Domain 0, Epoch 87:
	Reconstruction Loss: 0.4479249119758606
	Divergence Loss: 0.21275237202644348
Domain 0, Epoch 88:
	Reconstruction Loss: 0.4343830347061157
	Divergence Loss: 0.2037571370601654
Domain 0, Epoch 89:
	Reconstruction Loss: 0.4366544485092163
	Divergence Loss: 0.20952829718589783
Domain 0, Epoch 90:
	Reconstruction Loss: 0.4278284013271332
	Divergence Loss: 0.2173108160495758
Domain 0, Epoch 91:
	Reconstruction Loss: 0.3864607512950897
	Divergence Loss: 0.23116466403007507
Domain 0, Epoch 92:
	Reconstruction Loss: 0.3799397051334381
	Divergence Loss: 0.23971311748027802
Domain 0, Epoch 93:
	Reconstruction Loss: 0.36212873458862305
	Divergence Loss: 0.27255532145500183
Domain 0, Epoch 94:
	Reconstruction Loss: 0.35209590196609497
	Divergence Loss: 0.29946786165237427
Domain 0

Domain 1, Epoch 68:
	Reconstruction Loss: 0.5511907935142517
	Divergence Loss: 0.0029586083255708218
Domain 1, Epoch 69:
	Reconstruction Loss: 0.5698443651199341
	Divergence Loss: 0.0028308588080108166
Domain 1, Epoch 70:
	Reconstruction Loss: 0.566948413848877
	Divergence Loss: 0.0027440800331532955
Domain 1, Epoch 71:
	Reconstruction Loss: 0.5435666441917419
	Divergence Loss: 0.0027027197647839785
Domain 1, Epoch 72:
	Reconstruction Loss: 0.5555870532989502
	Divergence Loss: 0.0026355076115578413
Domain 1, Epoch 73:
	Reconstruction Loss: 0.5775895714759827
	Divergence Loss: 0.00252106343396008
Domain 1, Epoch 74:
	Reconstruction Loss: 0.5342122912406921
	Divergence Loss: 0.002468107733875513
Domain 1, Epoch 75:
	Reconstruction Loss: 0.53425532579422
	Divergence Loss: 0.002456900430843234
Domain 1, Epoch 76:
	Reconstruction Loss: 0.5065349340438843
	Divergence Loss: 0.002361276187002659
Domain 1, Epoch 77:
	Reconstruction Loss: 0.5537890791893005
	Divergence Loss: 0.002361312275752425

### Original First Sentence from 0 translated to 1 after training

In [62]:
# Repeat the domain 0 -> domain 1 translation of the same sequence, now with trained models.
translation = translate(seq, samples, encoders, decoders, original_domains[0], original_domains[1])
vecSeqToSentence(translation[0,:,:])

(1, 50, 100)
['(', 'cnn', ')', ')', 'although', 'the', 'although', 'although', 'although', 'although', '”', 'word', 'although', 'short', 'short', 'once', 'short', 'short', 'short', 'trouble', 'trouble', 'trouble', 'frustration', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_']


In [63]:
# Train the autoencoders of the remaining domains against the already-trained original domains.
trainAutoencodersWithPz(samples, encoders, decoders, discriminator, num_samples, original_domains, epochs=num_epochs, weight=weight)


Domain 2, Epoch 1:
	Reconstruction Loss: 0.534590482711792
	Divergence Loss: 14.22704029083252
Domain 2, Epoch 2:
	Reconstruction Loss: 0.3670617938041687
	Divergence Loss: 12.272100448608398
Domain 2, Epoch 3:
	Reconstruction Loss: 0.13843506574630737
	Divergence Loss: 9.682933807373047
Domain 2, Epoch 4:
	Reconstruction Loss: -0.10037904977798462
	Divergence Loss: 6.9949564933776855
Domain 2, Epoch 5:
	Reconstruction Loss: -0.36392509937286377
	Divergence Loss: 4.375701904296875
Domain 2, Epoch 6:
	Reconstruction Loss: -0.8513364195823669
	Divergence Loss: 2.4454500675201416
Domain 2, Epoch 7:
	Reconstruction Loss: -1.432175874710083
	Divergence Loss: 2.2602298259735107
Domain 2, Epoch 8:
	Reconstruction Loss: -2.2230114936828613
	Divergence Loss: 2.9392647743225098
Domain 2, Epoch 9:
	Reconstruction Loss: -3.078900098800659
	Divergence Loss: 3.732088804244995
Domain 2, Epoch 10:
	Reconstruction Loss: -3.9518747329711914
	Divergence Loss: 4.559657573699951
Domain 2, Epoch 11:
	Recons

Domain 2, Epoch 84:
	Reconstruction Loss: 0.576836884021759
	Divergence Loss: 0.0005801574443466961
Domain 2, Epoch 85:
	Reconstruction Loss: 0.5370258688926697
	Divergence Loss: 0.0005668889498338103
Domain 2, Epoch 86:
	Reconstruction Loss: 0.5871444344520569
	Divergence Loss: 0.0005667031509801745
Domain 2, Epoch 87:
	Reconstruction Loss: 0.532485842704773
	Divergence Loss: 0.0005634379922412336
Domain 2, Epoch 88:
	Reconstruction Loss: 0.5096260905265808
	Divergence Loss: 0.0005592532106675208
Domain 2, Epoch 89:
	Reconstruction Loss: 0.5665640830993652
	Divergence Loss: 0.000562068191356957
Domain 2, Epoch 90:
	Reconstruction Loss: 0.5254238843917847
	Divergence Loss: 0.0005510277114808559
Domain 2, Epoch 91:
	Reconstruction Loss: 0.5253952741622925
	Divergence Loss: 0.0005468283197842538
Domain 2, Epoch 92:
	Reconstruction Loss: 0.5448578596115112
	Divergence Loss: 0.0005548103363253176
Domain 2, Epoch 93:
	Reconstruction Loss: 0.5414652228355408
	Divergence Loss: 0.0005565705359

### Original First Sentence from 0 translated to 2 after Training

In [64]:
# Translate the same domain-0 sequence into domain 2 and decode it into words.
translation = translate(seq, samples, encoders, decoders, 0, 2)
vecSeqToSentence(translation[0,:,:])

(1, 50, 100)
['washington', '—', 'although', 'although', 'although', 'the', 'discussion', 'once', 'short', 'short', 'because', 'short', 'short', 'short', 'short', 'short', 'short', 'short', 'short', '~?@_', 'trouble', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_', '~?@_']


## Evaluation with Rouge

In [None]:
# Display the evaluation frame.
eval_df

In [None]:
# Rebuild `contents` and `samples` from the evaluation frame.
# NOTE: this overwrites the training `contents` / `samples`; re-run the earlier cells to get them back.
contents = []
for pub in publications:
    contents.append(np.asarray(eval_df[eval_df['publication'] == pub]['content']))
    
# Pad in place to the training max_seq_length. A longer sequence gets no padding
# (the multiplier is negative) and is truncated by the range(max_seq_length) loop below.
for content in contents:
    for seq in content:
        seq.extend([end_token] * (max_seq_length - len(seq)))
    
# NOTE(review): the 2-D indexing below presumably requires equal article counts per publication — confirm.
contents = np.asarray(contents)
samples = np.zeros(shape=(contents.shape[0], contents.shape[1], max_seq_length, word_dim))

# Replace every token with its word2vec embedding.
for i in range(contents.shape[0]):
    for j in range(contents.shape[1]):
        for k in range(max_seq_length):
            samples[i, j, k, :] = word2vec.wv[contents[i, j][k]]

In [None]:
# Detokenizer for turning token lists back into plain strings.
detok = TreebankWordDetokenizer()

# ROUGE scorer used by the evaluation functions: ROUGE-N up to 4-grams, ROUGE-L and ROUGE-W.
# NOTE(review): apply_best=True presumably yields a single p/r/f dict per metric, which the reporting code relies on — confirm.
evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                        max_n=4,
                        limit_length=True,
                        length_limit=100,
                        length_limit_type='words',
                        apply_avg=False,
                        apply_best=True,
                        alpha=0.5, # Default F1_score
                        weight_factor=1.2,
                        stemming=True)

In [None]:
def evaluateOnArticles(articles, encoder, decoder):
    """Score decoder(encoder(articles)) against `articles` with ROUGE and print the results.

    Both the input and the reconstructed embedding sequences are decoded to text
    (nearest vocabulary word per timestep, padding tokens dropped, tokens
    detokenized to a string) before scoring. The previous version passed the
    return value of `vecSeqToSentence`, which was None for every article.

    Args:
        articles: Array of embedded sequences, indexed as (article, timestep, dimension).
        encoder: Encoder applied to `articles`.
        decoder: Decoder applied to the encoder output.
    """
    def _to_text(vector_seq):
        # Nearest vocabulary word for each timestep of one embedded sequence.
        vectors = np.asarray(vector_seq)
        tokens = [
            word2vec.wv.similar_by_vector(vectors[t, :], topn=1)[0][0]
            for t in range(vectors.shape[0])
        ]
        # Padding is not part of the sentence, so keep it out of the scored text.
        return detok.detokenize([tok for tok in tokens if tok != end_token])

    translated = decoder(encoder(tf.convert_to_tensor(articles, dtype=tf.float32)))

    original_sentences = [_to_text(vectors) for vectors in articles]
    translated_sentences = [_to_text(vectors) for vectors in translated]

    scores = evaluator.get_scores(translated_sentences, original_sentences)

    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        print('\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * results['p'], 'R', 100.0 * results['r'], 'F1', 100.0 * results['f']))

In [None]:
def evaluate(articles, encoders, decoders):
    """Print ROUGE scores for every ordered pair of distinct domains.

    For each pair (i, j) with i != j, the articles of domain i are passed
    through encoder i and decoder j and scored by `evaluateOnArticles`.

    Args:
        articles: Embedded evaluation sequences, indexed by domain first.
        encoders: List of encoders, one per domain.
        decoders: List of decoders, one per domain.
    """
    # Domains are indexed by `publications` (the filtered list), not by
    # `selected_publications`; iterating over the latter raises IndexError as
    # soon as the filter drops one of the selected publications.
    num_domains = len(publications)

    for i in range(num_domains):
        for j in range(num_domains):
            if i == j:
                continue

            pub1 = publications[i]
            pub2 = publications[j]
            source_articles = articles[i]

            print(pub1,"to",pub2)
            evaluateOnArticles(source_articles, encoders[i], decoders[j])
            print()

In [None]:
# Score every cross-domain pair; `samples` here is whatever the name currently holds
# (the evaluation tensor if the evaluation preparation cell has been run).
evaluate(samples, encoders, decoders)