### Imports

In [1]:

import pickle as pkl
import pandas as pd
import pandas
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize 
import gensim 
from gensim.models import Word2Vec 
import rouge
from nltk.tokenize.treebank import TreebankWordDetokenizer

import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Activation, Bidirectional, Dropout, Input, concatenate, Reshape, TimeDistributed, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model
import tensorflow.keras.backend as K


### Loading Data

In [2]:
# Load the pre-tokenized articles; each row's `content` holds a list of word tokens.
# NOTE(review): unpickling can execute arbitrary code -- only load pickle files from a trusted source.
data = pd.read_pickle('data/tokenized.pkl')
data

Unnamed: 0.1,Unnamed: 0,publication,content
0,0,New York Times,"[washington, —, congressional, republicans, ha..."
1,1,New York Times,"[after, the, bullet, shells, get, counted, ,, ..."
2,2,New York Times,"[when, walt, disney, ’, s, “, bambi, ”, opened..."
3,3,New York Times,"[death, may, be, the, great, equalizer, ,, but..."
4,4,New York Times,"[seoul, ,, south, korea, —, north, korea, ’, s..."
...,...,...,...
47220,47220,BBC_tech,"[bt, is, introducing, two, initiatives, to, he..."
47221,47221,BBC_tech,"[computer, users, across, the, world, continue..."
47222,47222,BBC_tech,"[a, new, european, directive, could, put, soft..."
47223,47223,BBC_tech,"[the, man, making, sure, us, computer, network..."


In [3]:
# Word2Vec training corpus: one token list per article, across every publication.
# list(...) copies only the outer container; the token lists themselves are the same
# objects held by `data`, so later in-place edits to them are visible here as well.
all_sentences = list(data['content'])

### Getting Relevant publications

In [4]:
# selected_publications = [
#  'Breitbart',
#  'CNN',
#  'New York Times',
#  'NPR',
#  'Fox News',
#  'Reuters']
selected_publications = [
 'Breitbart',
 'CNN',
 'New York Times']

In [5]:
all_publications = list(set(data['publication']))
all_publications

['Reuters',
 'NPR',
 'Guardian',
 'Atlantic',
 'Business Insider',
 'Talking Points Memo',
 'Vox',
 'Washington Post',
 'New York Times',
 'CNN',
 'BBC_entertainment',
 'BBC_politics',
 'National Review',
 'BBC_business',
 'New York Post',
 'Fox News',
 'BBC_tech',
 'Breitbart',
 'BBC_sport',
 'Buzzfeed News']

In [6]:
# Keep the selected publications that have at least 3000 articles in `data`.
article_counts = data['publication'].value_counts()
publications = [
    name
    for name in selected_publications
    if article_counts.get(name, 0) >= 3000
]
publications

['Breitbart', 'CNN', 'New York Times']

In [7]:
# One object array of token lists per selected publication, in `publications` order.
contents = [
    np.asarray(data.loc[data['publication'] == pub, 'content'])
    for pub in publications
]

### Padding with special Character

In [8]:
# Length (in tokens) of the longest article over all selected publications.
max_seq_length = max(
    len(tokens)
    for publication_articles in contents
    for tokens in publication_articles
)
max_seq_length

307

In [9]:
end_token = '~?@_'

In [10]:
# Right-pad every article with `end_token` up to `max_seq_length`.
# extend() mutates the token lists in place. These are the same list objects that
# `data['content']` and `all_sentences` hold, so the padding tokens also end up in
# the corpus Word2Vec is trained on further down.
for content in contents:
    for seq in content:
        seq.extend([end_token] * (max_seq_length - len(seq)))

In [11]:
max_seq_length = max([len(seq) for content in contents for seq in content])
max_seq_length

307

### Vectorize Words

In [12]:
word_dim = 100

In [13]:

# Train word embeddings of dimension `word_dim` on every article.
# min_count = 1 keeps every token in the vocabulary, so each word looked up later
# (including the padding token appended above) has a vector.
# NOTE(review): `size` is the keyword used by older gensim releases; newer ones
# presumably expect `vector_size` -- confirm against the installed version.
word2vec = gensim.models.Word2Vec(all_sentences, min_count = 1,  
                              size = word_dim, window = 5) 

In [14]:
# Sanity check of the embeddings: compare 'congress' with two related words.
for other_word in ('senate', 'house'):
    label = "Cosine similarity between 'congress' and '" + other_word + "' - CBOW : "
    print(label, word2vec.wv.similarity('congress', other_word))

Cosine similarity between 'congress' and 'senate' - CBOW :  0.69307524
Cosine similarity between 'congress' and 'house' - CBOW :  0.527322


In [15]:
word2vec.wv['congressional']

array([ 1.6861558 , -1.7420955 ,  1.3501394 , -2.9634385 ,  0.9303591 ,
        2.599446  ,  1.6641629 ,  0.7114974 ,  0.5416314 , -1.9632931 ,
       -1.0708419 ,  1.0549029 ,  0.28574467, -1.2248058 ,  2.1040874 ,
       -2.8837767 , -1.7665795 , -1.6989278 , -0.65262026,  0.12151986,
       -0.4298774 ,  0.16583833,  1.9424665 ,  2.9470232 ,  3.887441  ,
        0.6652238 , -0.34192714, -2.0701778 , -1.6376003 ,  2.294699  ,
       -0.63751656, -2.7813628 , -0.22627059,  0.6528427 , -0.14312306,
       -1.2744355 ,  1.2399527 ,  2.4943454 ,  0.90221465,  2.3333092 ,
       -0.1188041 ,  1.3899851 , -1.9874605 , -0.25326738,  2.286344  ,
       -2.2747042 , -0.7860327 , -0.3404817 ,  2.185882  ,  1.0878699 ,
       -0.6699287 , -1.5642676 , -0.9496857 , -1.855672  , -0.2806676 ,
        0.22064644, -0.42018828,  0.5700985 ,  1.4853765 , -1.3621356 ,
        0.7426257 , -0.24489874, -0.60345906,  0.55355644,  0.666603  ,
       -2.3434956 ,  2.2809849 ,  1.6558722 , -0.6875949 , -0.98

In [16]:
word2vec.wv.similar_by_vector(word2vec.wv['congressional'])

[('congressional', 1.0),
 ('house', 0.667590320110321),
 ('congress', 0.6488298177719116),
 ('senate', 0.6060051918029785),
 ('legislative', 0.5924760699272156),
 ('bipartisan', 0.5719712972640991),
 ('gop', 0.5696830749511719),
 ('lawmakers', 0.5650408864021301),
 ('republican', 0.5463024973869324),
 ('chamber', 0.5334421992301941)]

In [17]:
# Stack the per-publication arrays into one object array indexed as contents[domain, article].
# NOTE(review): this only yields a 2-D array when every publication contributes the
# same number of articles -- confirm the dataset is balanced.
contents = np.asarray(contents)

In [18]:
# Dense embedding tensor indexed as samples[domain, article, timestep, embedding_dim].
# Every consumer in this notebook converts the data to float32 before feeding the
# models, so storing float32 loses nothing and needs half the memory of NumPy's
# default float64.
samples = np.zeros(
    shape=(contents.shape[0], contents.shape[1], max_seq_length, word_dim),
    dtype=np.float32,
)

In [19]:
# Fill the tensor with the Word2Vec embedding of every token of every article.
num_domains, num_articles = contents.shape[0], contents.shape[1]
for domain_idx in range(num_domains):
    for article_idx in range(num_articles):
        tokens = contents[domain_idx, article_idx][:max_seq_length]
        # Indexing the keyed vectors with a list of words returns one row per word.
        samples[domain_idx, article_idx, :, :] = word2vec.wv[tokens]

In [20]:
# TODO use closest cosine distance to find output word.

## Function Definitions

In [21]:
def squareError(xTrue, xPred):
    """Return the element-wise squared difference between `xTrue` and `xPred`."""
    difference = xTrue - xPred
    return K.square(difference)


In [22]:
def reconstructionLoss(sample, encoder, decoder, f_w, weight): # (L_1 from the paper)
    """Autoencoder objective: reconstruction error plus a weighted discriminator term.

    Args:
        sample: batch of input sequences.
        encoder: callable returning the latent pair [z, n] for a batch.
        decoder: callable mapping [z, n] back to sequences shaped like `sample`.
        f_w: discriminator, called on the latent pair [z, n].
        weight: factor applied to the log-discriminator term.

    Returns:
        Scalar tensor: mean squared reconstruction error plus the mean of
        `weight * log(f_w(encoder(sample)))`.
    """
    # Encode once and reuse the result for both terms. The original ran the
    # encoder twice per call, doubling the cost of its forward/backward pass.
    encoded = encoder(sample)
    reconstruction_error = K.mean(squareError(sample, decoder(encoded)))
    adversarial_term = K.mean(weight * K.log(f_w(encoded)))
    return reconstruction_error + adversarial_term


In [23]:
def divergenceLoss(f_w, encoder, sample, z_j, n_j): # Mean of log f_w(E_theta_i(x_j)) + log (1-f_w(z_j, n_j)) from the paper (L_2).
    """Discriminator objective (L_2): mean log f_w(E(x)) + mean log(1 - f_w(z_j, n_j)).

    Args:
        f_w: discriminator, called on a latent pair [z, n]; its output is fed to log().
        encoder: callable returning the latent pair [z, n] for a batch.
        sample: batch of input sequences to encode.
        z_j: reference z codes.
        n_j: reference n codes.

    Returns:
        Scalar tensor with the value of the objective.
    """
    eps = K.epsilon()
    # Keep both probabilities away from the values where log() diverges. A
    # saturated discriminator output would otherwise give -inf losses and NaN
    # gradients.
    prob_encoded = K.clip(f_w(encoder(sample)), eps, 1.0)
    prob_reference = K.clip(f_w([z_j, n_j]), 0.0, 1.0 - eps)
    return K.mean(K.log(prob_encoded)) + K.mean(K.log(1 - prob_reference))


In [24]:
def sample(data, domain, num_samples):
    """Draw a random batch (with replacement) from one domain of `data`.

    Args:
        data: array indexed as [domain, sample, timestep, feature].
        domain: index of the domain to draw from.
        num_samples: number of sequences to draw.

    Returns:
        float32 tensor holding the `num_samples` selected sequences.
    """
    num_available = data.shape[1]
    chosen = np.random.choice(num_available, num_samples, replace=True)
    batch = data[domain, chosen, :, :]
    return tf.convert_to_tensor(batch, dtype=tf.float32)


In [25]:
# Currently just selects the encoder's z output (the first of its two outputs); might want to do a matrix multiplication instead?
# pi_Z from the paper: projects a latent code (z, n) onto its z component.
def projectZ(encoded):
    """Return the z component, i.e. the first element, of an encoded pair."""
    z_component = encoded[0]
    return z_component

In [26]:
def projectN(encoded):
    """Return the n component, i.e. the second element, of an encoded pair."""
    n_component = encoded[1]
    return n_component

In [27]:
# takes in two inputs, n and z, and outputs samples.
def createDecoder(z_dims, n_dims, time_steps, output_dims):
    """Build a decoder mapping a latent pair (z, n) to a sequence.

    Args:
        z_dims: size of the z input vector.
        n_dims: size of the n input vector.
        time_steps: number of time steps in the generated sequence.
        output_dims: number of features per time step.

    Returns:
        Keras Model with inputs [z, n] and an output of shape
        (time_steps, output_dims) per sample.
    """
    # TODO MAYBE: Add in more regularization or different than dropout?
    latent_z = Input(shape=(z_dims,))
    latent_n = Input(shape=(n_dims,))
    latent = concatenate([latent_z, latent_n])

    flat_size = time_steps * output_dims
    # Expand the latent vector into one feature vector per time step for the LSTMs.
    hidden = Dense(flat_size)(latent)
    hidden = Reshape((time_steps, output_dims))(hidden)
    # TODO Reshape to enforce time_steps?
    hidden = Bidirectional(LSTM(64, activation='tanh', return_sequences=True))(hidden)
    hidden = Dropout(0.2)(hidden)
    hidden = Bidirectional(LSTM(64, activation='tanh', return_sequences=False))(hidden)
    hidden = Dropout(0.2)(hidden)

    # Linear projection back to a flat sequence, then restore the time axis.
    flat_output = Dense(flat_size, activation='linear')(hidden)
    sequence_output = Reshape((time_steps, output_dims))(flat_output)

    return Model(inputs=[latent_z, latent_n], outputs=sequence_output)

In [28]:
def createEncoder(time_steps, input_num, z_dims, n_dims):
    """Build an encoder mapping a sequence to the latent pair [z, n].

    Args:
        time_steps: number of time steps per input sequence.
        input_num: number of features per time step.
        z_dims: size of the z output vector.
        n_dims: size of the n output vector.

    Returns:
        Keras Model with one sequence input and the two outputs [z, n].
    """
    # TODO MAYBE: Add in more regularization or different than dropout?
    sequence_input = Input(shape=(time_steps, input_num,))

    hidden = Bidirectional(LSTM(64, activation='tanh', return_sequences=True))(sequence_input)
    hidden = Dropout(0.2)(hidden)
    hidden = Bidirectional(LSTM(64, activation='tanh', return_sequences=False))(hidden)
    hidden = Dropout(0.2)(hidden)

    # Two linear heads on the same summary vector: one for z, one for n.
    z_head = Dense(z_dims, activation='linear')(hidden)
    n_head = Dense(n_dims, activation='linear')(hidden)

    return Model(inputs=sequence_input, outputs=[z_head, n_head])

In [29]:
def createDiscriminator(z_dims, n_dims):
    """Build the discriminator f_w scoring a latent pair (z, n).

    Args:
        z_dims: size of the z input vector.
        n_dims: size of the n input vector.

    Returns:
        Keras Model with inputs [z, n] and a single sigmoid output per sample.
    """
    latent_z = Input(shape=(z_dims,))
    latent_n = Input(shape=(n_dims,))
    latent = concatenate([latent_z, latent_n])

    # Hidden layer widths (150, 100) are arbitrary for now.
    hidden = Dense(150, activation='relu')(latent)
    hidden = Dense(100, activation='relu')(hidden)
    score = Dense(1, activation='sigmoid')(hidden)

    return Model(inputs=[latent_z, latent_n], outputs=score)

In [30]:
# Learning rate and one Adam optimizer per role. The training functions below use
# `enc_optimizer` / `dec_optimizer` for every domain's encoder / decoder and
# `disc_optimizer` for the single discriminator.
# NOTE(review): sharing one optimizer across several models relies on the optimizer
# accepting new variables after its first use -- confirm for the installed TF version.
lr = 5e-4
enc_optimizer = tf.keras.optimizers.Adam(lr)
dec_optimizer = tf.keras.optimizers.Adam(lr)
disc_optimizer = tf.keras.optimizers.Adam(lr)

### When $P_Z$ is known... 

In [31]:

# k is num of domains.
# encoders is a list of encoders.
# decoders is list of decoders.
# samples is a K x N x Timesteps x dim, array of samples, where the 0th index is the domain,
# the 1th index is the # of the sample in that domain, 2th index is the # timesteps per sequence, 3th index is the #
# of dimensions at each timestep
# original_domains is a list of the original domains P_z was derived from.

# Currently assuming P_Z is known. Must approximate P_Z first.
def trainAutoencodersWithPz(samples, encoders, decoders, discriminator, num_samples, original_domains, epochs=10, weight=1.0):
    """Train the autoencoders of all domains that are NOT in `original_domains`.

    For each such domain i, one domain is picked at random from
    `original_domains`; the z codes produced by its encoder serve as samples of
    the known latent distribution P_Z. Encoder/decoder i then minimise the
    reconstruction loss (L_1) while the discriminator ascends the divergence
    objective (L_2).

    Args:
        samples: array indexed as [domain, sample, timestep, feature].
        encoders: list of per-domain encoders, each returning [z, n].
        decoders: list of per-domain decoders, each taking [z, n].
        discriminator: model called on a latent pair [z, n].
        num_samples: mini-batch size drawn for every update step.
        original_domains: domain indices that P_Z is derived from.
        epochs: number of update steps per domain; every "epoch" is a single
            step on one random mini-batch.
        weight: factor on the discriminator term of the reconstruction loss.

    Returns:
        None. The models are updated in place through the module-level
        optimizers and the losses of every step are printed.
    """
    num_domains = samples.shape[0]

    for i in range(num_domains):
        if i in original_domains:
            continue

        original_domain = np.random.choice(original_domains)
        encoder = encoders[i]
        decoder = decoders[i]
        original_encoder = encoders[original_domain]

        # A Keras model called without `training=True` runs in inference mode,
        # which turns its Dropout layers into no-ops. The forward passes that
        # are differentiated below therefore go through these wrappers.
        def train_encoder(batch, _model=encoder):
            return _model(batch, training=True)

        def train_decoder(latent, _model=decoder):
            return _model(latent, training=True)

        for epoch in range(epochs): # TODO: could also do until some convergence criteria.
            p_Xi_samples = sample(samples, i, num_samples)
            # Reference latent samples for the discriminator. They are computed
            # outside the tapes (in inference mode), so they act as constants.
            p_Z_samples = projectZ(original_encoder(sample(samples, original_domain, num_samples)))
            p_Ni_samples = projectN(encoder(sample(samples, i, num_samples)))

            with tf.GradientTape() as enc_tape, tf.GradientTape() as dec_tape, tf.GradientTape() as disc_tape:

                reconstruction_loss = reconstructionLoss(p_Xi_samples, train_encoder, train_decoder, discriminator, weight)

                # negative b/c gradient ascent.
                divergence_loss = -1 * divergenceLoss(discriminator, train_encoder, p_Xi_samples, p_Z_samples, p_Ni_samples)

            gradients_of_encoder = enc_tape.gradient(reconstruction_loss, encoder.trainable_variables)
            gradients_of_decoder = dec_tape.gradient(reconstruction_loss, decoder.trainable_variables)
            gradients_of_discriminator = disc_tape.gradient(divergence_loss, discriminator.trainable_variables)

            enc_optimizer.apply_gradients(zip(gradients_of_encoder, encoder.trainable_variables))
            dec_optimizer.apply_gradients(zip(gradients_of_decoder, decoder.trainable_variables))
            disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

            print('Domain {}, Epoch {}:\n\tReconstruction Loss: {}\n\tDivergence Loss: {}'.format(i, epoch+1, reconstruction_loss, divergence_loss))
            

### When $P_Z$ is unknown...
"A straight-forward approach for learning the latent distribution PZ is to train a regularized autoencoder on data from a
single representative domain. However, such a representation could potentially capture variability that is specific to
that one domain. To learn a more invariant latent representation, we propose the following extension of our autoencoder
framework. The basic idea is to alternate between training
multiple autoencoders until they agree on a latent representation that is effective for their respective domains. This is
particularly relevant for applications to biology; for example, often one is interested in learning a latent representation
that integrates all of the data modalities."

In [32]:

# k is num of domains.
# encoders is a list of encoders.
# decoders is list of decoders.
# samples is a K x N x Timesteps x dim, array of samples, where the 0th index is the domain,
# the 1th index is the # of the sample in that domain, 2th index is the # timesteps per sequence, 3th index is the #
# of dimensions at each timestep
# domains is a list of the domains we are currently training over.

def trainAutoencodersInitial(samples, encoders, decoders, discriminator, num_samples, domains, epochs=10, weight=1.0):
    """Train the autoencoders of `domains` against each other's latent codes.

    For every ordered pair (i, j) of distinct domains, `epochs` update steps
    are run in which encoder/decoder i minimise the reconstruction loss (L_1)
    and the discriminator ascends the divergence objective (L_2). The z codes
    of encoder j serve as the reference latent samples.

    Args:
        samples: array indexed as [domain, sample, timestep, feature].
        encoders: list of per-domain encoders, each returning [z, n].
        decoders: list of per-domain decoders, each taking [z, n].
        discriminator: model called on a latent pair [z, n].
        num_samples: mini-batch size drawn for every update step.
        domains: domain indices to train over.
        epochs: number of update steps per (i, j) pair; every "epoch" is a
            single step on one random mini-batch.
        weight: factor on the discriminator term of the reconstruction loss.

    Returns:
        None. The models are updated in place through the module-level
        optimizers and the losses of every step are printed.
    """
    for i in domains:
        encoder = encoders[i]
        decoder = decoders[i]

        # A Keras model called without `training=True` runs in inference mode,
        # which turns its Dropout layers into no-ops. The forward passes that
        # are differentiated below therefore go through these wrappers.
        def train_encoder(batch, _model=encoder):
            return _model(batch, training=True)

        def train_decoder(latent, _model=decoder):
            return _model(latent, training=True)

        for j in domains:
            if i == j:
                continue

            j_encoder = encoders[j]
            for epoch in range(epochs): # TODO: could also do until some convergence criteria.
                p_Xi_samples = sample(samples, i, num_samples)
                # Reference latent samples for the discriminator. They are
                # computed outside the tapes (in inference mode), so they act
                # as constants.
                p_Zj_samples = projectZ(j_encoder(sample(samples, j, num_samples)))
                p_Ni_samples = projectN(encoder(sample(samples, i, num_samples)))

                with tf.GradientTape() as enc_tape, tf.GradientTape() as dec_tape, tf.GradientTape() as disc_tape:

                    reconstruction_loss = reconstructionLoss(p_Xi_samples, train_encoder, train_decoder, discriminator, weight)

                    # negative b/c gradient ascent.
                    divergence_loss = -1 * divergenceLoss(discriminator, train_encoder, p_Xi_samples, p_Zj_samples, p_Ni_samples)

                gradients_of_encoder = enc_tape.gradient(reconstruction_loss, encoder.trainable_variables)
                gradients_of_decoder = dec_tape.gradient(reconstruction_loss, decoder.trainable_variables)
                gradients_of_discriminator = disc_tape.gradient(divergence_loss, discriminator.trainable_variables)

                enc_optimizer.apply_gradients(zip(gradients_of_encoder, encoder.trainable_variables))
                dec_optimizer.apply_gradients(zip(gradients_of_decoder, decoder.trainable_variables))
                disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))

                print('Domain {}, Epoch {}:\n\tReconstruction Loss: {}\n\tDivergence Loss: {}'.format(i, epoch+1, reconstruction_loss, divergence_loss))
            

In [33]:
# samples is a K x N x Timesteps x dim, array of samples, where the 0th index is the domain,
# the 1th index is the # of the sample in that domain, 2th index is the # timesteps per sequence, 3th index is the #
# of dimensions at each timestep

def initModel(samples, z_dims, n_dims):
    """Create one shared discriminator plus an encoder/decoder pair per domain.

    Args:
        samples: array indexed as [domain, sample, timestep, feature]; only its
            shape is read.
        z_dims: size of the z latent vector.
        n_dims: size of the n latent vector.

    Returns:
        Tuple (encoders, decoders, discriminator), where the two lists hold one
        model per domain.
    """
    num_domains = samples.shape[0]
    time_steps, dim = samples.shape[2], samples.shape[3]

    discriminator = createDiscriminator(z_dims, n_dims)

    encoders, decoders = [], []
    for _ in range(num_domains):
        encoders.append(createEncoder(time_steps, dim, z_dims, n_dims))
        decoders.append(createDecoder(z_dims, n_dims, time_steps, dim))

    return encoders, decoders, discriminator

In [34]:
def translate(start_sequences, samples, encoders, decoders, start_domain, end_domain):
    """Translate sequences from `start_domain` into `end_domain`.

    The z code comes from the start domain's encoder applied to
    `start_sequences`. The n code comes from the end domain's encoder applied
    to an equally sized random batch of end-domain samples. Both are decoded
    with the end domain's decoder.

    Args:
        start_sequences: batch of sequences from the start domain.
        samples: array indexed as [domain, sample, timestep, feature].
        encoders: list of per-domain encoders.
        decoders: list of per-domain decoders.
        start_domain: index of the source domain.
        end_domain: index of the target domain.

    Returns:
        The decoded batch of end-domain sequences.
    """
    print(start_sequences.shape)
    batch_size = start_sequences.shape[0]

    shared_code = projectZ(encoders[start_domain](start_sequences))
    target_batch = sample(samples, end_domain, batch_size)
    domain_code = projectN(encoders[end_domain](target_batch))

    return decoders[end_domain]([shared_code, domain_code])
    

In [35]:
def vecSeqToSentence(sequence, verbose=True):
    """Map a sequence of embedding vectors to the closest vocabulary words.

    Args:
        sequence: 2-D tensor indexed as [timestep, embedding_dim].
        verbose: when True (the default), also print the resulting word list.

    Returns:
        List with, for every time step, the vocabulary word that the global
        `word2vec` model ranks most similar to that step's vector.
    """
    sequence = K.eval(sequence)
    # Only the best match is used, so ask for a single neighbour.
    sentence = [
        word2vec.wv.similar_by_vector(sequence[i, :], topn=1)[0][0]
        for i in range(sequence.shape[0])
    ]
    if verbose:
        print(sentence)
    # Return the words so callers can use them; the original only printed them.
    return sentence

In [43]:
# Sizes of the two latent vectors produced by every encoder.
n_dims = 60 # len(n)
z_dims = 240 # len(Z)

# Update steps per domain (pair) and mini-batch size, as passed to the training functions below.
num_epochs = 20
num_samples = 128

# Domains trained jointly first; their encoders later supply the latent samples for the remaining domains.
original_domains = [0, 1]



In [44]:
# samples = tf.convert_to_tensor(samples)

In [45]:
encoders, decoders, discriminator = initModel(samples, z_dims, n_dims)

### Original First Sentence from 0

In [46]:
' '.join(contents[0, 0])

'on tuesday ’ s broadcast of cnn ’ s “ situation room , ” cnn senior washington correspondent jeff zeleny stated that chelsea manning ’ s transition from a man to a woman “ certainly played into ” president obama ’ s decision to commute manning ’ s sentence , and “ without that , it ’ s hard to imagine , i think , this president would have done that . ” zeleny said , “ i think a question that this president the white house will have to answer here — will answer , and i think it ’ s an important one , if — how much was the personal story of chelsea manning involved in this , because the outcry from the left was so strong on this . and she ’ s having a difficult time in federal prison , no question . but , to me , that is a central question here . without that , you have to wonder if the outcome would be the same . i think it might not be . ” he added , “ [ b ] ecause she transitioned from a man to a woman , i think all of that certainly played into this . ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~

In [47]:
# Batch holding only the first article of domain 0, shaped (1, timesteps, embedding_dim).
seq = tf.convert_to_tensor(np.asarray([samples[0, 0, :, :]]), dtype=tf.float32)
# Translate it between the two jointly trained domains (the models are still untrained at this point).
translation = translate(seq, samples, encoders, decoders, original_domains[0], original_domains[1])

(1, 307, 100)


### Original First Sentence from 0 translated to 1 before Training (Random)

In [48]:

vecSeqToSentence(translation[0,:,:])


['lasorda', 'putinrf_eng', 'accommodates', 'saperstein', 'declared', 'rare', 'constables', 'still', 'mallaby', 'msnbchttps', 'ishaan', 'evergreen', 'insult.', 'toll', 'crucifying', 'expedition.', 'cbo', 'beiderbecke', 'mccarthy', 'zarah', 'community', 'schlozman', 'achievement.', 'olympique', "'jerry", 'superheated', 'serpico', 'probyn', '32m', 'shirking', 'masseur', 'patience.', 'flemington', 'laurate', 'gravitationally', 'aussie', 'luv', 'unopposed', 'subverts', 'boothe', 'anyanwu', 'bardsley', 'lockerbie', 'scrabble', 'interjections', 'foreshortened', 'duet', 'coherent', 'leaves', 'secrétariat', '3099', 'bridgettines', "'during", 'aberrations', 'decamp', 'kincaid', 'cements', 'activists', 'showin', 'lerach', 'k.', 'saalihah', 'northumberland.', 'shrove', 'disengagement', 'aboubakar', 'stiviano', 'sabolik', 'candidates', 'pleasingly', 'wasters', 'wilsonian', '3:06', 'whisky', 'worm.', 'hornby', 'zipper', 'badaling', 'defy', '20‑year‑old', 'easy-to-understand', 'your', 'obstruct', 'eq

In [None]:
trainAutoencodersInitial(samples, encoders, decoders, discriminator, num_samples, original_domains, epochs=num_epochs, weight=3)


Domain 0, Epoch 1:
	Reconstruction Loss: -0.5384466648101807
	Divergence Loss: 1.356541395187378
Domain 0, Epoch 2:
	Reconstruction Loss: -0.7197122573852539
	Divergence Loss: 1.3887667655944824
Domain 0, Epoch 3:
	Reconstruction Loss: -0.8584140539169312
	Divergence Loss: 1.4169220924377441
Domain 0, Epoch 4:
	Reconstruction Loss: -1.0154470205307007
	Divergence Loss: 1.4374992847442627
Domain 0, Epoch 5:
	Reconstruction Loss: -1.1578400135040283
	Divergence Loss: 1.4678691625595093
Domain 0, Epoch 6:
	Reconstruction Loss: -1.3188843727111816
	Divergence Loss: 1.4957636594772339
Domain 0, Epoch 7:
	Reconstruction Loss: -1.4426950216293335
	Divergence Loss: 1.5021741390228271
Domain 0, Epoch 8:
	Reconstruction Loss: -1.469617247581482
	Divergence Loss: 1.5013971328735352
Domain 0, Epoch 9:
	Reconstruction Loss: -1.5720322132110596
	Divergence Loss: 1.5008827447891235
Domain 0, Epoch 10:
	Reconstruction Loss: -1.6035209894180298
	Divergence Loss: 1.4994441270828247
Domain 0, Epoch 11:
	

### Original First Sentence from 0 translated to 1 after training

In [None]:
translation = translate(seq, samples, encoders, decoders, original_domains[0], original_domains[1])
vecSeqToSentence(translation[0,:,:])

In [None]:
trainAutoencodersWithPz(samples, encoders, decoders, discriminator, num_samples, original_domains, epochs=num_epochs, weight=3)


### Original First Sentence from 0 translated to 2 after Training

In [None]:
translation = translate(seq, samples, encoders, decoders, 0, 2)
vecSeqToSentence(translation[0,:,:])

## Evaluation with Rouge

In [None]:
df = pd.read_pickle('data/evaluation.pkl')
df

In [None]:
# Build the evaluation tensor the same way as the training tensor.
# NOTE: this rebinds `contents` and `samples`; from here on they hold the
# evaluation data, not the training data.
contents = []
for pub in publications:
    contents.append(np.asarray(df[df['publication'] == pub]['content']))

# Right-pad (in place) every article that is shorter than `max_seq_length`.
for content in contents:
    for seq in content:
        seq.extend([end_token] * (max_seq_length - len(seq)))

contents = np.asarray(contents)
# float32 is what the models consume; it needs half the memory of the default float64.
samples = np.zeros(
    shape=(contents.shape[0], contents.shape[1], max_seq_length, word_dim),
    dtype=np.float32,
)

for i in range(contents.shape[0]):
    for j in range(contents.shape[1]):
        # Articles longer than `max_seq_length` are truncated. One list lookup
        # returns the embeddings of all tokens of the article at once.
        samples[i, j, :, :] = word2vec.wv[contents[i, j][:max_seq_length]]

In [None]:
# Joins a list of tokens back into a single string.
detok = TreebankWordDetokenizer()

# ROUGE scorer: ROUGE-N (n-grams up to max_n = 4), ROUGE-L and ROUGE-W.
# NOTE(review): the length limit presumably restricts scoring to the first 100
# words of each text -- confirm against the rouge package documentation.
evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                        max_n=4,
                        limit_length=True,
                        length_limit=100,
                        length_limit_type='words',
                        apply_avg=False,
                        apply_best=True,
                        alpha=0.5, # Default F1_score
                        weight_factor=1.2,
                        stemming=True)

In [None]:
def evaluateOnArticles(articles, encoder, decoder):
    """Translate `articles` with `encoder` -> `decoder` and print ROUGE scores.

    Args:
        articles: array of embedded articles indexed as
            [article, timestep, embedding_dim].
        encoder: encoder of the source domain, returning [z, n].
        decoder: decoder of the target domain, taking [z, n].

    Returns:
        None. One line per ROUGE metric (precision, recall, F1) is printed.
    """
    def _toText(vector_sequence):
        # Both the inputs and the decoder outputs are embedding vectors, not
        # tokens. Map every vector to its closest vocabulary word before
        # joining the words into text.
        words = [
            word2vec.wv.similar_by_vector(vector, topn=1)[0][0]
            for vector in vector_sequence
        ]
        return detok.detokenize(words)

    translated = K.eval(decoder(encoder(tf.convert_to_tensor(articles, dtype=tf.float32))))

    original_sentences = [_toText(vectors) for vectors in articles]
    translated_sentences = [_toText(vectors) for vectors in translated]

    scores = evaluator.get_scores(translated_sentences, original_sentences)

    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        print('\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * results['p'], 'R', 100.0 * results['r'], 'F1', 100.0 * results['f']))

In [None]:
def evaluate(articles, encoders, decoders):
    """Print ROUGE scores for every ordered pair of distinct domains.

    Args:
        articles: array indexed as [domain, article, timestep, embedding_dim];
            its first axis follows the order of the global `publications` list
            the array was built from.
        encoders: list of per-domain encoders.
        decoders: list of per-domain decoders.

    Returns:
        None. Results are printed per (source, target) pair.
    """
    # Domain names come from the same `publications` list that defines the
    # first axis of `articles`, so index i always refers to the same outlet.
    # The original read them from an undefined `articles_df`.
    num_domains = len(articles)
    for i in range(num_domains):
        for j in range(num_domains):
            if i != j:
                print(publications[i], "to", publications[j])
                evaluateOnArticles(articles[i], encoders[i], decoders[j])
                print()

In [None]:
evaluate(samples, encoders, decoders)