### Imports

In [1]:

import pickle as pkl
import pandas as pd
import pandas
import numpy as np
from nltk.tokenize import sent_tokenize, word_tokenize 
import gensim 
from gensim.models import Word2Vec 
import rouge
from nltk.tokenize.treebank import TreebankWordDetokenizer

import tensorflow as tf
from tensorflow.keras.layers import Dense, LSTM, Activation, Bidirectional, Dropout, Input, concatenate, Reshape, TimeDistributed, Flatten
from tensorflow.keras.models import Sequential
from tensorflow.keras import Model
import tensorflow.keras.backend as K


### Loading Data

In [2]:
data = pd.read_pickle('data/tokenized.pkl')
data

Unnamed: 0.1,Unnamed: 0,publication,content
0,0,New York Times,"[WASHINGTON, —, Congressional, Republicans, ha..."
1,1,New York Times,"[After, the, bullet, shells, get, counted, ,, ..."
2,2,New York Times,"[When, Walt, Disney, ’, s, “, Bambi, ”, opened..."
3,3,New York Times,"[Death, may, be, the, great, equalizer, ,, but..."
4,4,New York Times,"[SEOUL, ,, South, Korea, —, North, Korea, ’, s..."
...,...,...,...
47220,47220,BBC_tech,"[BT, is, introducing, two, initiatives, to, he..."
47221,47221,BBC_tech,"[Computer, users, across, the, world, continue..."
47222,47222,BBC_tech,"[A, new, European, directive, could, put, soft..."
47223,47223,BBC_tech,"[The, man, making, sure, US, computer, network..."


In [3]:
all_sentences = [s for s in list(data['content'])]

AttributeError: 'list' object has no attribute 'lower'

### Getting Relevant publications

In [4]:
# selected_publications = [
#  'Breitbart',
#  'CNN',
#  'New York Times',
#  'NPR',
#  'Fox News',
#  'Reuters']
selected_publications = [
 'Breitbart',
 'CNN',
 'New York Times']

In [5]:
all_publications = list(set(data['publication']))
all_publications

['Business Insider',
 'Guardian',
 'BBC_business',
 'BBC_tech',
 'Breitbart',
 'CNN',
 'New York Post',
 'Talking Points Memo',
 'NPR',
 'Reuters',
 'BBC_entertainment',
 'Vox',
 'BBC_sport',
 'Washington Post',
 'BBC_politics',
 'Fox News',
 'National Review',
 'New York Times',
 'Atlantic',
 'Buzzfeed News']

In [6]:
# Take only the contents from publications with >= 3000 samples.
publications = [pub for pub in all_publications if len(data[data['publication'] == pub]) >= 3000 and pub in selected_publications]
publications

['Breitbart', 'CNN', 'New York Times']

In [7]:
contents = []
for pub in publications:
    contents.append(np.asarray(data[data['publication'] == pub]['content']))

### Padding with special Character

In [8]:
max_seq_length = max([len(seq) for content in contents for seq in content])
max_seq_length

307

In [9]:
end_token = '~?@_'

In [10]:
for content in contents:
    for seq in content:
        seq.extend([end_token] * (max_seq_length - len(seq)))

In [11]:
max_seq_length = max([len(seq) for content in contents for seq in content])
max_seq_length

307

### Vectorize Words

In [12]:
word_dim = 100

In [13]:

word2vec = gensim.models.Word2Vec(all_sentences, min_count = 1,  
                              size = word_dim, window = 5) 

In [14]:
print("Cosine similarity between 'congress' " + 
               "and 'senate' - CBOW : ", 
    word2vec.wv.similarity('congress', 'senate')) 
      
print("Cosine similarity between 'congress' " +
                 "and 'house' - CBOW : ", 
    word2vec.wv.similarity('congress', 'house')) 

Cosine similarity between 'congress' and 'senate' - CBOW :  0.627234
Cosine similarity between 'congress' and 'house' - CBOW :  0.322459


In [15]:
word2vec.wv['Congressional']

array([ 1.879404  , -0.6960241 , -2.002039  , -0.68278575,  2.5041    ,
        0.3254695 , -0.3144759 , -1.5881237 ,  1.9760429 , -1.7036556 ,
        0.8418549 , -2.9207084 ,  0.04419165, -0.1655924 , -1.0727352 ,
        0.21952105, -1.9195368 , -0.85202515, -0.63410944,  0.83780265,
        0.8345168 ,  1.2770362 , -0.98107356, -1.955772  , -1.9032052 ,
        2.6256256 ,  0.2640103 , -0.7434525 , -1.6298252 ,  0.47596285,
       -0.98084545, -0.17891128,  0.16102064, -0.59793156, -1.5453395 ,
        0.95639944, -0.328981  ,  0.98253804,  1.5304171 ,  0.09373295,
       -1.5775791 , -0.03829025, -0.8097161 ,  1.6272002 , -2.384383  ,
       -0.08229817, -1.5934962 , -0.7468626 , -1.3243375 ,  2.1236775 ,
       -0.4236945 ,  0.7089251 , -2.1274877 , -0.5048456 , -1.0742601 ,
        0.09096817,  0.9541912 ,  1.2403318 , -1.3420023 , -0.961968  ,
        2.5708044 , -0.27389082,  0.7332985 , -1.8888053 ,  0.46008825,
        0.46478906, -0.7850892 ,  1.3387531 ,  0.69220155, -0.54

In [16]:
word2vec.wv.similar_by_vector(word2vec.wv['Congressional'])

[('Congressional', 1.0000001192092896),
 ('Budget', 0.8493582010269165),
 ('Ethics', 0.7468824982643127),
 ('Government', 0.7225774526596069),
 ('Freedom', 0.7003310322761536),
 ('Information', 0.697040319442749),
 ('Fairness', 0.6889863610267639),
 ('Presidential', 0.6873148679733276),
 ('Responsible', 0.6756091117858887),
 ('CRA', 0.6655154228210449)]

In [17]:
contents = np.asarray(contents)

In [18]:
samples = np.zeros(shape=(contents.shape[0], contents.shape[1], max_seq_length, word_dim))

In [19]:
for i in range(contents.shape[0]):
    for j in range(contents.shape[1]):
        for k in range(max_seq_length):
            samples[i, j, k, :] = word2vec.wv[contents[i, j][k]]

In [20]:
# TODO use closest cosine distance to find output word.

## Funciton Definitions

In [70]:
def squareError(xTrue, xPred):
    return K.square(xTrue - xPred)


In [93]:
def reconstructionLoss(sample, encoder, decoder, f_w, weight): # (L_1 from the paper)
    return K.mean(squareError(sample, decoder(encoder(sample)))) + K.mean(weight*K.log(f_w(encoder(sample))))


In [95]:
def divergenceLoss(f_w, encoder, sample, z_j, n_j): # Mean of log f_w(E_theta_i(x_j)) + log (1-f_w(z_j, n_j)) from the paper (L_2).
    return K.mean(K.log(f_w(encoder(sample)))) + K.mean(K.log(1 - f_w([z_j, n_j])))


In [25]:
def sample(data, domain, num_samples):
    N = data.shape[1]
    return tf.convert_to_tensor(data[domain, np.random.choice(N, num_samples, replace=True),:,:], dtype=tf.float32)


In [26]:
# Currently just doing a restriction to the last z variables, might want to do a matrix multiplication?
# pi_Z from the paper. projects a latent distribution in (z, n) to z
def projectZ(encoded):
    return encoded[0] # take zs.

In [27]:
def projectN(encoded):
    return encoded[1] # taek Ns.

In [105]:
# takes in two inputs, n and z, and outputs samples.
def createDecoder(z_dims, n_dims, time_steps, output_dims):
    # TODO MAYBE: Add in more regularization or different than dropout?

    z_inputs = Input(shape=(z_dims,))
    n_inputs = Input(shape=(n_dims,))
    inputs = concatenate([z_inputs, n_inputs])
#     # 150 is arbitrary rn...
#     dense = Dense(150)(inputs)
    dense = Dense(time_steps*output_dims)(inputs)
    reshape = Reshape((time_steps, output_dims))(dense)
    # TODO Reshape to enforce time_steps?
    bilstm = Bidirectional(LSTM(32, activation='tanh', return_sequences=True))(reshape)
    bilstm = Dropout(0.2)(bilstm)
    bilstm = Bidirectional(LSTM(32, activation='tanh', return_sequences=False))(bilstm)
    
    dense = Dense(time_steps*output_dims, activation='linear')(bilstm)
    outputs = Reshape((time_steps, output_dims))(dense)
    
    model = Model(inputs=[z_inputs, n_inputs], outputs=outputs)
    
    return model

In [106]:
def createEncoder(time_steps, input_num, z_dims, n_dims):
    # TODO MAYBE: Add in more regularization or different than dropout?
    inputs = Input(shape=(time_steps, input_num,))
    bilstm = Bidirectional(LSTM(input_num, activation='tanh', return_sequences=True))(inputs)
    bilstm = Dropout(0.2)(bilstm)
    dense = Bidirectional(LSTM(32, activation='tanh', return_sequences=False))(bilstm)
    dense = Dropout(0.5)(dense)
    z_output = Dense(z_dims, activation='linear')(dense)
    n_output = Dense(n_dims, activation='linear')(dense)
    
    model = Model(inputs=inputs, outputs=[z_output, n_output])
    
    return model

In [107]:
def createDiscriminator(z_dims, n_dims):
    z_inputs = Input(shape=(z_dims,))
    n_inputs = Input(shape=(n_dims,))
    inputs = concatenate([z_inputs, n_inputs])
    
    # 150, 100 is arbitrary rn...
    dense = Dense(150, activation='relu')(inputs)
    dense = Dense(100, activation='relu')(dense)
    output = Dense(1, activation='sigmoid')(dense)
    
    model = Model(inputs=[z_inputs, n_inputs], outputs=output)
    
    return model

In [108]:
enc_optimizer = tf.keras.optimizers.Adam(5e-3)
dec_optimizer = tf.keras.optimizers.Adam(5e-3)
disc_optimizer = tf.keras.optimizers.Adam(5e-3)

### When $P_Z$ is known... 

In [152]:

# k is num of domains.
# encoders is a list of encoders.
# decoders is list of decoders.
# samples is a K x N x Timesteps x dim, array of samples, where the 0th index is the domain,
# the 1th index is the # of the sample in that domain, 2th index is the # timesteps per sequence, 3th index is the #
# of dimensions at each timestep
# original_domains is a list of the original domains P_z was derived from.

# Currently assuming P_Z is known. Must approximate P_Z first.
def trainAutoencodersWithPz(samples, encoders, decoders, discriminator, num_samples, original_domains, weight=1.0):
    N = samples.shape[1]
    k = samples.shape[0]
        
    
    for i in range(k):
        if i not in original_domains:
            original_domain = np.random.choice(original_domains)
            encoder = encoders[i]
            decoder = decoders[i]
            original_encoder = encoders[original_domain]
            epoch = 0
            while(epoch < 3): # TOOD: could also do until some convergence criteria.
                p_Xi_samples = sample(samples, i, num_samples)
                p_Z_samples = projectZ(original_encoder(sample(samples, original_domain, num_samples)))
                p_Ni_samples = projectN(encoder(sample(samples, i, num_samples)))

                with tf.GradientTape() as enc_tape, tf.GradientTape() as dec_tape, tf.GradientTape() as disc_tape:

                    reconstruction_loss = reconstructionLoss(p_Xi_samples, encoder, decoder, discriminator, weight)

                    # negative b/c gradient ascent.
                    divergence_loss = -1 * divergenceLoss(discriminator, encoder, p_Xi_samples, p_Z_samples, p_Ni_samples)

                gradients_of_encoder = enc_tape.gradient(reconstruction_loss, encoder.trainable_variables)
                gradients_of_decoder = dec_tape.gradient(reconstruction_loss, decoder.trainable_variables)
                gradients_of_discriminator = disc_tape.gradient(divergence_loss, discriminator.trainable_variables)


                enc_optimizer.apply_gradients(zip(gradients_of_encoder, encoder.trainable_variables))
                dec_optimizer.apply_gradients(zip(gradients_of_decoder, decoder.trainable_variables))
                disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
                
                print('Domain {}, Epoch {}:\n\tReconstruction Loss: {}\n\tDivergence Loss: {}'.format(i, epoch+1, reconstruction_loss, divergence_loss))
                epoch+=1
            

### When $P_Z$ is unknown...
"A straight-forward approach for learning the latent distribution PZ is to train a regularized autoencoder on data from a
single representative domain. However, such a representation could potentially capture variability that is specific to
that one domain. To learn a more invariant latent representation, we propose the following extension of our autoencoder
framework. The basic idea is to alternate between training
multiple autoencoders until they agree on a latent representation that is effective for their respective domains. This is
particularly relevant for applications to biology; for example, often one is interested in learning a latent representation
that integrates all of the data modalities."

In [136]:

# k is num of domains.
# encoders is a list of encoders.
# decoders is list of decoders.
# samples is a K x N x Timesteps x dim, array of samples, where the 0th index is the domain,
# the 1th index is the # of the sample in that domain, 2th index is the # timesteps per sequence, 3th index is the #
# of dimensions at each timestep
# domains is a list of the domains we are currently training over.

def trainAutoencodersInitial(samples, encoders, decoders, discriminator, num_samples, domains, weight=1.0):
    N = samples.shape[1]
    k = samples.shape[0]
    
    for i in domains:
        encoder = encoders[i]
        decoder = decoders[i]
        for j in domains:
            if i != j:
                j_encoder = encoders[j]
                epoch = 0
                while(epoch < 3): # TOOD: could also do until some convergence criteria.
                    p_Xi_samples = sample(samples, i, num_samples)
                    p_Zj_samples = projectZ(j_encoder(sample(samples, j, num_samples)))
                    p_Ni_samples = projectN(encoder(sample(samples, i, num_samples)))

                    with tf.GradientTape() as enc_tape, tf.GradientTape() as dec_tape, tf.GradientTape() as disc_tape:

                        reconstruction_loss = reconstructionLoss(p_Xi_samples, encoder, decoder, discriminator, weight)
#                         print(p_Xi_samples)

                        # negative b/c gradient ascent.
                        divergence_loss = -1 * divergenceLoss(discriminator, encoder, p_Xi_samples, p_Zj_samples, p_Ni_samples)
#                         print(p_Zj_samples)
#                         print(p_Ni_samples)
                        
                    gradients_of_encoder = enc_tape.gradient(reconstruction_loss, encoder.trainable_variables)
                    gradients_of_decoder = dec_tape.gradient(reconstruction_loss, decoder.trainable_variables)
                    gradients_of_discriminator = disc_tape.gradient(divergence_loss, discriminator.trainable_variables)
            

                    enc_optimizer.apply_gradients(zip(gradients_of_encoder, encoder.trainable_variables))
                    dec_optimizer.apply_gradients(zip(gradients_of_decoder, decoder.trainable_variables))
                    disc_optimizer.apply_gradients(zip(gradients_of_discriminator, discriminator.trainable_variables))
                    
                    print('Domain {}, Epoch {}:\n\tReconstruction Loss: {}\n\tDivergence Loss: {}'.format(i, epoch+1, reconstruction_loss, divergence_loss))
                    epoch+=1
            

In [137]:
# samples is a K x N x Timesteps x dim, array of samples, where the 0th index is the domain,
# the 1th index is the # of the sample in that domain, 2th index is the # timesteps per sequence, 3th index is the #
# of dimensions at each timestep

def initModel(samples, z_dims, n_dims):
    
    k = samples.shape[0]
    N = samples.shape[1]
    time_steps = samples.shape[2]
    dim = samples.shape[3]
    
    
    
    discriminator = createDiscriminator(z_dims, n_dims)
    
    encoders = []
    decoders = []
    
    for i in range(k):
        encoders.append(createEncoder(time_steps, dim, z_dims, n_dims))
        decoders.append(createDecoder(z_dims, n_dims, time_steps, dim))
    
    return encoders, decoders, discriminator

In [138]:
def translate(start_sequences, samples, encoders, decoders, start_domain, end_domain):
    N = samples.shape[1]
    print(start_sequences.shape)
    num_samples = start_sequences.shape[0]
    
    start_encoder = encoders[start_domain]
    end_encoder = encoders[end_domain]
    end_decoder = decoders[end_domain]
    
    z = projectZ(start_encoder(start_sequences))
    n = projectN(end_encoder(sample(samples, end_domain, num_samples)))
    
    end_sequences = end_decoder([z, n])
    return end_sequences
    

In [139]:
def vecSeqToSentence(sequence):
    sequence = K.eval(sequence)
    sentence = []
    for i in range(sequence.shape[0]):
        word = sequence[i,:]
#         print(word)
#         print(word2vec.wv.similar_by_vector(word))
        sentence.append(word2vec.wv.similar_by_vector(word)[0][0])
    print(sentence)

In [140]:
n_dims = 50 # len(n)
z_dims = 50 # len(Z)

num_epochs = 1
num_samples = 100

original_domains = [0, 1]



In [141]:
# samples = tf.convert_to_tensor(samples)

In [142]:
encoders, decoders, discriminator = initModel(samples, z_dims, n_dims)

In [143]:
' '.join(contents[0, 0])

'On Tuesday ’ s broadcast of CNN ’ s “ Situation Room , ” CNN Senior Washington Correspondent Jeff Zeleny stated that Chelsea Manning ’ s transition from a man to a woman “ certainly played into ” President Obama ’ s decision to commute Manning ’ s sentence , and “ Without that , it ’ s hard to imagine , I think , this president would have done that . ” Zeleny said , “ I think a question that this president the White House will have to answer here — will answer , and I think it ’ s an important one , if — how much was the personal story of Chelsea Manning involved in this , because the outcry from the left was so strong on this . And she ’ s having a difficult time in federal prison , no question . But , to me , that is a central question here . Without that , you have to wonder if the outcome would be the same . I think it might not be . ” He added , “ [ B ] ecause she transitioned from a man to a woman , I think all of that certainly played into this . ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~?@_ ~

In [144]:
seq = tf.convert_to_tensor(np.asarray([samples[0, 0, :, :]]), dtype=tf.float32)
translation = translate(seq, samples, encoders, decoders, original_domains[0], original_domains[1])

(1, 307, 100)


In [146]:

vecSeqToSentence(translation[0,:,:])

['Corsica', 'harbours', 'Assurance', 'not', 'Hoerbranz', 'cohered', '6,000-strong', 'ini', 'klansman', 'dudgeon', 'Seltzer', 'Splitting', 'WKMG', 'incarcerating', 'Suchitra', 'givemecookies', 'losses.', 'internists', 'intranet', 'recalled', 'Co-operation', 'unsolvable', 'Poms', 'monolingual', 'Vannevar', 'pap', 'Licht', 'guinea', 'butt.', '50mg', 'Montengro', "'Throughout", 'Hounds', 'biweekly', 'LAUGHTER', 'Fung', 'Femi', 'pigeonholed', 'secular.', 'sheikhs', 'idle', 'perused', 'Zawia', 'épater', 'City', 'recommitment', '31m', 'pulverizes', '1561', 'SB193', 'Fraternal', 'Manteca', 'Ryen', 'Chiemingo', 'newspapers', 'Chaikin', 'depends', 'hollows', '464,000', 'Purcellville', 'double-digit', 'Haka', '572,900', 'correctly.', 'unmemorable', 'Unprocessed', '1:27:34', '1981.', 'A1C', 'adopter', 'Instagramming', 'Carolinians', 'Photoshopped', 'Cottom', 'gamesmanship', 'arguable', 'glitziest', 'Rapides', 'snowboards', 'unconcern', 'Reidar', 'Hegeman', 'stooping', 'embrace', 'yeah.', 'causatio

In [147]:
trainAutoencodersInitial(samples, encoders, decoders, discriminator, num_samples, original_domains, weight=1.0, epochs=10)


Epoch 1:
	Reconstruction Loss: 0.9046552181243896
	Divergence Loss: 1.3308215141296387
Epoch 2:
	Reconstruction Loss: 0.8651494979858398
	Divergence Loss: 1.3565409183502197
Epoch 3:
	Reconstruction Loss: 0.7926397919654846
	Divergence Loss: 1.372704267501831
Epoch 1:
	Reconstruction Loss: 0.8889249563217163
	Divergence Loss: 1.4558162689208984
Epoch 2:
	Reconstruction Loss: 0.7556953430175781
	Divergence Loss: 1.5280747413635254
Epoch 3:
	Reconstruction Loss: 0.6468750238418579
	Divergence Loss: 1.6357190608978271


In [150]:
translate(seq, samples, encoders, decoders, original_domains[0], original_domains[1])
vecSeqToSentence(translation[0,:,:])

(1, 307, 100)
['Corsica', 'harbours', 'Assurance', 'not', 'Hoerbranz', 'cohered', '6,000-strong', 'ini', 'klansman', 'dudgeon', 'Seltzer', 'Splitting', 'WKMG', 'incarcerating', 'Suchitra', 'givemecookies', 'losses.', 'internists', 'intranet', 'recalled', 'Co-operation', 'unsolvable', 'Poms', 'monolingual', 'Vannevar', 'pap', 'Licht', 'guinea', 'butt.', '50mg', 'Montengro', "'Throughout", 'Hounds', 'biweekly', 'LAUGHTER', 'Fung', 'Femi', 'pigeonholed', 'secular.', 'sheikhs', 'idle', 'perused', 'Zawia', 'épater', 'City', 'recommitment', '31m', 'pulverizes', '1561', 'SB193', 'Fraternal', 'Manteca', 'Ryen', 'Chiemingo', 'newspapers', 'Chaikin', 'depends', 'hollows', '464,000', 'Purcellville', 'double-digit', 'Haka', '572,900', 'correctly.', 'unmemorable', 'Unprocessed', '1:27:34', '1981.', 'A1C', 'adopter', 'Instagramming', 'Carolinians', 'Photoshopped', 'Cottom', 'gamesmanship', 'arguable', 'glitziest', 'Rapides', 'snowboards', 'unconcern', 'Reidar', 'Hegeman', 'stooping', 'embrace', 'yea

In [153]:
trainAutoencodersWithPz(samples, encoders, decoders, discriminator, num_samples, original_domains, weight=1.0)

Domain 2, Epoch 1:
	Reconstruction Loss: 0.48000383377075195
	Divergence Loss: 1.581259846687317
Domain 2, Epoch 2:
	Reconstruction Loss: 0.4359170198440552
	Divergence Loss: 1.6108452081680298
Domain 2, Epoch 3:
	Reconstruction Loss: 0.3379688262939453
	Divergence Loss: 1.640911340713501


## Evaluation with Rouge

In [None]:
detok = TreebankWordDetokenizer()

evaluator = rouge.Rouge(metrics=['rouge-n', 'rouge-l', 'rouge-w'],
                        max_n=4,
                        limit_length=True,
                        length_limit=100,
                        length_limit_type='words',
                        apply_avg=False,
                        apply_best=True,
                        alpha=0.5, # Default F1_score
                        weight_factor=1.2,
                        stemming=True)

In [None]:
def evaluateOnArticles(articles, encoder, decoder):
    translated = decoder(encoder(articles))
       
    original_sentences = [detok.detokenize(tokens) for tokens in articles]
    
    translated_sentences = [vecSeqToSentence(tokens) for tokens in translated]
    
    scores = evaluator.get_scores(translated_sentences, original_sentences)
    
    for metric, results in sorted(scores.items(), key=lambda x: x[0]):
        print('\t{}:\t{}: {:5.2f}\t{}: {:5.2f}\t{}: {:5.2f}'.format(metric, 'P', 100.0 * results['p'], 'R', 100.0 * results['r'], 'F1', 100.0 * results['f']))

In [None]:
def evaluate(articles_df, encoders, decoders):
    
    publications = articles_df.publication.unique()
    for i in range(len(publications)):
        for j in range(len(publications)):
            if (i != j):
                pub1=publications[i]
                pub2=publications[j]
                source_articles = articles_df.loc[articles_df['publication'] == pub1]['content'].tolist()

                print(pub1,"to",pub2)
                evaluateOnArticles(source_articles, encoders[i], decoders[j])
                print()