In [1]:
import os
import sys
import textwrap

import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.client import device_lib
from configuration import config
from utils import data_utils
from ved_var_attn import VarSeq2SeqVarAttnModel

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Text wrapper limited to 100 columns; a later cell stores it in `config['wrapper']`.
wrapper = textwrap.TextWrapper(width=100)

# Show the devices reported by TensorFlow's device_lib for this machine.
local_devices = device_lib.list_local_devices()
print(local_devices)

[name: "/device:CPU:0"
device_type: "CPU"
memory_limit: 268435456
locality {
}
incarnation: 9173897902608761951
]


In [3]:
# Resolve every location relative to the directory the notebook is run from.
# Each directory string keeps its trailing separator (the empty last component
# passed to os.path.join) because file names are appended by plain string
# concatenation, e.g. `w2v_dir + 'w2v_model_news.pkl'` below.
current_dir = os.path.abspath('./')
data_dir = os.path.join(current_dir, 'data', '')
outputs_dir = os.path.join(current_dir, 'outputs', '')
arch_dir = os.path.join(outputs_dir, 'ved-var-attn', '')
outputs_data_dir = os.path.join(outputs_dir, 'data', '')

logs_dir = os.path.join(arch_dir, 'summary', '')
log_str_dir = os.path.join(arch_dir, 'outcome', '')
# Despite its name this is a file-name prefix, not a directory: the checkpoint
# number and '.ckpt' are appended to it when a checkpoint is selected.
model_checkpoint_dir = os.path.join(arch_dir, 'checkpoints', 'var-seq2seq-with-atten-')
# NOTE(review): the 'det-' prefix looks inherited from a deterministic variant of
# this experiment; kept unchanged so existing results stay where they are.
bleu_path = os.path.join(arch_dir, 'bleu', 'det-seq2seq-var-attn', '')
w2v_dir = outputs_data_dir
w2v_path = w2v_dir + 'w2v_model_news.pkl'

# Create every directory the run refers to, not just the top-level ones, so a
# fresh checkout does not depend on them already existing. `exist_ok=True`
# replaces the racy "check, then create" pattern.
for directory in (
        data_dir,
        outputs_dir,
        arch_dir,
        outputs_data_dir,
        logs_dir,
        log_str_dir,
        os.path.dirname(model_checkpoint_dir),
        bleu_path):
    os.makedirs(directory, exist_ok=True)

print('Data dir:', data_dir)
print('Outputs dir:', outputs_dir)
print('Arch dir:', arch_dir)

# Publish the resolved locations (and the shared text wrapper) through `config`,
# which is the object handed to the model.
config['data_dir'] = data_dir
config['logs_dir'] = logs_dir
config['log_str_dir'] = log_str_dir
config['model_checkpoint_dir'] = model_checkpoint_dir
config['bleu_path'] = bleu_path
config['wrapper'] = wrapper
config['w2v_dir'] = w2v_dir

# Data columns used as model input / target, and the characters passed as
# `filters` to the tokenizer for both sides.
input_type = 'content'
output_type = 'title'
decoder_filters = encoder_filters = '!"#$%&()*+/:;<=>@[\\]^`{|}~\t\n'

Data dir: /Users/m3hrdadfi/Projects/hooshvare/ved/data/
Outputs dir: /Users/m3hrdadfi/Projects/hooshvare/ved/outputs/
Arch dir: /Users/m3hrdadfi/Projects/hooshvare/ved/outputs/ved-var-attn/


In [4]:
# Display the full run configuration, including the paths added to it above,
# so the hyper-parameters used for this run are recorded with the notebook.
config

{'experiment': '?',
 'num_samples': 80000,
 'preprocessing': True,
 'lstm_hidden_units': 100,
 'embedding_size': 300,
 'num_layers': 1,
 'encoder_vocab': 40000,
 'decoder_vocab': 40000,
 'encoder_num_tokens': 50,
 'decoder_num_tokens': 30,
 'dropout_keep_prob': 0.8,
 'initial_learning_rate': 0.005,
 'learning_rate_decay': 0.75,
 'min_learning_rate': 1e-05,
 'latent_dim': 100,
 'word_dropout_keep_probability': 0.75,
 'z_temp': 1.0,
 'attention_temp': 1.0,
 'use_hmean': True,
 'gamma_val': 0.1,
 'batch_size': 100,
 'n_epochs': 30,
 'logs_dir': '/Users/m3hrdadfi/Projects/hooshvare/ved/outputs/ved-var-attn/summary/',
 'log_str_dir': '/Users/m3hrdadfi/Projects/hooshvare/ved/outputs/ved-var-attn/outcome/',
 'model_checkpoint_dir': '/Users/m3hrdadfi/Projects/hooshvare/ved/outputs/ved-var-attn/checkpoints/var-seq2seq-with-atten-',
 'bleu_path': '/Users/m3hrdadfi/Projects/hooshvare/ved/outputs/ved-var-attn/bleu/det-seq2seq-var-attn/',
 'w2v_dir': '/Users/m3hrdadfi/Projects/hooshvare/ved/outputs

In [6]:
print('[INFO] Importing the data')

# The three CSV files are looked up inside the configured data directory.
data_sources = [
    os.path.join(config['data_dir'], file_name)
    for file_name in ('articles1.csv', 'articles2.csv', 'articles3.csv')
]

# Sample size and the preprocessing switch both come from the run configuration.
loader_options = dict(
    num_samples=config['num_samples'],
    preprocessing=config['preprocessing'])
data = data_utils.create_news_data(data_sources, **loader_options)

[INFO] Importing the data


In [7]:
print('[INFO] Tokenizing input and output sequences')

# Pull the `input_type` and `output_type` columns out of the frame as arrays.
input_sentences, output_sentences = (
    data[column].values for column in (input_type, output_type))

print('Inputs:', len(input_sentences))
print('Outputs:', len(output_sentences))

# Encoder side: limits and filter characters come from the encoder settings.
encoder_tokenizer_args = dict(
    sentences=input_sentences,
    max_num_words=config['encoder_num_tokens'],
    max_vocab_size=config['encoder_vocab'],
    filters=encoder_filters)
x, word2idx_inputs, x_sen = data_utils.tokenize_sequence(**encoder_tokenizer_args)

# Decoder side: same call, driven by the decoder settings.
decoder_tokenizer_args = dict(
    sentences=output_sentences,
    max_num_words=config['decoder_num_tokens'],
    max_vocab_size=config['decoder_vocab'],
    filters=decoder_filters)
y, word2idx_outputs, y_sen = data_utils.tokenize_sequence(**decoder_tokenizer_args)

[INFO] Tokenizing input and output sequences
Inputs: 79988
Outputs: 79988


In [8]:
print('[INFO] Split data into train-valid-test sets')

# NOTE(review): the recorded shapes are roughly 70% / 15% / 15%, so `test_size`
# appears to be a fraction of the 30% held out by `valid_size` - confirm in
# `data_utils.create_data_split`.
splits = data_utils.create_data_split(
    x=[x, x_sen],
    y=[y, y_sen],
    valid_size=.3,
    test_size=.5,
    verbose=True)
train_data, valid_data, test_data = splits

# Each split unpacks into four items, in this order: x, y, x_sen, y_sen.
x_train, y_train, x_sen_train, y_sen_train = train_data
x_valid, y_valid, x_sen_valid, y_sen_valid = valid_data
x_test, y_test, x_sen_test, y_sen_test = test_data

[INFO] Split data into train-valid-test sets
[INFO] Training ...
X Shape: (55991, 50)
Y Shape: (55991, 30)

[INFO] Validating ...
X Shape: (11998, 50)
Y Shape: (11998, 30)

[INFO] Testing ...
X Shape: (11999, 50)
Y Shape: (11999, 30)



In [9]:
print('[INFO] Embeddings vector and matrix')

# Both matrices use the same embedding width and the same `w2v_path` file;
# only the word index differs between encoder and decoder.
shared_embedding_args = dict(
    embedding_dim=config['embedding_size'],
    w2v_path=w2v_path)

encoder_embeddings_matrix = data_utils.create_embedding_matrix(
    word_index=word2idx_inputs, **shared_embedding_args)
decoder_embeddings_matrix = data_utils.create_embedding_matrix(
    word_index=word2idx_outputs, **shared_embedding_args)

# Replace the configured vocabulary sizes with the sizes of the word-index
# dictionaries actually produced by tokenization.
config['encoder_vocab'], config['decoder_vocab'] = (
    len(word2idx_inputs), len(word2idx_outputs))

[INFO] Embeddings vector and matrix


In [12]:
# Everything the model constructor receives: the run configuration, the two
# embedding matrices and the two word-index dictionaries.
model_inputs = {
    'config': config,
    'encoder_embeddings_matrix': encoder_embeddings_matrix,
    'decoder_embeddings_matrix': decoder_embeddings_matrix,
    'encoder_word_index': word2idx_inputs,
    'decoder_word_index': word2idx_outputs,
}
model = VarSeq2SeqVarAttnModel(**model_inputs)

[INFO] Building models ...
----> [INFO] Create placeholders for inputs to the model.
----> [INFO] Create word embedding for encoder and decoder parts.
----> [INFO] Create encoder block.
----> [INFO] Create latent space.
----> [INFO] Create decoder block.


In [13]:
# Choose the checkpoint to restore.
# A non-zero `load_checkpoint` selects '<model_checkpoint_dir><number>.ckpt';
# zero (or a missing key) falls back to the latest checkpoint recorded in the
# 'models' directory.
checkpoint_number = config.get('load_checkpoint', 0)
if checkpoint_number != 0:
    checkpoint = config['model_checkpoint_dir'] + str(checkpoint_number) + '.ckpt'
else:
    # NOTE(review): this looks in 'models', not in the directory behind
    # config['model_checkpoint_dir'] - confirm that is where checkpoints are saved.
    checkpoint_state = tf.train.get_checkpoint_state(os.path.dirname('models/checkpoint'))
    # get_checkpoint_state returns None when no checkpoint state is available;
    # fail with a clear message instead of an AttributeError on None.
    if checkpoint_state is None:
        raise FileNotFoundError(
            "No checkpoint state found in 'models'; set config['load_checkpoint'] "
            "to the number of a saved checkpoint.")
    checkpoint = checkpoint_state.model_checkpoint_path

print('checkpoint:', checkpoint)

checkpoint: /Users/m3hrdadfi/Projects/hooshvare/ved/outputs/ved-var-attn/checkpoints/var-seq2seq-with-atten-7.ckpt


In [14]:
# Run the model on the test split using the selected checkpoint.
# NOTE(review): the recorded output shows parameters being restored and BLEU-1..4
# being printed by this call; presumably BLEU is scored against `y_sen_test` -
# confirm in `VarSeq2SeqVarAttnModel.predict`.
preds = model.predict(checkpoint, x_test, y_test, y_sen_test)

INFO:tensorflow:Restoring parameters from /Users/m3hrdadfi/Projects/hooshvare/ved/outputs/ved-var-attn/checkpoints/var-seq2seq-with-atten-7.ckpt
BLEU 1 to 4 : 0.19 | 0.14 | 0.13 | 0.11


In [16]:
# Number of test examples to show alongside their predictions.
count = 1000

# `log_str_dir` already ends with a separator, so formatting it with '%s/%s'
# produced '.../outcome//output_sentences.csv'; os.path.join yields a clean path.
# NOTE(review): presumably the file the sentences are written to - confirm in
# `show_output_sentences`.
output_sentences_path = os.path.join(config['log_str_dir'], 'output_sentences.csv')

model.show_output_sentences(
    preds[:count],
    y_test[:count],
    x_sen_test[:count],
    y_sen_test[:count],
    output_sentences_path)

Input:      Since Donald Trump named Stephen K. Bannon as his White House Senior Counselor and Reince Priebus as
his Chief of Staff, many media reports have emerged, speculating on several other names inside
Trump’s inner circle rumored to fill positions in the Trump administration.
Actual:     Speculation Circulates around President-elect Donald Trump’s Cabinet - Breitbart
Generated: the photographers of the trump unverified

Input:      MANILA — Since Rodrigo Duterte became president of the Philippines just over a month ago, promising
to get tough on crime by having the police and the military kill drug suspects, 420 people have been
killed in the campaign, according to tallies of police reports by the local news media.
Actual:     Body Count Rises as Philippine President Wages War on Drugs - The New York Times
Generated: the UNK of the UNK one

Input:      MUNICH — Years after World War II, American officials here entrusted more than 10, 000 confiscated
artworks to Bavarian authorit

issues by the time he went looking to buy an offshore company in 2010.
Actual:     Panama Papers include dozens of Americans tied to financial frauds
Generated: the UNK of the UNK

Input:      , I want to receive updates from partners and sponsors.
Actual:     The Edge: A Victory for Automatic Voter Registration
Generated: the UNK of the UNK

Input:      ’’ ’Tesla will unveil its Model 3 vehicle in Los Angeles at the end of March.’ ”Buzz before the
event has pushed Tesla stock back up above $230 per share, from a crater of around $140.
Actual:     A lot of people are making the same mistake about Tesla and the Model 3
Generated: the UNK of the UNK

Input:      , it can be downright dangerous.
Actual:     Confessions of a pissed-off flight attendant
Generated: the UNK of the make games building of the make games building of the make games building

Input:      Looks like Hampshire College has forgotten which country it is in: The school will no longer fly the
American flag on campus.
Ac

Input:      La La Land has taken the top honour at the Toronto film festival.
Actual:     La La Land tipped for Oscars glory after win at Toronto film festival
Generated: him let president is times a UNK

Input:      In Central California, the lawn is in a state of existential crisis.
Actual:     California’s Historic Drought Has Residents Getting Creative With Their Lawns
Generated: the UNK of watch

Input:      Maggie Haberman and Ashley Parker write in the New York Times: LAS VEGAS — Donald J. Trump has
shaken up his presidential campaign for the second time in two months, hiring a top executive from
the conservative website Breitbart News and promoting a senior adviser in an effort to right
Actual:     NY Times: Donald Trump, in Shake-Up, Hires Breitbart Executive Stephen K. Bannon for Top Campaign
Post - Breitbart
Generated: the trump teen is candidate me a UNK

Input:      (CNN) It might literally rain on Donald Trump’s parade.
Actual:     Weather for Trump’s inauguration looks g

Actual:     Wilbur Ross, Trump’s Commerce pick, offshored 2,700 jobs since 2004
Generated: the leader street stop clinton’s UNK

Input:      In an for Townhall, Never Trumper radio host Derek Hunter explains why he now feels compelled to
vote for Donald Trump.
Actual:     Radio Host Derek Hunter Breaks Ranks with Never Trump: ’The Media Needs to Be Destroyed’ - Breitbart
Generated: the UNK of the UNK

Input:      Sometimes, it actually pays to be a good person.
Actual:     Good Samaritan makes bank after returning $676 check
Generated: the new york not

Input:      Oil prices settled steady on Wednesday, erasing most of the day’s gains, after you. S. government
data showed crude inventories at peaks again despite strong refinery runs.
Actual:     Oil settles flat, far off day’s highs; you.S. build weighs
Generated: the UNK of the UNK of the UNK

Input:      Criminal charges are expected to be announced Wednesday as a result of the Michigan attorney
general’s investigation into the ongo

In [17]:
# Compute diversity metrics on the test split with the same checkpoint.
# NOTE(review): the recorded run was stopped with a KeyboardInterrupt at 0/3 of
# the progress bar, so this notebook holds no diversity results; re-run to
# completion (or remove the cell) before sharing.
model.get_diversity_metrics(checkpoint, x_test, y_test)

INFO:tensorflow:Restoring parameters from /Users/m3hrdadfi/Projects/hooshvare/ved/outputs/ved-var-attn/checkpoints/var-seq2seq-with-atten-7.ckpt


  0%|          | 0/3 [00:00<?, ?it/s]

KeyboardInterrupt: 