In [1]:
%load_ext autoreload

from IPython.display import clear_output

In [23]:
%autoreload

import os
import sys
import pdb

import logging
import numpy as np
import cPickle
import codecs
import itertools

import tensorflow as tf
import pyrouge

from data_structure import load_data
from model import StrSumModel
from run import run
from rouge import rouge_n, rouge_l_sentence_level

In [11]:
def del_all_flags(FLAGS):
    """Delete every flag currently held by ``FLAGS``.

    The names yielded by ``FLAGS._flags()`` are copied into a list first, so
    the deletions cannot interfere with the iteration over that mapping.
    """
    for flag_name in list(FLAGS._flags()):
        FLAGS.__delattr__(flag_name)

# Wipe all flags already registered on tf.flags.FLAGS — presumably so the
# DEFINE_* cells below can be re-run in the same kernel without clashing
# with definitions from an earlier run.
del_all_flags(tf.flags.FLAGS)

# cli.py

In [12]:
# Special vocabulary tokens. Their ids are looked up through word_to_id when
# the PAD_IDX / UNK_IDX / BOS_IDX / EOS_IDX flags are registered.
PAD = '<pad>' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNK = '<unk>' # This has a vocab id, which is used to represent out-of-vocabulary words
BOS = '<p>' # This has a vocab id, which is used at the beginning of every decoder input sequence
EOS = '</p>' # This has a vocab id, which is used at the end of untruncated target sequences

In [13]:
# Command-line style configuration. `config` is the flag container that is
# handed to load_data() and StrSumModel() further down.
flags = tf.app.flags
config = flags.FLAGS

# Value written to CUDA_VISIBLE_DEVICES.
flags.DEFINE_string('gpu', '0', 'visible gpu')

flags.DEFINE_string('mode', 'train', 'set train or eval')

# Data / checkpoint locations.
flags.DEFINE_string('datadir', 'data', 'datadir')
flags.DEFINE_string('dataname', 'sports_fined.pkl', 'dataname')
flags.DEFINE_string('modeldir', 'model', 'modeldir')
flags.DEFINE_string('modelname', 'sports', 'modelname')

flags.DEFINE_bool('discourserank', True, 'discourserank')
flags.DEFINE_float('damp', 0.9, 'damping factor of discourserank')

# Training schedule.
flags.DEFINE_integer('epochs', 1000, 'epochs')
flags.DEFINE_integer('batch_size', 8, 'batch_size')
flags.DEFINE_integer('log_period', 500, 'log_period')

# Optimisation and decoding hyper-parameters.
flags.DEFINE_string('opt', 'Adagrad', 'optimizer')
flags.DEFINE_float('lr', 0.1, 'lr')
flags.DEFINE_float('norm', 1e-4, 'norm')
flags.DEFINE_float('grad_clip', 10.0, 'grad_clip')
flags.DEFINE_float('keep_prob', 0.95, 'keep_prob')
flags.DEFINE_integer('beam_width', 10, 'beam_width')
flags.DEFINE_float('length_penalty_weight', 0.0, 'length_penalty_weight')

# Model dimensions. The help text of the first two used to read 'dim_output'
# (copy-paste slip); it now names the flag it actually describes.
flags.DEFINE_integer('dim_hidden', 256, 'dim_hidden')
flags.DEFINE_integer('dim_str', 128, 'dim_str')
flags.DEFINE_integer('dim_sent', 384, 'dim_sent')

# for evaluation
flags.DEFINE_string('refdir', 'ref', 'refdir')
flags.DEFINE_string('outdir', 'out', 'outdir')

# Dummy flag. NOTE(review): presumably absorbs the `-f <connection file>`
# argument the Jupyter kernel is launched with, which flag parsing would
# otherwise reject — confirm before removing.
flags.DEFINE_string('f', '', 'kernel')

In [5]:
# Expose only the GPU(s) named by the `gpu` flag to CUDA. This is set before
# any tf.Session is created in this notebook.
os.environ['CUDA_VISIBLE_DEVICES'] = config.gpu

In [7]:
# Load the batched train/dev/test splits plus the embedding matrix and the
# vocabulary lookups (vocab: indexed by id, word_to_id: token -> id) as
# returned by the project's load_data(config).
train_batches, dev_batches, test_batches, embedding_matrix, vocab, word_to_id  = load_data(config)

Number of train examples: 37445


In [14]:
# Register values that are only known once the data is loaded as flags, so
# they travel with `config` like the statically defined ones.
n_embed, d_embed = embedding_matrix.shape
flags.DEFINE_integer('n_embed', n_embed, 'n_embed')
flags.DEFINE_integer('d_embed', d_embed, 'd_embed')

# Largest value of _max_sent_len(None) over every instance of every dev batch.
# NOTE(review): only dev_batches are inspected (not train/test) — confirm
# that this bound is sufficient for the other splits.
maximum_iterations = max([max([d._max_sent_len(None) for d in batch]) for ct, batch in dev_batches])
flags.DEFINE_integer('maximum_iterations', maximum_iterations, 'maximum_iterations')
# Vocabulary ids of the special tokens.
flags.DEFINE_integer('PAD_IDX', word_to_id[PAD], 'PAD_IDX')
flags.DEFINE_integer('UNK_IDX', word_to_id[UNK], 'UNK_IDX')
flags.DEFINE_integer('BOS_IDX', word_to_id[BOS], 'BOS_IDX')
flags.DEFINE_integer('EOS_IDX', word_to_id[EOS], 'EOS_IDX')

# train.py 

In [15]:
# Re-definition of the special tokens with the same values as in the cli.py
# section earlier in this notebook; re-running this cell changes nothing.
PAD = '<pad>' # This has a vocab id, which is used to pad the encoder input, decoder input and target sequence
UNK = '<unk>' # This has a vocab id, which is used to represent out-of-vocabulary words
BOS = '<p>' # This has a vocab id, which is used at the beginning of every decoder input sequence
EOS = '</p>' # This has a vocab id, which is used at the end of untruncated target sequences

In [17]:
%autoreload

from model import StrSumModel

# Instantiate the summarisation model from the flag container and call its
# build() method. The later cells read model.embeddings, model.loss, model.opt,
# model.root_token_idxs and model.summary_output_token_idxs from this object.
model = StrSumModel(config)
model.build()

In [18]:
# Close the session left over from a previous run of this cell, if there is
# one, before opening a fresh session.
if 'sess' in globals(): sess.close()
sess = tf.Session()
gvi = tf.global_variables_initializer()
sess.run(gvi)
# Overwrite model.embeddings with the loaded embedding matrix cast to float32.
# As the last expression of the cell, the assigned value is what gets displayed.
sess.run(model.embeddings.assign(embedding_matrix.astype(np.float32)))

array([[ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [ 0.    ,  0.    ,  0.    , ...,  0.    ,  0.    ,  0.    ],
       [-0.201 ,  0.3212, -0.027 , ...,  0.1667, -0.0982, -0.0186],
       ...,
       [-1.3045,  0.3266,  0.5752, ...,  0.5891, -0.035 , -0.6161],
       [ 0.0143,  0.353 , -0.5258, ...,  0.5506, -0.2468,  0.3779],
       [ 0.1307,  0.0051, -0.2055, ..., -0.11  , -0.2712,  0.3053]],
      dtype=float32)

In [28]:
def get_txt_from_idx(idxs, model, vocab):
    """Turn a sequence of token ids into a one-element list holding the text.

    Ids equal to ``model.config.EOS_IDX`` or ``model.config.PAD_IDX`` are
    dropped wherever they occur; every other id is mapped through ``vocab``
    and the resulting words are joined with single spaces.
    """
    words = []
    for token_id in idxs:
        if token_id == model.config.EOS_IDX or token_id == model.config.PAD_IDX:
            continue
        words.append(vocab[token_id])
    return [' '.join(words)]

def get_txt_from_tokens(tokens):
    """Join each token sequence in ``tokens`` into one space-separated string.

    Returns a list with one string per input sequence, in the same order.
    """
    texts = []
    for token_seq in tokens:
        texts.append(' '.join(list(token_seq)))
    return texts

def get_rouges(sess, model, batch, vocab, modes=[1, 2, 'l']):
    """Compute per-instance ROUGE scores for one batch.

    Runs ``model.root_token_idxs`` on the batch (test-mode feed dict), converts
    each predicted id sequence and each instance's ``summary_tokens`` to text,
    and scores the pair.

    Returns a list with one ``[rouge-1, rouge-2, rouge-l]`` entry per instance,
    each value being element ``[0]`` of the corresponding metric's result
    (presumably the F1 score — confirm against rouge.py).

    ``modes`` is accepted but not used.
    """
    predictions = sess.run(model.root_token_idxs,
                           feed_dict=model.get_feed_dict(batch, mode='test'))
    scores = []
    for instance, predicted_idxs in zip(batch, predictions):
        hypothesis = get_txt_from_idx(predicted_idxs, model, vocab)
        reference = get_txt_from_tokens([instance.summary_tokens])
        scores.append([
            rouge_n(hypothesis, reference, 1)[0],
            rouge_n(hypothesis, reference, 2)[0],
            rouge_l_sentence_level(hypothesis, reference)[0],
        ])
    return scores

In [29]:
def evaluate(sess, batches, model, vocab):
    """Evaluate the model over ``batches``.

    ``batches`` yields ``(counter, batch)`` pairs. For every batch the loss
    (``model.loss`` with a test-mode feed dict) and the per-instance ROUGE
    scores from ``get_rouges`` are collected.

    Returns ``(loss_mean, rouge_mean)``: the mean of the per-batch losses and a
    tuple with the column-wise mean of the per-instance ROUGE scores.
    """
    batch_losses = []
    instance_rouges = []
    for _, batch in batches:
        batch_loss = sess.run(model.loss,
                              feed_dict=model.get_feed_dict(batch, mode='test'))
        batch_rouges = get_rouges(sess, model, batch, vocab)
        batch_losses.append(batch_loss)
        instance_rouges.extend(batch_rouges)

    return np.mean(batch_losses), tuple(np.mean(instance_rouges, 0))

In [None]:
# Training loop.
#   * one optimiser step per training batch;
#   * every `log_period` steps: evaluate on dev, and only if the normalised dev
#     ROUGE sum is at least as good as every previously logged one, evaluate on
#     test and save a checkpoint; otherwise the last test numbers are carried
#     forward in the log;
#   * the whole log is re-printed after each evaluation.
losses_train = []   # training losses since the last log point
loss_log = []       # rows of (step, train loss, dev loss, test loss)
rouge_log = []      # rows of (dev R-1, R-2, R-L, test R-1, R-2, R-L)

saver = tf.train.Saver(max_to_keep=20)
if len(loss_log) == 0:
    # Fresh run (always true right after the reset above): start from an empty
    # checkpoint directory. Done with the standard library instead of shelling
    # out to `rm -r` / `mkdir`, so directory names containing spaces work and a
    # missing directory on the first run is not reported as an error.
    import shutil

    shutil.rmtree(config.modeldir, ignore_errors=True)
    if not os.path.isdir(config.modeldir):
        os.makedirs(config.modeldir)

for ct, batch in train_batches:
    feed_dict = model.get_feed_dict(batch)
    _, loss_train = sess.run([model.opt, model.loss], feed_dict = feed_dict)
    losses_train += [loss_train]
    if ct%config.log_period==0:
        loss_train = np.mean(losses_train)
        loss_dev, rouge_dev = evaluate(sess, dev_batches, model, vocab)

        if len(rouge_log) == 0:
            # Nothing to compare against yet.
            do_test = True
        else:
            # Rows 0..2 of the transposed log are the dev ROUGE-1/-2/-L histories.
            # list(...) keeps this working where zip() returns an iterator.
            dev_history = np.array(list(zip(*rouge_log)))[:3]
            # Divide each metric by its historical mean before summing; fall
            # back to the raw sum if any mean is exactly zero.
            norm = np.mean(dev_history, 1)
            if 0.0 in norm: norm = np.array([1, 1, 1], dtype=np.float32)
            rouge_judge = np.sum(np.array(rouge_dev)/norm)
            rouge_max = np.max(np.sum(dev_history/norm[:, np.newaxis], 0))
            do_test = (rouge_max <= rouge_judge)

        if do_test:
            loss_test, rouge_test = evaluate(sess, test_batches, model, vocab)
            modelpath = os.path.join(config.modeldir, config.modelname)
            saver.save(sess, modelpath, global_step=ct)
        else:
            # No improvement on dev: repeat the most recently logged test results.
            loss_test = loss_log[-1][3]
            rouge_test = tuple(rouge_log[-1][3:])

        loss_log += [(ct, loss_train, loss_dev, loss_test)]
        rouge_log += [rouge_dev + rouge_test]
        losses_train = []

        clear_output()
        for loss_row, rouge_row in zip(loss_log, rouge_log):
            # Single-argument print(...) produces the same line on Python 2 and 3;
            # the ' ' joiner reproduces the separator of the former `print a, ; print b`.
            print(('Step: %i | LOSS TRAIN: %.3f, DEV: %.3f, TEST: %.3f ' % loss_row) + ' ' +
                  ('| DEV ROUGE-1: %.3f, -2: %.3f, -L: %.3f | TEST ROUGE: -1: %.3f, -2: %.3f, -L: %.3f' % rouge_row))

Step: 1000 | LOSS TRAIN: 6.450, DEV: 6.294, TEST: 6.319  | DEV ROUGE-1: 0.055, -2: 0.001, -L: 0.007 | TEST ROUGE: -1: 0.049, -2: 0.001, -L: 0.006
Step: 1500 | LOSS TRAIN: 6.143, DEV: 6.035, TEST: 6.044  | DEV ROUGE-1: 0.055, -2: 0.002, -L: 0.046 | TEST ROUGE: -1: 0.042, -2: 0.001, -L: 0.036
Step: 2000 | LOSS TRAIN: 5.981, DEV: 5.886, TEST: 5.910  | DEV ROUGE-1: 0.057, -2: 0.005, -L: 0.049 | TEST ROUGE: -1: 0.058, -2: 0.004, -L: 0.049
Step: 2500 | LOSS TRAIN: 5.862, DEV: 5.827, TEST: 5.846  | DEV ROUGE-1: 0.070, -2: 0.006, -L: 0.056 | TEST ROUGE: -1: 0.071, -2: 0.006, -L: 0.058
Step: 3000 | LOSS TRAIN: 5.801, DEV: 5.742, TEST: 5.846  | DEV ROUGE-1: 0.068, -2: 0.004, -L: 0.053 | TEST ROUGE: -1: 0.071, -2: 0.006, -L: 0.058
Step: 3500 | LOSS TRAIN: 5.739, DEV: 5.675, TEST: 5.846  | DEV ROUGE-1: 0.072, -2: 0.004, -L: 0.054 | TEST ROUGE: -1: 0.071, -2: 0.006, -L: 0.058
Step: 4000 | LOSS TRAIN: 5.690, DEV: 5.628, TEST: 5.846  | DEV ROUGE-1: 0.055, -2: 0.002, -L: 0.038 | TEST ROUGE: -1: 0.071,

# eval.py

In [249]:
# num_examples,  train_batches, dev_batches, test_batches, embedding_matrix, vocab, word_to_id = load_data(config)

In [250]:
def write_files(write_dir, sents_dict):
    """Write one UTF-8 text file per entry of ``sents_dict``.

    Args:
        write_dir: directory the files are written into; it is not created
            here, so it has to exist already.
        sents_dict: mapping from an integer id to a sequence of sentence
            strings.

    Each file is named ``"%04d.txt" % id`` and contains the sentences separated
    by newlines, with no newline after the last one (an empty sequence yields
    an empty file).
    """
    for idx, sents in sents_dict.items():
        file_path = os.path.join(write_dir, "%04d.txt" % idx)
        # `with` closes the handle even if encoding or writing fails.
        with codecs.open(file_path, mode="w", encoding="utf-8") as f:
            f.write("\n".join(sents))

In [251]:
def write_ref(batches, config):
    """Write the reference summary of every instance to ``config.refdir``.

    ``batches`` yields ``(counter, batch)`` pairs. For each instance the
    ``summary_tokens`` are joined with spaces into a single sentence and stored
    under ``instance.idx``; the collected mapping is handed to ``write_files``.

    ``batches`` is traversed exactly once (the former unused pre-pass that
    flattened all instances has been dropped), so a one-shot iterator works too.
    """
    ref_sents_dict = {}
    for _, batch in batches:
        for instance in batch:
            ref_sents_dict[instance.idx] = [' '.join(instance.summary_tokens)]

    write_files(config.refdir, ref_sents_dict)

In [252]:
def write_out(batches, config, vocab):
    """Decode every batch with the last checkpoint and write the outputs to ``config.outdir``.

    A new session is opened, the last path listed in the checkpoint state of
    ``config.modeldir`` is restored through its ``.meta`` graph, and
    ``model.summary_output_token_idxs`` is run on each batch (test-mode feed
    dict). The decoded text of each instance is stored under ``instance.idx``
    and handed to ``write_files``.

    NOTE(review): ``model`` is not a parameter; the notebook-global ``model`` is
    used for both the feed dict and the fetched tensor. Confirm that the
    variables restored via ``import_meta_graph`` are the ones that tensor reads.

    Raises:
        IOError: if ``config.modeldir`` holds no checkpoint state.
    """
    with tf.Session() as sess:
        ckpt = tf.train.get_checkpoint_state(config.modeldir)
        # get_checkpoint_state returns None when nothing has been saved yet;
        # fail with a readable message instead of an AttributeError on None.
        if ckpt is None or not ckpt.all_model_checkpoint_paths:
            raise IOError('no checkpoint found in %s' % config.modeldir)
        model_path = ckpt.all_model_checkpoint_paths[-1]

        saver = tf.train.import_meta_graph(model_path + '.meta')
        saver.restore(sess, model_path)

        out_sents_dict = {}
        for _, batch in batches:
            feed_dict = model.get_feed_dict(batch, mode='test')
            output_token_idxs_batch = sess.run(model.summary_output_token_idxs, feed_dict = feed_dict)
            for output_token_idxs, instance in zip(output_token_idxs_batch, batch):
                out_sents_dict[instance.idx] = get_txt_from_idx(output_token_idxs, model, vocab)

        write_files(config.outdir, out_sents_dict)

In [272]:
def print_pyrouge(config):
    """Run pyrouge on the written system/reference files and print its report.

    System summaries are read from ``config.outdir`` and references from
    ``config.refdir``; files are paired through the numeric id in their names
    (``<id>.txt``). Nothing is returned.
    """
    logging.getLogger('global').setLevel(logging.WARNING) # silence pyrouge logging

    r = pyrouge.Rouge155()
    # Raw string: '\d' is meant as a regex class, not a Python string escape.
    # The pattern value itself is unchanged.
    r.system_filename_pattern = r'(\d+).txt'
    r.model_filename_pattern = '#ID#.txt'

    r.system_dir = config.outdir
    r.model_dir = config.refdir

    rouge_results = r.convert_and_evaluate()

    # Single-argument print(...) behaves the same on Python 2 and 3.
    print(rouge_results)

In [267]:
# Write the reference summaries of the test split into config.refdir.
write_ref(test_batches, config)

In [268]:
# Decode the test split with the last saved checkpoint and write the outputs into config.outdir.
write_out(test_batches, config, vocab)

INFO:tensorflow:Restoring parameters from model/sports-43500


In [271]:
# Score the files in config.outdir against config.refdir and print the ROUGE report.
print_pyrouge(config)

---------------------------------------------
1 ROUGE-1 Average_R: 0.13499 (95%-conf.int. 0.11759 - 0.15319)
1 ROUGE-1 Average_P: 0.09440 (95%-conf.int. 0.07995 - 0.10897)
1 ROUGE-1 Average_F: 0.09712 (95%-conf.int. 0.08402 - 0.11083)
---------------------------------------------
1 ROUGE-2 Average_R: 0.02487 (95%-conf.int. 0.01417 - 0.03775)
1 ROUGE-2 Average_P: 0.01454 (95%-conf.int. 0.00758 - 0.02334)
1 ROUGE-2 Average_F: 0.01519 (95%-conf.int. 0.00856 - 0.02288)
---------------------------------------------
1 ROUGE-L Average_R: 0.13132 (95%-conf.int. 0.11451 - 0.14908)
1 ROUGE-L Average_P: 0.09084 (95%-conf.int. 0.07705 - 0.10491)
1 ROUGE-L Average_F: 0.09378 (95%-conf.int. 0.08138 - 0.10706)

