In [1]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [10]:
# Load the autoreload extension and invoke it once with no argument.
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so assigning it
# here presumably only affects child processes, not this kernel's own hashing -- confirm.
os.environ['PYTHONHASHSEED'] = '0'
# Request a single thread from numexpr / MKL / OpenMP; these are set before numpy is imported below.
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

import sys
import argparse
import subprocess
import pdb
import time
import random
import _pickle as cPickle
import matplotlib.pyplot as plt
import glob

%matplotlib inline

import numpy as np
import pandas as pd
import tensorflow as tf

from data_structure import get_batches
from hntm import HierarchicalNeuralTopicModel
from nhdp import nestedHierarchicalNeuralTopicModel
from tsgntm import TreeStructuredGaussianNeuralTopicModel
from tree import get_descendant_idxs
from evaluation import get_hierarchical_affinity, get_topic_specialization, print_topic_sample, print_topic_year
from configure import get_config

pd.set_option('display.max_columns', 30)

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# load data & set config

In [3]:
# Build the run configuration from the notebook name. `nb_name` is assigned in the kernel
# by the %%javascript cell at the top of the notebook, so that cell must have run first.
config = get_config(nb_name)

In [4]:
# Select the visible CUDA device(s) from the config and seed the NumPy and stdlib RNGs.
# NOTE(review): no TensorFlow-level seed is set in this cell -- confirm the model sets one
# if run-to-run reproducibility is required.
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu
np.random.seed(config.seed)
random.seed(config.seed)

In [5]:
# Load the pickled corpus splits and vocabulary, then pre-batch each split.
# NOTE: unpickling can execute arbitrary code -- only load data files from a trusted source.
# The file is opened in a `with` block so the handle is closed as soon as loading finishes.
with open(config.path_data, 'rb') as f_data:
    instances_train, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(f_data)
train_batches = get_batches(instances_train, config.batch_size)
dev_batches = get_batches(instances_dev, config.batch_size)
test_batches = get_batches(instances_test, config.batch_size)
# The bag-of-words dimensionality is the number of vocabulary indices kept in `bow_idxs`.
config.dim_bow = len(bow_idxs)

In [6]:
def debug(variable, sample_batch=None):
    """Evaluate `variable` in the global session on a single batch fed in test mode.

    Args:
        variable: fetch (or structure of fetches) handed to `sess.run`.
        sample_batch: batch to feed; when None, the second element of the
            first entry of the global `test_batches` is used.

    Returns:
        The value `sess.run` produces for `variable`.
    """
    batch = test_batches[0][1] if sample_batch is None else sample_batch
    return sess.run(variable, feed_dict=model.get_feed_dict(batch, mode='test'))

def check(variable):
    """Evaluate `variable` in a throw-away, freshly initialised session.

    Unlike `debug`, this does not touch the global session: it opens its own
    `tf.Session`, runs the global-variables initializer, evaluates `variable`
    on the first entry of the global `test_batches` (fed in test mode with
    `assertion=True`) and closes the session again before returning.

    Args:
        variable: fetch (or structure of fetches) handed to `sess.run`.

    Returns:
        The value `sess.run` produces for `variable`.
    """
    # The context manager closes the temporary session even if evaluation raises;
    # previously every call left an open session behind.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        # NOTE(review): `debug` feeds `test_batches[0][1]` whereas this feeds the whole
        # `test_batches[0]` entry -- confirm which form `get_feed_dict(..., assertion=True)` expects.
        sample_batch = test_batches[0]
        feed_dict = model.get_feed_dict(sample_batch, mode='test', assertion=True)
        return sess.run(variable, feed_dict=feed_dict)

# model

## initialize model

In [7]:
# Release the session left over from an earlier run of this cell, if there is one.
if 'sess' in globals():
    sess.close()

# Registry from the `config.model` key to the class that implements it.
TopicModels = {
    'hntm': HierarchicalNeuralTopicModel,
    'nhdp': nestedHierarchicalNeuralTopicModel,
    'tsgntm': TreeStructuredGaussianNeuralTopicModel,
}
TopicModel = TopicModels[config.model]
model = TopicModel(config)

# Session limited to one intra-op and one inter-op thread.
sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1))
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(max_to_keep=config.max_to_keep)
update_tree_flg = False

# run

## initialize log

In [8]:
# Training-state accumulators read and updated by the training cell.
checkpoint = []
losses_train = []
ppls_train = []
ppl_min = np.inf
epoch = 0
train_batches = get_batches(instances_train, config.batch_size, iterator=True)

# Two-level column header for the running log table (section label over metric name).
log_df = pd.DataFrame(columns=pd.MultiIndex.from_tuples(
                    list(zip(*[['','','','TRAIN:','','','','','','VALID:','','','','','','TEST:','', 'SPEC:', '', '', 'HIER:', ''],
                            ['Time','Ep','Ct','LOSS','PPL','NLL','KL', 'GAUSS', 'REG','LOSS','PPL','NLL','KL', 'GAUSS','REG','LOSS','PPL', '1', '2', '3', 'CHILD', 'OTHER']]))))

# Recreate the model directory from scratch. The commands are passed as argv lists
# instead of splitting a formatted string, so a directory path containing whitespace
# is still treated as a single argument.
cmd_rm = ['rm', '-r', str(config.dir_model)]
res = subprocess.call(cmd_rm)
cmd_mk = ['mkdir', str(config.dir_model)]
res = subprocess.call(cmd_mk)

def update_checkpoint(config, checkpoint, global_step):
    """Record a newly saved checkpoint and prune the oldest one beyond the retention limit.

    Args:
        config: object providing `path_model` (checkpoint path prefix),
            `max_to_keep` (number of checkpoints to retain) and
            `path_checkpoint` (where the checkpoint list is pickled).
        checkpoint: list of checkpoint path prefixes, oldest first; mutated in place.
        global_step: integer step appended to `config.path_model` as `-<step>`.

    Side effects:
        Deletes every file matching `<oldest prefix>.*` when the list grows past
        `config.max_to_keep`, and rewrites `config.path_checkpoint`.
    """
    checkpoint.append(config.path_model + '-%i' % global_step)
    if len(checkpoint) > config.max_to_keep:
        # Escape the prefix so glob metacharacters in the path (e.g. '[' or '*') are
        # matched literally; only the trailing '.*' acts as a wildcard.
        stale_pattern = glob.escape(checkpoint.pop(0)) + '.*'
        for stale_path in glob.glob(stale_pattern):
            os.remove(stale_path)
    # Close the file deterministically so the list is fully flushed to disk.
    with open(config.path_checkpoint, 'wb') as f_checkpoint:
        cPickle.dump(checkpoint, f_checkpoint)
    
def validate(sess, batches, model):
    """Run `model` over `batches` in test mode and aggregate its metrics.

    Args:
        sess: session-like object whose `run(fetches, feed_dict=...)` evaluates the fetches.
        batches: iterable of `(ct, batch)` pairs; only `batch` is used.
        model: object exposing `get_feed_dict(batch, mode=...)` and the fetches
            `loss`, `topic_loss_recon`, `topic_loss_kl`, `topic_loss_gauss`,
            `topic_loss_reg`, `topic_ppls`, `n_bow` and `n_topics`.

    Returns:
        A 7-tuple `(loss, recon, kl, gauss, reg, ppl, probs_topic_mean)`:
        the five losses are unweighted means over batches, `ppl` is
        `exp(mean(...))` over all values collected from `topic_ppls`, and
        `probs_topic_mean` is `n_topics` summed over axis 0 divided by the
        total of `n_bow`.
    """
    losses = []
    ppl_list = []
    n_bow_list = []
    n_topics_list = []
    # `model.prob_topic` used to be fetched and concatenated here as well, but the
    # result was never read; the dead work has been removed.
    fetches = [model.loss, model.topic_loss_recon, model.topic_loss_kl, model.topic_loss_gauss,
               model.topic_loss_reg, model.topic_ppls, model.n_bow, model.n_topics]
    for ct, batch in batches:
        feed_dict = model.get_feed_dict(batch, mode='test')
        loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_gauss_batch, topic_loss_reg_batch, \
            ppls_batch, n_bow_batch, n_topics_batch = sess.run(fetches, feed_dict=feed_dict)
        losses += [[loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_gauss_batch, topic_loss_reg_batch]]
        ppl_list += list(ppls_batch)
        n_bow_list.append(n_bow_batch)
        n_topics_list.append(n_topics_batch)
    loss_mean, topic_loss_recon_mean, topic_loss_kl_mean, topic_loss_gauss_mean, topic_loss_reg_mean = np.mean(losses, 0)
    ppl_mean = np.exp(np.mean(ppl_list))

    n_bow = np.concatenate(n_bow_list, 0)
    n_topics = np.concatenate(n_topics_list, 0)
    probs_topic_mean = np.sum(n_topics, 0) / np.sum(n_bow)

    return loss_mean, topic_loss_recon_mean, topic_loss_kl_mean, topic_loss_gauss_mean, topic_loss_reg_mean, ppl_mean, probs_topic_mean

## train & validate model

In [9]:
# Main training loop: one optimisation step per batch; every `config.log_period` global
# steps the model is validated, checkpointed when dev perplexity improves, summarised,
# logged, and (unless `config.static`) its topic tree is updated.
time_start = time.time()
while epoch < config.n_epochs:
    # train
    for ct, batch in train_batches:
        feed_dict = model.get_feed_dict(batch)
        _, loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_gauss_batch, topic_loss_reg_batch, ppls_batch, global_step_log = \
        sess.run([model.opt, model.loss, model.topic_loss_recon, model.topic_loss_kl, model.topic_loss_gauss, model.topic_loss_reg, model.topic_ppls, tf.train.get_global_step()], feed_dict = feed_dict)

        # Training statistics accumulate over the whole run (they are never reset).
        losses_train += [[loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_gauss_batch, topic_loss_reg_batch]]
        ppls_train += list(ppls_batch)

        if global_step_log % config.log_period == 0:
            # validate
            loss_train, topic_loss_recon_train, topic_loss_kl_train, topic_loss_gauss_train, topic_loss_reg_train = np.mean(losses_train, 0)
            ppl_train = np.exp(np.mean(ppls_train))
            loss_dev, topic_loss_recon_dev, topic_loss_kl_dev, topic_loss_gauss_dev, topic_loss_reg_dev, ppl_dev, probs_topic_dev = validate(sess, dev_batches, model)

            # test: only re-evaluated (and checkpointed) when dev perplexity improves
            if ppl_dev < ppl_min:
                ppl_min = ppl_dev
                loss_test, _, _, _, _, ppl_test, _ = validate(sess, test_batches, model)
                saver.save(sess, config.path_model, global_step=global_step_log)
                # Write through a context manager so the handle is closed and flushed.
                with open(config.path_config % global_step_log, 'wb') as f_config:
                    cPickle.dump(config, f_config)
                update_checkpoint(config, checkpoint, global_step_log)
            
            # visualize topic
            topics_freq_indices = np.argsort(sess.run(model.topic_bow), 1)[:, ::-1][:, :config.n_freq]
            topics_freq_idxs = bow_idxs[topics_freq_indices]
            topic_freq_tokens = {topic_idx: [idx_to_word[idx] for idx in topic_freq_idxs] for topic_idx, topic_freq_idxs in zip(model.topic_idxs, topics_freq_idxs)}
            topic_prob_topic = {topic_idx: prob_topic for topic_idx, prob_topic in zip(model.topic_idxs, probs_topic_dev)}
            descendant_idxs = {parent_idx: get_descendant_idxs(model, parent_idx) for parent_idx in model.topic_idxs}
            recur_prob_topic = {parent_idx: np.sum([topic_prob_topic[child_idx] for child_idx in recur_child_idxs]) for parent_idx, recur_child_idxs in descendant_idxs.items()}
            
            depth_specs = get_topic_specialization(sess, model, instances_test)
            hierarchical_affinities = get_hierarchical_affinity(sess, model)
            
            # log
            clear_output()
            time_log = int(time.time() - time_start)
            log_series = pd.Series([time_log, epoch, ct, \
                    '%.2f'%loss_train, '%.0f'%ppl_train, '%.2f'%topic_loss_recon_train, '%.2f'%topic_loss_kl_train, '%.2f'%topic_loss_gauss_train, '%.2f'%topic_loss_reg_train, \
                    '%.2f'%loss_dev, '%.0f'%ppl_dev, '%.2f'%topic_loss_recon_dev, '%.2f'%topic_loss_kl_dev, '%.2f'%topic_loss_gauss_dev, '%.2f'%topic_loss_reg_dev, \
                    '%.2f'%loss_test, '%.0f'%ppl_test, \
                    '%.2f'%depth_specs[1], '%.2f'%depth_specs[2], '%.2f'%depth_specs[3], \
                    '%.2f'%hierarchical_affinities[0], '%.2f'%hierarchical_affinities[1]],
                    index=log_df.columns)
            log_df.loc[global_step_log] = log_series
            display(log_df)
            with open(config.path_log, 'wb') as f_log:
                cPickle.dump(log_df, f_log)
            print_topic_sample(sess, model, topic_prob_topic=topic_prob_topic, recur_prob_topic=recur_prob_topic, topic_freq_tokens=topic_freq_tokens)
            print(np.sum(debug(model.topic_logvars), 1))
            
            # update tree
            if not config.static:
                config.tree_idxs, update_tree_flg = model.update_tree(topic_prob_topic, recur_prob_topic)
                if update_tree_flg:
                    print(config.tree_idxs)
                    name_variables = {tensor.name: variable for tensor, variable in zip(tf.global_variables(), sess.run(tf.global_variables()))} # store parameters
                    if 'sess' in globals(): sess.close()
                    # Rebuild with the class selected by `config.model` (previously hard-coded to
                    # HierarchicalNeuralTopicModel, which silently switched model type for other settings).
                    model = TopicModel(config)
                    sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1))
                    name_tensors = {tensor.name: tensor for tensor in tf.global_variables()}
                    sess.run([name_tensors[name].assign(variable) for name, variable in name_variables.items()]) # restore parameters
                    # Keep the same retention as the initial saver and `update_checkpoint`.
                    saver = tf.train.Saver(max_to_keep=config.max_to_keep)
                
            time_start = time.time()

    # A fresh batch iterator is created for every epoch.
    train_batches = get_batches(instances_train, config.batch_size, iterator=True)
    epoch += 1

# Final summary after training. `validate` returns seven values, so all seven are unpacked
# here (the gauss loss was missing before, which raised ValueError).
loss_dev, topic_loss_recon_dev, topic_loss_kl_dev, topic_loss_gauss_dev, topic_loss_reg_dev, ppl_dev, probs_topic_dev = validate(sess, dev_batches, model)
topics_freq_indices = np.argsort(sess.run(model.topic_bow), 1)[:, ::-1][:, :config.n_freq]
topics_freq_idxs = bow_idxs[topics_freq_indices]
topic_freq_tokens = {topic_idx: [idx_to_word[idx] for idx in topic_freq_idxs] for topic_idx, topic_freq_idxs in zip(model.topic_idxs, topics_freq_idxs)}
topic_prob_topic = {topic_idx: prob_topic for topic_idx, prob_topic in zip(model.topic_idxs, probs_topic_dev)}
descendant_idxs = {parent_idx: get_descendant_idxs(model, parent_idx) for parent_idx in model.topic_idxs}
recur_prob_topic = {parent_idx: np.sum([topic_prob_topic[child_idx] for child_idx in recur_child_idxs]) for parent_idx, recur_child_idxs in descendant_idxs.items()}
display(log_df)
print_topic_sample(sess, model, topic_prob_topic=topic_prob_topic, recur_prob_topic=recur_prob_topic, topic_freq_tokens=topic_freq_tokens)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,VALID:,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,TEST:,Unnamed: 17_level_0,SPEC:,Unnamed: 19_level_0,Unnamed: 20_level_0,HIER:,Unnamed: 22_level_0
Unnamed: 0_level_1,Time,Ep,Ct,LOSS,PPL,NLL,KL,GAUSS,REG,LOSS,PPL,NLL,KL,GAUSS,REG,LOSS,PPL,1,2,3,CHILD,OTHER
5000,83,8,463,9276.58,1997,9235.06,16.74,24.78,0.1,9244.7,1880,9194.52,20.26,29.92,0.05,9245.12,1882,0.11,0.17,0.39,0.76,0.47
10000,79,17,360,9230.15,1918,9184.13,18.49,27.51,0.07,9222.16,1846,9171.93,19.92,30.31,0.04,9221.13,1845,0.12,0.19,0.42,0.7,0.41
15000,81,26,257,9209.26,1883,9161.5,19.03,28.71,0.06,9216.49,1836,9164.3,20.08,32.11,0.04,9214.81,1833,0.09,0.21,0.44,0.65,0.38
20000,81,35,154,9196.35,1862,9147.22,19.33,29.76,0.06,9207.23,1818,9153.26,20.09,33.88,0.03,9206.31,1816,0.15,0.26,0.46,0.63,0.36
25000,80,44,51,9186.93,1847,9136.7,19.56,30.64,0.05,9202.58,1809,9147.31,20.58,34.7,0.03,9204.09,1812,0.11,0.31,0.46,0.63,0.36
30000,81,52,515,9179.9,1835,9128.84,19.73,31.31,0.05,9199.79,1804,9144.6,20.59,34.6,0.03,9198.32,1804,0.14,0.29,0.48,0.54,0.33


0 R: 1.000 P: 0.005 test performance method training algorithm models context speech features error
   1 R: 0.193 P: 0.048 probability algorithm training models method gram segmentation probabilities score length
     11 R: 0.078 P: 0.078 tree parsing dependency parser grammar rules algorithm node trees rule
     12 R: 0.067 P: 0.067 translation english source target systems alignment phrase sentences pairs parallel
   2 R: 0.169 P: 0.058 level human systems context knowledge input speech semantic analysis sentences
     21 R: 0.060 P: 0.060 discourse structure semantic interpretation representation theory object clause sentences semantics
     22 R: 0.051 P: 0.051 speech user dialogue recognition utterances utterance users speaker human spoken
   3 R: 0.198 P: 0.033 models probability training vector algorithm parameter function distribution parameters performance
     31 R: 0.100 P: 0.100 features training feature performance learning test accuracy classifier classification experimen

KeyboardInterrupt: 

# analysis

In [11]:
def get_topic_year(sample_batches):
    """Compute, for every topic, the average instance year weighted by topic probability.

    Uses the global `sess` and `model`.

    Args:
        sample_batches: iterable of `(index, batch)` pairs, where each batch is
            an iterable of instances carrying a `year` attribute.

    Returns:
        Dict mapping each entry of `model.topic_idxs` to its weighted average year.
    """
    batch_probs = []
    batch_years = []
    for _, batch in sample_batches:
        feed = model.get_feed_dict(batch, mode='test')
        batch_probs.append(sess.run(model.prob_topic, feed_dict=feed))
        batch_years.extend(instance.year for instance in batch)

    all_probs = np.concatenate(batch_probs)
    year_vector = np.array(batch_years)

    # Probability-weighted sum of years per topic, normalised by each topic's total probability.
    weighted_years = year_vector.dot(all_probs) / np.sum(all_probs, 0)
    return {model.topic_idxs[pos]: avg_year for pos, avg_year in enumerate(weighted_years)}

In [12]:
# Batch the training set, compute each topic's probability-weighted average year and
# pass it to `print_topic_year` together with the topics' frequent tokens.
# `topic_freq_tokens` is the dict assigned in the training cell, so that cell must have run first.
sample_batches = get_batches(instances_train, config.batch_size)
topic_year = get_topic_year(sample_batches)
print_topic_year(sess, model, topic_freq_tokens=topic_freq_tokens, topic_year=topic_year)

0 Avg Year: 2006 test performance method training algorithm models context speech features error
   1 Avg Year: 2007 probability algorithm training models method gram segmentation probabilities score length
     11 Avg Year: 2005 tree parsing dependency parser grammar rules algorithm node trees rule
     12 Avg Year: 2010 translation english source target systems alignment phrase sentences pairs parallel
   2 Avg Year: 2005 level human systems context knowledge input speech semantic analysis sentences
     21 Avg Year: 2002 discourse structure semantic interpretation representation theory object clause sentences semantics
     22 Avg Year: 2008 speech user dialogue recognition utterances utterance users speaker human spoken
   3 Avg Year: 2010 models probability training vector algorithm parameter function distribution parameters performance
     31 Avg Year: 2011 features training feature performance learning test accuracy classifier classification experiments
     32 Avg Year: 2015 m