In [1]:
%%javascript
// Ask the Python kernel to execute `nb_name = "<notebook file name>"`, so that
// later Python cells can read the notebook's own name (it is passed to get_config).
// NOTE(review): relies on the front end exposing the `IPython.notebook` global —
// confirm this holds in the Jupyter front end you run this notebook in.
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [2]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
# Environment variables are assigned here, before numpy / pandas / tensorflow are
# imported further down in this cell.
# NOTE(review): PYTHONHASHSEED is set after the interpreter has started — confirm
# it has the intended effect for this process.
os.environ['PYTHONHASHSEED'] = '0'
# Ask numexpr, MKL and OpenMP for one thread each.
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

import sys
import argparse
import subprocess
import pdb
import time
import random
import _pickle as cPickle
import matplotlib.pyplot as plt
import glob

%matplotlib inline

import numpy as np
import pandas as pd
import tensorflow as tf

from data_structure import get_batches
from hntm import HierarchicalNeuralTopicModel
from nhdp import nestedHierarchicalNeuralTopicModel
from tsgntm import TreeStructuredGaussianNeuralTopicModel
from tree import get_descendant_idxs
from evaluation import get_hierarchical_affinity, get_topic_specialization, print_topic_sample, print_topic_year
from configure import get_config

pd.set_option('display.max_columns', 30)

# load data & set config

In [3]:
# `nb_name` is not assigned by any Python cell: the %%javascript cell at the top of
# the notebook injects it into the kernel, so that cell must have run first.
config = get_config(nb_name)

In [4]:
# Set CUDA_VISIBLE_DEVICES from the config, then seed the numpy and stdlib RNGs.
# NOTE(review): no TensorFlow seed is set in this cell — confirm whether the model
# code seeds the graph itself if run-to-run reproducibility matters.
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu
np.random.seed(config.seed)
random.seed(config.seed)

In [5]:
# Load the pickled corpus bundle: train/dev/test instances, the vocabulary maps and
# the bag-of-words index array. The `with` block closes the file handle as soon as
# the load finishes instead of leaving it to the garbage collector.
# NOTE: unpickling can execute arbitrary code — only point config.path_data at
# files you produced yourself.
with open(config.path_data, 'rb') as f_data:
    instances_train, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(f_data)

# Materialise the batches for each split.
train_batches = get_batches(instances_train, config.batch_size)
dev_batches = get_batches(instances_dev, config.batch_size)
test_batches = get_batches(instances_test, config.batch_size)

# The model's bag-of-words dimension follows the number of BoW indices in the data.
config.dim_bow = len(bow_idxs)

In [None]:
def debug(variable, sample_batch=None):
    """Evaluate `variable` in the notebook's global session on a single batch.

    Args:
        variable: tensor (or other fetch accepted by `sess.run`) to evaluate.
        sample_batch: batch to feed; when None, the batch stored in the first
            entry of the global `test_batches` is used.

    Returns:
        Whatever `sess.run` returns for `variable`.

    Relies on the notebook globals `model`, `sess` and `test_batches`.
    """
    # Entries of test_batches are indexed as pairs; element 1 is the batch itself.
    batch = test_batches[0][1] if sample_batch is None else sample_batch
    return sess.run(variable, feed_dict=model.get_feed_dict(batch, mode='test'))

def check(variable):
    """Evaluate `variable` on the first test batch in a fresh, freshly initialised session.

    Unlike `debug`, this does not touch the notebook's global `sess`: it opens its
    own session, runs the global-variables initializer in it, evaluates `variable`
    with the feed dict built with `assertion=True`, and closes the session again.

    Args:
        variable: tensor (or other fetch accepted by `Session.run`) to evaluate.

    Returns:
        Whatever `Session.run` returns for `variable`.

    Relies on the notebook globals `model` and `test_batches`.
    """
    # Entries of test_batches are (ct, batch) pairs (see how `validate` iterates
    # them and how `debug` indexes them); feed the batch, not the whole pair.
    sample_batch = test_batches[0][1]
    feed_dict = model.get_feed_dict(sample_batch, mode='test', assertion=True)
    # The context manager closes the temporary session even if run() raises,
    # so repeated calls do not accumulate open sessions.
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        return sess.run(variable, feed_dict=feed_dict)

# model

## initialize model

In [None]:
# Close the session left over from a previous run of this cell, if there is one.
if 'sess' in globals():
    sess.close()

# Registry mapping the config.model string to the topic model class to build.
TopicModels = {
    'hntm': HierarchicalNeuralTopicModel,
    'nhdp': nestedHierarchicalNeuralTopicModel,
    'tsgntm': TreeStructuredGaussianNeuralTopicModel,
}
TopicModel = TopicModels[config.model]
model = TopicModel(config)

# Session with intra-op and inter-op parallelism both set to one thread.
session_config = tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1)
sess = tf.Session(config=session_config)
sess.run(tf.global_variables_initializer())

saver = tf.train.Saver(max_to_keep=config.max_to_keep)
# Flag overwritten by model.update_tree() in the training loop.
update_tree_flg = False

# run

## initialize log

In [None]:
# Training state shared with the training cell. Re-running this cell resets it.
checkpoint = []       # checkpoint path prefixes currently kept on disk
losses_train = []     # one [loss, recon, kl, gauss, reg] row per training step
ppls_train = []       # per-document perplexity terms collected during training
ppl_min = np.inf      # best dev perplexity seen so far
epoch = 0
train_batches = get_batches(instances_train, config.batch_size, iterator=True)

# Two-level column header for the metrics table: a group label and a field name.
log_groups = ['','','','TRAIN:','','','','','','VALID:','','','','','','TEST:','', 'SPEC:', '', '', 'HIER:', '']
log_fields = ['Time','Ep','Ct','LOSS','PPL','NLL','KL', 'GAUSS', 'REG','LOSS','PPL','NLL','KL', 'GAUSS','REG','LOSS','PPL', '1', '2', '3', 'CHILD', 'OTHER']
log_df = pd.DataFrame(columns=pd.MultiIndex.from_tuples(list(zip(log_groups, log_fields))))

# Start from an empty model directory. The commands are passed as argument lists
# (not a formatted string that is then split) so a path containing whitespace
# stays a single argument; `-p` also creates missing parent directories.
# WARNING: this deletes everything under config.dir_model.
res = subprocess.call(['rm', '-r', config.dir_model])
res = subprocess.call(['mkdir', '-p', config.dir_model])

def update_checkpoint(config, checkpoint, global_step):
    """Register a new checkpoint, prune the oldest one, and persist the list.

    Args:
        config: object providing `path_model` (checkpoint path prefix),
            `max_to_keep` (maximum number of checkpoints to retain) and
            `path_checkpoint` (file the checkpoint list is pickled to).
        checkpoint: list of checkpoint path prefixes, oldest first; mutated in place.
        global_step: integer step appended to `config.path_model` as '-<step>'.

    Side effects:
        Deletes every file matching '<oldest prefix>.*' once the list exceeds
        `config.max_to_keep`, and rewrites `config.path_checkpoint`.
    """
    checkpoint.append(config.path_model + '-%i' % global_step)
    if len(checkpoint) > config.max_to_keep:
        # Escape the prefix so glob metacharacters in the path ([, ], *, ?) are
        # matched literally; only the trailing '.*' acts as a wildcard.
        stale_pattern = glob.escape(checkpoint.pop(0)) + '.*'
        for stale_path in glob.glob(stale_pattern):
            os.remove(stale_path)
    # Close the file deterministically so the list is flushed to disk right away.
    with open(config.path_checkpoint, 'wb') as f_checkpoint:
        cPickle.dump(checkpoint, f_checkpoint)
    
def validate(sess, batches, model):
    """Evaluate `model` on every batch and aggregate the losses and perplexity.

    Args:
        sess: session used to run the model's tensors.
        batches: iterable of (ct, batch) pairs; only the batch is used.
        model: topic model exposing `get_feed_dict`, the loss tensors
            (`loss`, `topic_loss_recon`, `topic_loss_kl`, `topic_loss_gauss`,
            `topic_loss_reg`), `topic_ppls`, `n_bow` and `n_topics`.

    Returns:
        A 7-tuple `(loss, recon, kl, gauss, reg, ppl, probs_topic_mean)`:
        the five losses are plain means over batches, `ppl` is the exponential
        of the mean of all collected `topic_ppls` values, and `probs_topic_mean`
        is `n_topics` summed over documents divided by the total of `n_bow`.

    Raises:
        ValueError: if `batches` yields nothing (for example an exhausted iterator).
    """
    # model.prob_topic is intentionally not fetched: its values were never used
    # in any of the returned statistics.
    fetches = [model.loss, model.topic_loss_recon, model.topic_loss_kl, model.topic_loss_gauss,
               model.topic_loss_reg, model.topic_ppls, model.n_bow, model.n_topics]

    losses = []
    ppl_list = []
    n_bow_list = []
    n_topics_list = []
    for _, batch in batches:
        feed_dict = model.get_feed_dict(batch, mode='test')
        loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_gauss_batch, topic_loss_reg_batch, \
            ppls_batch, n_bow_batch, n_topics_batch = sess.run(fetches, feed_dict=feed_dict)
        losses.append([loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_gauss_batch, topic_loss_reg_batch])
        ppl_list.extend(ppls_batch)
        n_bow_list.append(n_bow_batch)
        n_topics_list.append(n_topics_batch)

    if not losses:
        raise ValueError('validate() received no batches to evaluate')

    loss_mean, topic_loss_recon_mean, topic_loss_kl_mean, topic_loss_gauss_mean, topic_loss_reg_mean = np.mean(losses, 0)
    ppl_mean = np.exp(np.mean(ppl_list))

    n_bow = np.concatenate(n_bow_list, 0)
    n_topics = np.concatenate(n_topics_list, 0)
    probs_topic_mean = np.sum(n_topics, 0) / np.sum(n_bow)

    return loss_mean, topic_loss_recon_mean, topic_loss_kl_mean, topic_loss_gauss_mean, topic_loss_reg_mean, ppl_mean, probs_topic_mean

## train & validate model

In [None]:
# Train until config.n_epochs is reached. Every 1000 global steps: validate on dev,
# checkpoint and evaluate on test when dev perplexity improves, log a metrics row,
# print the topic tree and, unless config.static, let the model revise its tree.
# Relies on the state created by the "initialize model" and "initialize log" cells.
if 'ppl_test' not in globals():
    # Placeholders for the TEST columns in case a log row is written before any dev
    # improvement has produced test metrics (e.g. the first dev perplexity is NaN).
    loss_test, ppl_test = np.nan, np.nan

time_start = time.time()
while epoch < config.n_epochs:
    # train
    for ct, batch in train_batches:
        feed_dict = model.get_feed_dict(batch)
        _, loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_gauss_batch, topic_loss_reg_batch, ppls_batch, global_step_log = \
        sess.run([model.opt, model.loss, model.topic_loss_recon, model.topic_loss_kl, model.topic_loss_gauss, model.topic_loss_reg, model.topic_ppls, tf.train.get_global_step()], feed_dict = feed_dict)

        # Training statistics are cumulative over the whole run (never reset).
        losses_train += [[loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_gauss_batch, topic_loss_reg_batch]]
        ppls_train += list(ppls_batch)

        # NOTE(review): the period is hard-coded to 1000 steps; the original cell had
        # `config.log_period` commented out here — confirm which one is intended.
        if global_step_log % 1000 == 0:
            # validate
            loss_train, topic_loss_recon_train, topic_loss_kl_train, topic_loss_gauss_train, topic_loss_reg_train = np.mean(losses_train, 0)
            ppl_train = np.exp(np.mean(ppls_train))
            loss_dev, topic_loss_recon_dev, topic_loss_kl_dev, topic_loss_gauss_dev, topic_loss_reg_dev, ppl_dev, probs_topic_dev = validate(sess, dev_batches, model)

            # test: only re-evaluated (and checkpointed) when dev perplexity improves,
            # so the TEST columns otherwise repeat the values from the best step so far
            if ppl_dev < ppl_min:
                ppl_min = ppl_dev
                loss_test, _, _, _, _, ppl_test, _ = validate(sess, test_batches, model)
                saver.save(sess, config.path_model, global_step=global_step_log)
                with open(config.path_config % global_step_log, 'wb') as f_config:
                    cPickle.dump(config, f_config)
                update_checkpoint(config, checkpoint, global_step_log)

            # visualize topic: top config.n_freq words per topic, per-topic probability,
            # and the probability summed over each topic's descendant indices
            topics_freq_indices = np.argsort(sess.run(model.topic_bow), 1)[:, ::-1][:, :config.n_freq]
            topics_freq_idxs = bow_idxs[topics_freq_indices]
            topic_freq_tokens = {topic_idx: [idx_to_word[idx] for idx in topic_freq_idxs] for topic_idx, topic_freq_idxs in zip(model.topic_idxs, topics_freq_idxs)}
            topic_prob_topic = {topic_idx: prob_topic for topic_idx, prob_topic in zip(model.topic_idxs, probs_topic_dev)}
            descendant_idxs = {parent_idx: get_descendant_idxs(model, parent_idx) for parent_idx in model.topic_idxs}
            recur_prob_topic = {parent_idx: np.sum([topic_prob_topic[child_idx] for child_idx in recur_child_idxs]) for parent_idx, recur_child_idxs in descendant_idxs.items()}

            depth_specs = get_topic_specialization(sess, model, instances_test)
            hierarchical_affinities = get_hierarchical_affinity(sess, model)

            # log
            clear_output()
            time_log = int(time.time() - time_start)
            log_series = pd.Series([time_log, epoch, ct, \
                    '%.2f'%loss_train, '%.0f'%ppl_train, '%.2f'%topic_loss_recon_train, '%.2f'%topic_loss_kl_train, '%.2f'%topic_loss_gauss_train, '%.2f'%topic_loss_reg_train, \
                    '%.2f'%loss_dev, '%.0f'%ppl_dev, '%.2f'%topic_loss_recon_dev, '%.2f'%topic_loss_kl_dev, '%.2f'%topic_loss_gauss_dev, '%.2f'%topic_loss_reg_dev, \
                    '%.2f'%loss_test, '%.0f'%ppl_test, \
                    '%.2f'%depth_specs[1], '%.2f'%depth_specs[2], '%.2f'%depth_specs[3], \
                    '%.2f'%hierarchical_affinities[0], '%.2f'%hierarchical_affinities[1]],
                    index=log_df.columns)
            log_df.loc[global_step_log] = log_series
            display(log_df)
            with open(config.path_log, 'wb') as f_log:
                cPickle.dump(log_df, f_log)
            print_topic_sample(sess, model, topic_prob_topic=topic_prob_topic, recur_prob_topic=recur_prob_topic, topic_freq_tokens=topic_freq_tokens)
            print(np.sum(debug(model.topic_logvars), 1))

            # update tree
            if not config.static:
                config.tree_idxs, update_tree_flg = model.update_tree(topic_prob_topic, recur_prob_topic)
                if update_tree_flg:
                    print(config.tree_idxs)
                    name_variables = {tensor.name: variable for tensor, variable in zip(tf.global_variables(), sess.run(tf.global_variables()))} # store parameters
                    if 'sess' in globals(): sess.close()
                    # Rebuild with the class selected by config.model (not a fixed
                    # class), so non-'hntm' runs keep their model type after an update.
                    model = TopicModel(config)
                    sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1))
                    name_tensors = {tensor.name: tensor for tensor in tf.global_variables()}
                    sess.run([name_tensors[name].assign(variable) for name, variable in name_variables.items()]) # restore parameters
                    # Same retention as the initial Saver and as update_checkpoint().
                    saver = tf.train.Saver(max_to_keep=config.max_to_keep)

            # The Time column measures seconds since the previous log row.
            time_start = time.time()

    # The batch iterator is exhausted after one pass; build a fresh one per epoch.
    train_batches = get_batches(instances_train, config.batch_size, iterator=True)
    epoch += 1

# Final dev evaluation and topic-tree printout once training has finished.
# validate() returns seven values (loss, recon, kl, gauss, reg, ppl, topic probs);
# all seven must be unpacked, including the Gaussian loss term.
loss_dev, topic_loss_recon_dev, topic_loss_kl_dev, topic_loss_gauss_dev, topic_loss_reg_dev, ppl_dev, probs_topic_dev = validate(sess, dev_batches, model)

# Top config.n_freq words per topic, mapped from BoW positions back to tokens.
topics_freq_indices = np.argsort(sess.run(model.topic_bow), 1)[:, ::-1][:, :config.n_freq]
topics_freq_idxs = bow_idxs[topics_freq_indices]
topic_freq_tokens = {topic_idx: [idx_to_word[idx] for idx in topic_freq_idxs] for topic_idx, topic_freq_idxs in zip(model.topic_idxs, topics_freq_idxs)}

# Per-topic probability, and the same summed over each topic's descendant indices.
topic_prob_topic = {topic_idx: prob_topic for topic_idx, prob_topic in zip(model.topic_idxs, probs_topic_dev)}
descendant_idxs = {parent_idx: get_descendant_idxs(model, parent_idx) for parent_idx in model.topic_idxs}
recur_prob_topic = {parent_idx: np.sum([topic_prob_topic[child_idx] for child_idx in recur_child_idxs]) for parent_idx, recur_child_idxs in descendant_idxs.items()}

display(log_df)
print_topic_sample(sess, model, topic_prob_topic=topic_prob_topic, recur_prob_topic=recur_prob_topic, topic_freq_tokens=topic_freq_tokens)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,Unnamed: 9_level_0,VALID:,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,Unnamed: 14_level_0,Unnamed: 15_level_0,TEST:,Unnamed: 17_level_0,SPEC:,Unnamed: 19_level_0,Unnamed: 20_level_0,HIER:,Unnamed: 22_level_0
Unnamed: 0_level_1,Time,Ep,Ct,LOSS,PPL,NLL,KL,GAUSS,REG,LOSS,PPL,NLL,KL,GAUSS,REG,LOSS,PPL,1,2,3,CHILD,OTHER
1000,36,0,999,929.55,2704,924.32,1.38,3.61,0.24,890.13,2567,883.67,2.34,3.85,0.27,887.98,2517,0.15,0.16,0.18,0.96,0.88
2000,30,0,1999,922.71,2548,916.51,2.16,3.82,0.22,882.04,2374,875.27,2.96,3.59,0.22,879.44,2347,0.18,0.18,0.21,0.94,0.84
3000,30,1,295,920.17,2474,913.60,2.47,3.89,0.21,878.16,2273,870.22,3.16,4.56,0.22,878.17,2310,0.24,0.21,0.21,0.93,0.79
4000,24,1,1295,917.60,2426,910.73,2.67,3.99,0.20,878.14,2284,870.07,3.55,4.33,0.19,878.17,2310,0.15,0.21,0.24,0.91,0.82
5000,30,1,2295,915.46,2390,908.41,2.80,4.05,0.20,873.58,2204,865.37,3.71,4.23,0.26,876.96,2287,0.20,0.20,0.24,0.92,0.79
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
633000,38,234,263,895.93,2016,886.57,4.37,4.72,0.13,862.15,1985,852.36,4.71,4.94,0.14,860.48,1977,0.22,0.27,0.35,0.78,0.58
634000,39,234,1263,895.93,2016,886.57,4.37,4.72,0.13,862.10,1979,852.29,4.73,4.94,0.15,860.48,1977,0.20,0.27,0.35,0.80,0.57
635000,39,234,2263,895.92,2016,886.56,4.37,4.72,0.13,862.42,1995,852.76,4.68,4.89,0.08,860.48,1977,0.22,0.30,0.34,0.80,0.59
636000,39,235,559,895.93,2016,886.56,4.37,4.72,0.13,861.68,1970,851.98,4.69,4.93,0.08,860.48,1977,0.16,0.25,0.34,0.84,0.56


0 R: 1.000 P: 0.050 cold tea drive asked large tasted slow flavor decent items
   1 R: 0.393 P: 0.044 rice soup tacos beef spicy thai fried fish dishes meat
     11 R: 0.190 P: 0.190 rice thai tacos mexican spicy soup beef rolls fried chinese
     12 R: 0.159 P: 0.159 steak waitress wine attentive fish beers cooked enjoyed bartender waiter
   2 R: 0.162 P: 0.007 coffee store shop tea drive free line donuts employees stop
     21 R: 0.058 P: 0.058 coffee tea donuts starbucks store milk boba shop latte iced
     22 R: 0.096 P: 0.096 sandwich cream ice chocolate sandwiches cake eggs flavors coffee bread
   3 R: 0.121 P: 0.010 free extra asked employees told delivery today ice worst drive
     31 R: 0.007 P: 0.007 manager asked wings told years wo half wanted left cold
     32 R: 0.104 P: 0.104 wings burgers crust pizzas pepperoni sandwich slice manager cold toppings
   4 R: 0.008 P: 0.003 years found helpful told asked wanted today free check care
     41 R: 0.001 P: 0.001 drive today emp

# analysis

In [21]:
def get_topic_year(sample_batches):
    """Compute the probability-weighted average year for every topic.

    Args:
        sample_batches: iterable of (index, batch) pairs; each batch is a sequence
            of instances exposing a `year` attribute.

    Returns:
        dict mapping each entry of `model.topic_idxs` to the average of the
        instance years weighted by `model.prob_topic`.

    Relies on the notebook globals `sess` and `model`.
    """
    batch_probs = []
    doc_years = []
    for _, sample_batch in sample_batches:
        feed_dict = model.get_feed_dict(sample_batch, mode='test')
        batch_probs.append(sess.run(model.prob_topic, feed_dict=feed_dict))
        doc_years.extend(instance.year for instance in sample_batch)

    prob_matrix = np.concatenate(batch_probs)
    year_vector = np.array(doc_years)

    # Weighted mean per topic: sum(year * prob) / sum(prob), over all documents.
    weighted_years = year_vector.dot(prob_matrix) / np.sum(prob_matrix, 0)
    return {model.topic_idxs[pos]: year for pos, year in enumerate(weighted_years)}

In [22]:
# Compute the probability-weighted average year per topic over the training
# instances and pass it to print_topic_year.
# `topic_freq_tokens` is not built in this cell: it is reused from the training cell,
# so that cell must have run in this kernel first.
sample_batches = get_batches(instances_train, config.batch_size)
topic_year = get_topic_year(sample_batches)
print_topic_year(sess, model, topic_freq_tokens=topic_freq_tokens, topic_year=topic_year)

0 Avg Year: 2005 level analysis english systems domain lexical linguistic test process lexicon
   1 Avg Year: 2000 grammar structure rules semantic form representation rule lexical type structures
     11 Avg Year: 2006 verb semantic syntactic verbs event relations discourse argument relation arguments
     12 Avg Year: 2007 tree parsing dependency parser grammar trees node parse rules syntactic
   2 Avg Year: 2008 english languages dictionary errors morphological chinese lexical rules corpora pos
     21 Avg Year: 2009 entity sense relations relation semantic entities wordnet senses lexical patterns
     22 Avg Year: 2010 translation english source alignment phrase target languages sentences systems parallel
   3 Avg Year: 2008 models probability training algorithm probabilities search parameters sequence segmentation gram
     31 Avg Year: 2011 features feature training performance learning classifier classification accuracy test class
     32 Avg Year: 2015 models network vector emb