In [1]:
%%javascript
// Publish this notebook's file name to the Python kernel as the variable `nb_name`.
// JSON.stringify produces a quoted, escaped literal that is also a valid Python
// string literal, so names containing quotes or backslashes no longer break the
// injected assignment.
// NOTE(review): `IPython.notebook` is the classic Notebook front-end object;
// presumably this cell does nothing useful under JupyterLab — confirm.
IPython.notebook.kernel.execute('nb_name = ' + JSON.stringify(IPython.notebook.notebook_name))

<IPython.core.display.Javascript object>

In [2]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
# Runtime environment knobs, set before numpy / tensorflow are imported further down.
# NOTE(review): PYTHONHASHSEED is normally read when the interpreter starts, so
# assigning it here presumably only affects child processes — confirm.
os.environ['PYTHONHASHSEED'] = '0'
# Ask numexpr, MKL and OpenMP for a single thread each.
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

import sys
import argparse
import subprocess
import pdb
import time
import random
import _pickle as cPickle
import matplotlib.pyplot as plt
import glob

%matplotlib inline

import numpy as np
import pandas as pd
import tensorflow as tf

from data_structure import get_batches
from hntm import HierarchicalNeuralTopicModel
from tree import get_descendant_idxs
from evaluation import validate, get_hierarchical_affinity, get_topic_specialization, print_topic_sample
from configure import get_config

# load data & set config

In [3]:
# `nb_name` is assigned in the kernel by the %%javascript cell at the top of the
# notebook, so that cell must have executed before this one.
config = get_config(nb_name)

In [4]:
# Select the GPU(s) from the config; this runs after `import tensorflow` but
# before any session is created in this notebook.
os.environ["CUDA_VISIBLE_DEVICES"] = config.gpu
# Seed numpy and the stdlib RNG from the config.
# NOTE(review): no TensorFlow seed is set in this cell; unless the model seeds
# the graph itself, runs are presumably not fully reproducible — confirm.
np.random.seed(config.seed)
random.seed(config.seed)

In [5]:
# Load the pre-processed corpus: train/dev/test instances, the vocabulary
# mappings and the bag-of-words index array, all stored in one pickle.
# The file is opened in a `with` block so the handle is closed as soon as the
# load finishes instead of being left to the garbage collector.
# NOTE: unpickling can execute arbitrary code; only load data files you trust.
with open(config.path_data, 'rb') as f_data:
    instances_train_tmp, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(f_data)

In [6]:
# Cap the training set at `config.size` instances: keep the corpus unchanged when
# it already fits, otherwise draw `config.size` instances without replacement.
instances_train = (
    instances_train_tmp
    if len(instances_train_tmp) <= config.size
    else np.random.choice(instances_train_tmp, config.size, replace=False)
)

In [7]:
# Batch each split with the configured batch size.
# NOTE(review): `train_batches` is rebuilt with iterator=True in the
# log-initialisation cell before training starts, so the value assigned here is
# presumably only a placeholder — confirm.
train_batches = get_batches(instances_train, config.batch_size)
dev_batches = get_batches(instances_dev, config.batch_size)
# The debug helpers below read `test_batches[0][1]` as their sample batch.
test_batches = get_batches(instances_test, config.batch_size)
# Record the number of bag-of-words indices on the config
# (presumably consumed by the model constructor — confirm).
config.dim_bow = len(bow_idxs)

In [8]:
def debug_shape(variables, model):
    """Print the shape of every value fetched for one sample test batch.

    `variables` is evaluated in the module-level `sess`, fed with
    `test_batches[0][1]` via `model.get_feed_dict`. Each result is printed as
    `<name> : <shape>` when the requested object has a `name` attribute,
    otherwise as the bare shape.
    """
    feed = model.get_feed_dict(test_batches[0][1])
    fetched = sess.run(variables, feed_dict=feed)
    for value, source in zip(fetched, variables):
        # Prefix the shape with the tensor name only when one is available.
        label = (source.name, ':') if hasattr(source, 'name') else ()
        print(*label, value.shape)

def debug_value(variables, model, return_value=False):
    """Evaluate `variables` on one sample test batch and return the results.

    The batch is `test_batches[0][1]`, converted with `model.get_feed_dict`
    and run in the module-level `sess`. `return_value` is accepted but never
    read; the fetched values are always returned.
    """
    feed = model.get_feed_dict(test_batches[0][1])
    return sess.run(variables, feed_dict=feed)

# run

## initialize log

In [9]:
# Module-level training state shared with the training cell.
checkpoint = []      # checkpoint path prefixes currently tracked, oldest first
losses_train = []    # one [loss, recon, kl, reg] row appended per training batch
ppls_train = []      # values from model.topic_ppls, extended on every batch
ppl_min = np.inf     # lowest validation perplexity seen so far
epoch = 0
train_batches = get_batches(instances_train, config.batch_size, iterator=True)

# Two-level column header for the metrics table: a group label on top and the
# metric name underneath. Both lists must stay the same length.
log_columns_group = ['','','','TRAIN:','','','','','VALID:','','','','','TEST:','', 'SPEC:', '', '', 'HIER:', '']
log_columns_metric = ['Time','Ep','Ct','LOSS','PPL','NLL','KL','REG','LOSS','PPL','NLL','KL','REG','LOSS','PPL', '1', '2', '3', 'CHILD', 'OTHER']
log_df = pd.DataFrame(columns=pd.MultiIndex.from_tuples(list(zip(log_columns_group, log_columns_metric))))

# Start from an empty model directory. The commands are passed as argument lists
# rather than whitespace-split strings, so a directory path containing spaces is
# treated as a single argument.
cmd_rm = ['rm', '-r', config.dir_model]
res = subprocess.call(cmd_rm)
cmd_mk = ['mkdir', config.dir_model]
res = subprocess.call(cmd_mk)

def update_checkpoint(config, checkpoint, global_step):
    """Track a newly saved checkpoint and prune the oldest one when over the limit.

    Args:
        config: object exposing `path_model` (checkpoint path prefix),
            `max_to_keep` (maximum number of tracked checkpoints) and
            `path_checkpoint` (file the tracking list is pickled to).
        checkpoint: list of checkpoint path prefixes, oldest first; mutated in place.
        global_step: integer step, appended to `path_model` as `-<step>`.

    Side effects:
        Deletes every file matching `<oldest prefix>.*` once the list grows past
        `config.max_to_keep`, and rewrites `config.path_checkpoint`.
    """
    checkpoint.append(config.path_model + '-%i' % global_step)
    if len(checkpoint) > config.max_to_keep:
        stale_prefix = checkpoint.pop(0)
        # Escape the prefix so glob metacharacters in the path ('[', '*', '?')
        # are matched literally; only the trailing '.*' acts as a wildcard.
        for stale_file in glob.glob(glob.escape(stale_prefix) + '.*'):
            os.remove(stale_file)
    # Close the handle deterministically so the list is flushed to disk at once.
    with open(config.path_checkpoint, 'wb') as f_checkpoint:
        cPickle.dump(checkpoint, f_checkpoint)

## initialize model

In [10]:
# Release the session left over from an earlier execution of this cell, if any.
if 'sess' in globals():
    sess.close()

# Build the model, then open a single-threaded session and initialise variables.
model = HierarchicalNeuralTopicModel(config)
sess = tf.Session(
    config=tf.ConfigProto(
        intra_op_parallelism_threads=1,
        inter_op_parallelism_threads=1,
    )
)
sess.run(tf.global_variables_initializer())

saver = tf.train.Saver(max_to_keep=config.max_to_keep)
update_tree_flg = False

## train & validate model

In [11]:
# Train until `config.n_epochs`. Every 10th epoch: validate, checkpoint when the
# validation perplexity improves, log metrics, print the topic tree, and (unless
# `config.static`) let the model restructure the tree.
while epoch < config.n_epochs:
    # train
    time_start = time.time()
    for ct, batch in train_batches:
        feed_dict = model.get_feed_dict(batch)
        fetches = [model.opt, model.loss, model.topic_loss_recon, model.topic_loss_kl, model.topic_loss_reg, model.topic_ppls, tf.train.get_global_step()]
        _, loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_reg_batch, ppls_batch, global_step_log = \
            sess.run(fetches, feed_dict=feed_dict)

        # These lists are never cleared, so the TRAIN columns below are running
        # averages over every batch seen since the log was initialised.
        losses_train += [[loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_reg_batch]]
        ppls_train += list(ppls_batch)

    if epoch % 10 == 0:
        # Covers only the training pass of this epoch.
        time_log = float(time.time() - time_start)

        # validate
        loss_train, topic_loss_recon_train, topic_loss_kl_train, topic_loss_reg_train = np.mean(losses_train, 0)
        ppl_train = np.exp(np.mean(ppls_train))
        loss_dev, topic_loss_recon_dev, topic_loss_kl_dev, topic_loss_reg_dev, ppl_dev, probs_topic_dev = validate(sess, dev_batches, model)

        # test
        # Test metrics are refreshed only on a new best validation perplexity;
        # otherwise the values from the previous best are logged again.
        if ppl_dev < ppl_min:
            ppl_min = ppl_dev
            loss_test, _, _, _, ppl_test, _ = validate(sess, test_batches, model)
            saver.save(sess, config.path_model, global_step=global_step_log)
            # `with` closes the file promptly instead of leaking the handle.
            with open(config.path_config % global_step_log, 'wb') as f_config:
                cPickle.dump(config, f_config)
            update_checkpoint(config, checkpoint, global_step_log)

        # visualize topic
        # Top `config.n_freq` bag-of-words columns per topic, highest weight first.
        topics_freq_indices = np.argsort(sess.run(model.topic_bow), 1)[:, ::-1][:, :config.n_freq]
        topics_freq_idxs = bow_idxs[topics_freq_indices]
        topic_freq_tokens = {topic_idx: [idx_to_word[idx] for idx in topic_freq_idxs] for topic_idx, topic_freq_idxs in zip(model.topic_idxs, topics_freq_idxs)}
        topic_prob_topic = {topic_idx: prob_topic for topic_idx, prob_topic in zip(model.topic_idxs, probs_topic_dev)}
        descendant_idxs = {parent_idx: get_descendant_idxs(model, parent_idx) for parent_idx in model.topic_idxs}
        # Sum of the topic probabilities over whatever get_descendant_idxs returns for each topic.
        recur_prob_topic = {parent_idx: np.sum([topic_prob_topic[child_idx] for child_idx in recur_child_idxs]) for parent_idx, recur_child_idxs in descendant_idxs.items()}

        depth_specs = get_topic_specialization(sess, model, instances_test)
        hierarchical_affinities = get_hierarchical_affinity(sess, model)

        # log
        clear_output()
        log_series = pd.Series([time_log, epoch, ct, \
                '%.2f'%loss_train, '%.0f'%ppl_train, '%.2f'%topic_loss_recon_train, '%.2f'%topic_loss_kl_train, '%.2f'%topic_loss_reg_train, \
                '%.2f'%loss_dev, ppl_dev, '%.2f'%topic_loss_recon_dev, '%.2f'%topic_loss_kl_dev, '%.2f'%topic_loss_reg_dev, \
                '%.2f'%loss_test, ppl_test, \
                '%.2f'%depth_specs[1], '%.2f'%depth_specs[2], '%.2f'%depth_specs[3], \
                '%.2f'%hierarchical_affinities[0], '%.2f'%hierarchical_affinities[1]],
                index=log_df.columns)
        log_df.loc[global_step_log] = log_series
        display(log_df)
        with open(config.path_log, 'wb') as f_log:
            cPickle.dump(log_df, f_log)
        print_topic_sample(sess, model, topic_prob_topic=topic_prob_topic, recur_prob_topic=recur_prob_topic, topic_freq_tokens=topic_freq_tokens)

        # update tree
        if not config.static:
            config.tree_idxs, update_tree_flg = model.update_tree(topic_prob_topic, recur_prob_topic)
            if update_tree_flg:
                print(config.tree_idxs)
                # The tree changed: snapshot the current values, rebuild the model
                # and session from the updated config, then copy the values back by name.
                name_variables = {tensor.name: variable for tensor, variable in zip(tf.global_variables(), sess.run(tf.global_variables()))} # store parameters
                if 'sess' in globals(): sess.close()
                model = HierarchicalNeuralTopicModel(config)
                sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1))
                name_tensors = {tensor.name: tensor for tensor in tf.global_variables()}
                sess.run([name_tensors[name].assign(variable) for name, variable in name_variables.items()]) # restore parameters
                # Same retention as the initial saver and update_checkpoint, so the
                # saver does not delete checkpoints the tracking list still references.
                saver = tf.train.Saver(max_to_keep=config.max_to_keep)

    train_batches = get_batches(instances_train, config.batch_size, iterator=True)
    epoch += 1

# Final summary: re-validate with the finished model and print the topic tree.
loss_dev, topic_loss_recon_dev, topic_loss_kl_dev, topic_loss_reg_dev, ppl_dev, probs_topic_dev = validate(sess, dev_batches, model)
topics_freq_indices = np.argsort(sess.run(model.topic_bow), 1)[:, ::-1][:, :config.n_freq]
topics_freq_idxs = bow_idxs[topics_freq_indices]
topic_freq_tokens = {topic_idx: [idx_to_word[idx] for idx in topic_freq_idxs] for topic_idx, topic_freq_idxs in zip(model.topic_idxs, topics_freq_idxs)}
topic_prob_topic = {topic_idx: prob_topic for topic_idx, prob_topic in zip(model.topic_idxs, probs_topic_dev)}
descendant_idxs = {parent_idx: get_descendant_idxs(model, parent_idx) for parent_idx in model.topic_idxs}
recur_prob_topic = {parent_idx: np.sum([topic_prob_topic[child_idx] for child_idx in recur_child_idxs]) for parent_idx, recur_child_idxs in descendant_idxs.items()}
display(log_df)
print_topic_sample(sess, model, topic_prob_topic=topic_prob_topic, recur_prob_topic=recur_prob_topic, topic_freq_tokens=topic_freq_tokens)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,VALID:,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,TEST:,Unnamed: 15_level_0,SPEC:,Unnamed: 17_level_0,Unnamed: 18_level_0,HIER:,Unnamed: 20_level_0
Unnamed: 0_level_1,Time,Ep,Ct,LOSS,PPL,NLL,KL,REG,LOSS,PPL,NLL,KL,REG,LOSS,PPL,1,2,3,CHILD,OTHER
125,4.060500,0,124,114.92,626,114.30,0.57,0.05,105.81,545.567444,105.42,0.38,0.00,103.96,544.425293,0.13,0.13,0.15,0.92,0.82
1375,1.097635,10,124,112.49,549,111.75,0.72,0.01,104.74,515.411743,103.92,0.82,0.00,102.85,513.129700,0.25,0.37,0.42,0.41,0.33
2625,1.658781,20,124,111.92,533,110.93,0.97,0.02,104.35,505.762207,103.19,1.16,0.01,102.46,501.423523,0.27,0.44,0.44,0.55,0.37
3875,1.447974,30,124,111.60,525,110.45,1.13,0.01,104.24,502.102783,102.92,1.32,0.00,102.33,498.876221,0.28,0.53,0.53,0.42,0.17
5125,1.417565,40,124,111.38,519,110.12,1.25,0.01,104.10,498.696655,102.65,1.44,0.00,102.21,495.193420,0.28,0.50,0.44,0.40,0.24
6375,1.479686,50,124,111.20,514,109.84,1.35,0.01,103.98,494.747101,102.42,1.56,0.00,102.06,490.224670,0.29,0.53,0.49,0.34,0.17
7625,1.596437,60,124,111.05,510,109.61,1.43,0.01,103.87,491.800385,102.21,1.66,0.00,101.95,486.694153,0.29,0.54,0.46,0.32,0.19
8875,1.508587,70,124,110.92,507,109.41,1.50,0.01,103.80,489.862488,102.07,1.73,0.00,101.90,484.744476,0.30,0.55,0.48,0.30,0.18
10125,1.400743,80,124,110.81,504,109.24,1.56,0.01,103.76,488.703186,101.98,1.78,0.00,101.86,483.624725,0.30,0.56,0.49,0.29,0.17
11375,1.888456,90,124,110.72,502,109.10,1.61,0.01,103.74,487.971313,101.92,1.82,0.00,101.84,482.780273,0.30,0.57,0.49,0.28,0.17


0 R: 1.000 P: 0.369 carry pockets strap room back quality nice - space work
   1 R: 0.280 P: 0.123 sleeve protection inside inch protect padding zipper air pro snug
     11 R: 0.074 P: 0.074 pocket mouse power room charger cord netbook small perfect front
     14 R: 0.024 P: 0.024 ipad price netbook amazon perfectly chromebook perfect sleeve found samsung
     12 R: 0.060 P: 0.060 ! love perfect recommend color mac ... buy perfectly price
   4 R: 0.089 P: 0.051 months zipper broke year years 've strap started bought handle
     42 R: 0.038 P: 0.038 & ; pro perfectly retina sleeve ! bought price air
   5 R: 0.096 P: 0.053 item return smell received reviews ! disappointed amazon 'm back
     52 R: 0.043 P: 0.043 ! color ordered purchase time received happy shipping arrived price
   3 R: 0.037 P: 0.013 leather nice bottom pretty back bit 've ... issue top
     31 R: 0.011 P: 0.011 color nice black easy cover logo air pro apple feel
     32 R: 0.012 P: 0.012 color love pink picture ! purpl

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,VALID:,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,TEST:,Unnamed: 15_level_0,SPEC:,Unnamed: 17_level_0,Unnamed: 18_level_0,HIER:,Unnamed: 20_level_0
Unnamed: 0_level_1,Time,Ep,Ct,LOSS,PPL,NLL,KL,REG,LOSS,PPL,NLL,KL,REG,LOSS,PPL,1,2,3,CHILD,OTHER
125,4.060500,0,124,114.92,626,114.30,0.57,0.05,105.81,545.567444,105.42,0.38,0.00,103.96,544.425293,0.13,0.13,0.15,0.92,0.82
1375,1.097635,10,124,112.49,549,111.75,0.72,0.01,104.74,515.411743,103.92,0.82,0.00,102.85,513.129700,0.25,0.37,0.42,0.41,0.33
2625,1.658781,20,124,111.92,533,110.93,0.97,0.02,104.35,505.762207,103.19,1.16,0.01,102.46,501.423523,0.27,0.44,0.44,0.55,0.37
3875,1.447974,30,124,111.60,525,110.45,1.13,0.01,104.24,502.102783,102.92,1.32,0.00,102.33,498.876221,0.28,0.53,0.53,0.42,0.17
5125,1.417565,40,124,111.38,519,110.12,1.25,0.01,104.10,498.696655,102.65,1.44,0.00,102.21,495.193420,0.28,0.50,0.44,0.40,0.24
6375,1.479686,50,124,111.20,514,109.84,1.35,0.01,103.98,494.747101,102.42,1.56,0.00,102.06,490.224670,0.29,0.53,0.49,0.34,0.17
7625,1.596437,60,124,111.05,510,109.61,1.43,0.01,103.87,491.800385,102.21,1.66,0.00,101.95,486.694153,0.29,0.54,0.46,0.32,0.19
8875,1.508587,70,124,110.92,507,109.41,1.50,0.01,103.80,489.862488,102.07,1.73,0.00,101.90,484.744476,0.30,0.55,0.48,0.30,0.18
10125,1.400743,80,124,110.81,504,109.24,1.56,0.01,103.76,488.703186,101.98,1.78,0.00,101.86,483.624725,0.30,0.56,0.49,0.29,0.17
11375,1.888456,90,124,110.72,502,109.10,1.61,0.01,103.74,487.971313,101.92,1.82,0.00,101.84,482.780273,0.30,0.57,0.49,0.28,0.17


0 R: 1.000 P: 0.390 carry pockets strap room back quality space - nice work
   1 R: 0.272 P: 0.109 sleeve protection inside inch protect zipper padding air pro snug
     11 R: 0.069 P: 0.069 pocket mouse power room charger cord netbook small perfect front
     12 R: 0.031 P: 0.031 ipad price netbook amazon perfectly chromebook perfect sleeve found recommend
     13 R: 0.062 P: 0.062 ! love perfect recommend color mac perfectly buy ... price
   4 R: 0.089 P: 0.052 months zipper broke year years 've started bought handle strap
     41 R: 0.037 P: 0.037 & ; pro perfectly sleeve ! retina bought air price
   5 R: 0.093 P: 0.052 item return smell reviews received ! disappointed 'm amazon back
     51 R: 0.041 P: 0.041 ! color ordered time purchase received happy shipping arrived price
   6 R: 0.028 P: 0.012 leather nice bottom back pretty issue top 've bit ...
     61 R: 0.008 P: 0.008 color black nice easy logo cover air apple pro light
     62 R: 0.008 P: 0.008 color love pink picture ! pu