In [1]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [2]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
# Reproducibility-related environment settings, written before the numeric
# libraries below are imported.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = '0'
# Request a single thread from NumExpr, MKL and OpenMP through their
# environment variables.
# NOTE(review): presumably these only take effect if set before the
# corresponding libraries are first loaded - confirm.
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

import sys
import argparse
import subprocess
import pdb
import time
import random
import _pickle as cPickle
import matplotlib.pyplot as plt
import glob

%matplotlib inline

import numpy as np
import pandas as pd
import tensorflow as tf

from data_structure import get_batches
from gsm import GaussianSoftmaxModel
from rsm import RecurrentStickbreakingModel
from evaluation import validate, print_flat_topic_sample
from configure import get_config

# load data & set config

In [3]:
# Build the run configuration from the notebook's name. `nb_name` is not
# assigned by any Python code in this notebook: the %%javascript cell at the
# top asks the kernel to define it.
# NOTE(review): that assignment is issued from the browser, so it presumably
# may not have executed yet under "Restart & Run All" - confirm, or set
# nb_name explicitly before this cell.
config = get_config(nb_name)

In [4]:
# Select which CUDA device(s) are visible to this process. os.environ only
# accepts strings, so coerce the value in case config.gpu was given as an int.
os.environ["CUDA_VISIBLE_DEVICES"] = str(config.gpu)
# Seed NumPy's and Python's global random number generators from the config.
np.random.seed(config.seed)
random.seed(config.seed)

In [5]:
# Load the pickled data bundle: train/dev/test instances, the two vocabulary
# mappings and the bag-of-words index array.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
# The context manager closes the file handle once loading is done.
with open(config.path_data, 'rb') as f_data:
    instances_train, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(f_data)

# Split every data set into batches of the configured size.
train_batches = get_batches(instances_train, config.batch_size)
dev_batches = get_batches(instances_dev, config.batch_size)
test_batches = get_batches(instances_test, config.batch_size)

# Record the bag-of-words dimensionality on the config (one entry per bow index).
config.dim_bow = len(bow_idxs)

In [6]:
def debug_shape(variables, model):
    """Evaluate `variables` on one test batch and print the shape of each result.

    Relies on the notebook-level globals `sess` and `test_batches`. The feed
    dict is built by `model.get_feed_dict` from `test_batches[0][1]`.

    Args:
        variables: list of fetches passed to `sess.run`.
        model: object providing `get_feed_dict(batch)`.
    """
    feed = model.get_feed_dict(test_batches[0][1])
    evaluated = sess.run(variables, feed_dict=feed)
    for value, fetch in zip(evaluated, variables):
        # Fetches without a `name` attribute are reported by shape only.
        if not hasattr(fetch, 'name'):
            print(value.shape)
            continue
        print(fetch.name, ':', value.shape)

def debug_value(variables, model, return_value=False):
    """Evaluate `variables` on one test batch and return the results.

    Relies on the notebook-level globals `sess` and `test_batches`. The feed
    dict is built by `model.get_feed_dict` from `test_batches[0][1]`.

    Args:
        variables: fetches passed to `sess.run`.
        model: object providing `get_feed_dict(batch)`.
        return_value: accepted but not used by this function.

    Returns:
        Whatever `sess.run` returns for `variables`.
    """
    feed = model.get_feed_dict(test_batches[0][1])
    return sess.run(variables, feed_dict=feed)

# run

## initialize log

In [7]:
# Mutable training state shared with the training cell.
checkpoint = []        # checkpoint path prefixes, oldest first (see update_checkpoint)
losses_train = []      # one [loss, recon, kl, reg] row per training batch
ppls_train = []        # values collected from model.topic_ppls for every batch
ppl_min = np.inf       # best (lowest) validation perplexity seen so far
epoch = 0
# Re-create the training batches in iterator mode for the training loop.
train_batches = get_batches(instances_train, config.batch_size, iterator=True)

# Two-level column header for the log table: a group label (TRAIN:/VALID:/TEST:)
# on top and the metric name underneath.
log_groups = ['', '', '', 'TRAIN:', '', '', '', '', 'VALID:', '', '', '', '', 'TEST:', '']
log_fields = ['Time', 'Ep', 'Ct', 'LOSS', 'PPL', 'NLL', 'KL', 'REG', 'LOSS', 'PPL', 'NLL', 'KL', 'REG', 'LOSS', 'PPL']
log_df = pd.DataFrame(columns=pd.MultiIndex.from_tuples(list(zip(log_groups, log_fields))))

# Start from an empty model directory. The commands are passed as argument
# lists rather than a whitespace-split string, so a directory path that
# contains spaces is handled as a single argument.
# `rm -r` returning non-zero (e.g. directory absent on the first run) is ignored.
res = subprocess.call(['rm', '-r', config.dir_model])
res = subprocess.call(['mkdir', config.dir_model])

def update_checkpoint(config, checkpoint, global_step):
    """Register a new checkpoint and prune the oldest one beyond the keep limit.

    Args:
        config: object with `path_model` (checkpoint path prefix),
            `max_to_keep` (maximum number of checkpoints to retain) and
            `path_checkpoint` (file the checkpoint list is pickled to).
        checkpoint: list of checkpoint path prefixes, oldest first; mutated
            in place.
        global_step: integer step appended to `config.path_model` as `-<step>`.

    Side effects:
        Deletes every file named `<oldest prefix>.*` when the list grows past
        `config.max_to_keep`, and rewrites `config.path_checkpoint`.
    """
    checkpoint.append(config.path_model + '-%i' % global_step)
    if len(checkpoint) > config.max_to_keep:
        # Escape the prefix so glob metacharacters in the path ('[', '*', '?')
        # are matched literally; only the trailing '.*' acts as a wildcard.
        stale_pattern = glob.escape(checkpoint.pop(0)) + '.*'
        for stale_path in glob.glob(stale_pattern):
            os.remove(stale_path)
    # Use a context manager so the file is flushed and closed deterministically.
    with open(config.path_checkpoint, 'wb') as f_checkpoint:
        cPickle.dump(checkpoint, f_checkpoint)

## initialize model

In [8]:
# Close a session left over from a previous run of this cell, if any.
if 'sess' in globals(): sess.close()

# Resolve the configured model name to its class. Fail fast on an unknown name
# instead of hitting a NameError (fresh kernel) or silently reusing a `Model`
# left over from an earlier run (stale kernel).
model_classes = {'gsm': GaussianSoftmaxModel, 'rsm': RecurrentStickbreakingModel}
if config.model not in model_classes:
    raise ValueError('Unknown config.model %r; expected one of %s' % (config.model, sorted(model_classes)))
Model = model_classes[config.model]

model = Model(config)
# Single-threaded session, then initialise all variables.
sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1))
sess.run(tf.global_variables_initializer())
saver = tf.train.Saver(max_to_keep=config.max_to_keep)
update_tree_flg = False

## train & validate model

In [9]:
# Main training loop. Every `config.log_period` global steps it:
#   1. averages the training metrics accumulated so far,
#   2. evaluates on the dev set,
#   3. on a new best dev perplexity, evaluates on test and saves a checkpoint,
#   4. appends a row to `log_df`, pickles it and prints the top words per topic,
#   5. unless `config.static`, lets the model update its topic count and, if it
#      changed, rebuilds the model and copies the stored parameters over.
# NOTE(review): `loss_test` / `ppl_test` are first assigned when dev perplexity
# improves, and `topics_freq_tokens` when the first log step is reached; the
# logging code and the final print assume that has already happened.
time_start = time.time()
while epoch < config.n_epochs:
    # train
    for ct, batch in train_batches:
        feed_dict = model.get_feed_dict(batch)
        _, loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_reg_batch, ppls_batch, global_step_log = \
        sess.run([model.opt, model.loss, model.topic_loss_recon, model.topic_loss_kl, model.topic_loss_reg, model.topic_ppls, tf.train.get_global_step()], feed_dict = feed_dict)

        losses_train += [[loss_batch, topic_loss_recon_batch, topic_loss_kl_batch, topic_loss_reg_batch]]
        ppls_train += list(ppls_batch)

        if global_step_log % config.log_period == 0:
            # validate (training numbers are means over everything accumulated so far)
            loss_train, topic_loss_recon_train, topic_loss_kl_train, topic_loss_reg_train = np.mean(losses_train, 0)
            ppl_train = np.exp(np.mean(ppls_train))
            loss_dev, topic_loss_recon_dev, topic_loss_kl_dev, topic_loss_reg_dev, ppl_dev, probs_topic_dev = validate(sess, dev_batches, model)

            # test: only on a new best dev perplexity; also checkpoint model and config
            if ppl_dev < ppl_min:
                ppl_min = ppl_dev
                loss_test, _, _, _, ppl_test, _ = validate(sess, test_batches, model)
                saver.save(sess, config.path_model, global_step=global_step_log)
                # Context manager so the pickle file is closed deterministically.
                with open(config.path_config % global_step_log, 'wb') as f_config:
                    cPickle.dump(config, f_config)
                update_checkpoint(config, checkpoint, global_step_log)

            # visualize topic: indices of the `config.n_freq` largest entries in each
            # row of model.topic_bow, mapped through bow_idxs to word tokens
            topics_freq_indices = np.argsort(sess.run(model.topic_bow), 1)[:, ::-1][:, :config.n_freq]
            topics_freq_idxs = bow_idxs[topics_freq_indices]
            topics_freq_tokens = [[idx_to_word[idx] for idx in topic_freq_idxs] for topic_freq_idxs in topics_freq_idxs]

            # log
            clear_output()
            time_log = int(time.time() - time_start)
            log_series = pd.Series([time_log, epoch, ct, \
                    '%.2f'%loss_train, '%.0f'%ppl_train, '%.2f'%topic_loss_recon_train, '%.2f'%topic_loss_kl_train, '%.2f'%topic_loss_reg_train, \
                    '%.2f'%loss_dev, '%.0f'%ppl_dev, '%.2f'%topic_loss_recon_dev, '%.2f'%topic_loss_kl_dev, '%.2f'%topic_loss_reg_dev, \
                    '%.2f'%loss_test, '%.0f'%ppl_test],
                    index=log_df.columns)
            log_df.loc[global_step_log] = log_series
            display(log_df)
            with open(config.path_log, 'wb') as f_log:
                cPickle.dump(log_df, f_log)
            print_flat_topic_sample(sess, model, topics_freq_tokens=topics_freq_tokens)

            # update tree
            if not config.static:
                config.n_topic, update_flg, diff = model.update_topic(sess, dev_batches)
                print('Diff: %.6f' % diff)
                if update_flg:
                    print('Update to %i' % config.n_topic)
                    name_variables = {tensor.name: variable for tensor, variable in zip(tf.global_variables(), sess.run(tf.global_variables()))} # store parameters
                    sess.close()
                    model = Model(config)
                    sess = tf.Session(config=tf.ConfigProto(intra_op_parallelism_threads=1, inter_op_parallelism_threads=1))
                    name_tensors = {tensor.name: tensor for tensor in tf.global_variables()}
                    sess.run([name_tensors[name].assign(variable) for name, variable in name_variables.items()]) # restore parameters
                    # Keep the same retention limit as the initial saver and
                    # update_checkpoint, so files listed in `checkpoint` are not
                    # removed early by the saver itself.
                    saver = tf.train.Saver(max_to_keep=config.max_to_keep)

            # The Time column measures seconds since the previous log entry.
            time_start = time.time()

    # A batch iterator is consumed by one pass; build a fresh one for the next epoch.
    train_batches = get_batches(instances_train, config.batch_size, iterator=True)
    epoch += 1

display(log_df)
print_flat_topic_sample(sess, model, topics_freq_tokens=topics_freq_tokens)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,VALID:,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,TEST:,Unnamed: 15_level_0
Unnamed: 0_level_1,Time,Ep,Ct,LOSS,PPL,NLL,KL,REG,LOSS,PPL,NLL,KL,REG,LOSS,PPL
5000,41,10,9,111.76,595,110.95,0.80,0.01,104.37,506,103.54,0.83,0.00,102.53,502
10000,40,20,19,111.46,623,110.49,0.96,0.01,104.11,499,103.07,1.04,0.00,102.28,495
15000,43,30,29,111.29,632,110.21,1.07,0.01,104.29,504,103.12,1.17,0.00,102.28,495
20000,40,40,39,111.18,634,110.02,1.15,0.00,104.10,500,102.85,1.25,0.00,102.28,495
25000,40,50,49,111.09,632,109.88,1.21,0.00,104.04,497,102.76,1.28,0.00,101.99,486
30000,37,60,59,111.03,629,109.77,1.25,0.00,104.04,499,102.66,1.38,0.00,101.99,486
35000,36,70,69,110.97,625,109.68,1.29,0.00,104.15,503,102.72,1.42,0.00,101.99,486
40000,36,80,79,110.93,621,109.59,1.33,0.00,103.99,496,102.54,1.45,0.00,101.79,480
45000,36,90,89,110.89,617,109.52,1.36,0.00,103.82,489,102.35,1.47,0.00,101.94,485
50000,36,100,99,110.85,613,109.46,1.39,0.00,103.88,490,102.37,1.51,0.00,101.94,485


0 carry pockets room pocket small strap ! nice perfect hold
1 ! cover color love keyboard mac pro perfectly hard recommend
2 & ; sleeve protection inside air inch pro nice protect
3 bottom months top 've year broke back bought weeks years
4 $ smell amazon item ... return received ordered days ?
5 zipper flap zippers velcro zip close closed open - design
6 reviews read review reading decided based thought mentioned positive found
7 quality close bad thing pretty price ... bought ca stars
8 speck cheap buy plastic rubber cases pretty thin disappointed purchased
9 return poor broke quality returned refund warranty contacted targus disappointed
10 quality made problem 'm time day purchased back high seams
Diff: -0.000006


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,VALID:,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,TEST:,Unnamed: 15_level_0
Unnamed: 0_level_1,Time,Ep,Ct,LOSS,PPL,NLL,KL,REG,LOSS,PPL,NLL,KL,REG,LOSS,PPL
5000,41,10,9,111.76,595,110.95,0.80,0.01,104.37,506,103.54,0.83,0.00,102.53,502
10000,40,20,19,111.46,623,110.49,0.96,0.01,104.11,499,103.07,1.04,0.00,102.28,495
15000,43,30,29,111.29,632,110.21,1.07,0.01,104.29,504,103.12,1.17,0.00,102.28,495
20000,40,40,39,111.18,634,110.02,1.15,0.00,104.10,500,102.85,1.25,0.00,102.28,495
25000,40,50,49,111.09,632,109.88,1.21,0.00,104.04,497,102.76,1.28,0.00,101.99,486
30000,37,60,59,111.03,629,109.77,1.25,0.00,104.04,499,102.66,1.38,0.00,101.99,486
35000,36,70,69,110.97,625,109.68,1.29,0.00,104.15,503,102.72,1.42,0.00,101.99,486
40000,36,80,79,110.93,621,109.59,1.33,0.00,103.99,496,102.54,1.45,0.00,101.79,480
45000,36,90,89,110.89,617,109.52,1.36,0.00,103.82,489,102.35,1.47,0.00,101.94,485
50000,36,100,99,110.85,613,109.46,1.39,0.00,103.88,490,102.37,1.51,0.00,101.94,485


0 carry pockets room pocket small strap ! nice perfect hold
1 ! cover color love keyboard mac pro perfectly hard recommend
2 & ; sleeve protection inside air inch pro nice protect
3 bottom months top 've year broke back bought weeks years
4 $ smell amazon item ... return received ordered days ?
5 zipper flap zippers velcro zip close closed open - design
6 reviews read review reading decided based thought mentioned positive found
7 quality close bad thing pretty price ... bought ca stars
8 speck cheap buy plastic rubber cases pretty thin disappointed purchased
9 return poor broke quality returned refund warranty contacted targus disappointed
10 quality made problem 'm time day purchased back high seams
