In [1]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [2]:
# Load IPython's autoreload extension so imported modules can be re-imported
# without restarting the kernel.
%load_ext autoreload
# NOTE(review): per the IPython docs a bare `%autoreload` performs a single reload
# right now; `%autoreload 2` is the mode that reloads before every cell execution.
# Confirm which behaviour is intended.
%autoreload
import glob
import os
import pdb
import random
import shutil
import subprocess
import time
from collections import defaultdict, Counter

import _pickle as cPickle
import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import clear_output

from ncrp import Topic, Doc, init, sample, sample_each, get_perplexity, get_topic_specialization, get_hierarchical_affinities, get_freq_tokens_ncrp, get_docs
from configure import get_config

# load config & data 

In [3]:
# Build the run configuration from the notebook's file name; `nb_name` is the
# kernel global assigned by the %%javascript cell at the top of the notebook.
config = get_config(nb_name)
# Seed both NumPy's global RNG and Python's `random` module with the configured
# seed so that random draws made through either are repeatable.
np.random.seed(config.seed)
random.seed(config.seed)

In [4]:
# Load the pickled dataset bundle: train/dev/test instances, the two vocabulary
# mappings and `bow_idxs`.
# The `with` block closes the file handle as soon as loading finishes instead of
# leaving it to the garbage collector.
# NOTE(review): unpickling executes arbitrary code, so `config.path_data` must
# point at a trusted file.
with open(config.path_data, 'rb') as f_data:
    (instances_train_tmp, instances_dev, instances_test,
     word_to_idx, idx_to_word, bow_idxs) = cPickle.load(f_data)

In [5]:
# Cap the training set at `config.size` instances: draw a random subset without
# replacement when there are more than that, otherwise keep the full set as is.
instances_train = (
    np.random.choice(instances_train_tmp, config.size, replace=False)
    if len(instances_train_tmp) > config.size
    else instances_train_tmp
)

In [6]:
# Record the corpus dimensions on the config: number of training instances and
# number of entries in `bow_idxs`; the trailing expression displays both.
config.n_doc, config.n_vocab = len(instances_train), len(bow_idxs)
(config.n_doc, config.n_vocab)

(31943, 1035)

# run

## initialize log

In [7]:
# Fresh run state: no saved checkpoints yet, best validation perplexity so far is
# +inf, and training starts at epoch 0.
# NOTE: re-running this cell resets the epoch counter and wipes the model directory.
checkpoint = []
ppl_min = np.inf
epoch = 0

# Start from an empty model directory. The standard library is used instead of
# shelling out to `rm -r` / `mkdir`: it does not break on paths containing
# whitespace, works on platforms without those commands, tolerates a directory
# that does not exist yet, and creates missing parent directories.
shutil.rmtree(config.dir_model, ignore_errors=True)
os.makedirs(config.dir_model, exist_ok=True)

# Empty per-epoch log table with a two-level column header: the first level is the
# group label ('TRAIN:', 'VALID:', ...), the second level the metric name.
log_df = pd.DataFrame(columns=pd.MultiIndex.from_arrays([
    ['', '', '', 'TRAIN:', 'VALID:', 'TEST:', 'SPEC:', '', '', 'HIER:', ''],
    ['Time', 'Ep', 'Ct', 'PPL', 'PPL', 'PPL', '1', '2', '3', 'CHILD', 'OTHER'],
]))

def update_checkpoint(config, checkpoint, epoch):
    """Register the model file of `epoch` and evict the oldest one if over the limit.

    Args:
        config: object providing `path_model` (model path prefix),
            `max_to_keep` (maximum number of model files to retain) and
            `path_checkpoint` (where the checkpoint list is pickled).
        checkpoint: list of retained model paths, oldest first; mutated in place.
        epoch: integer epoch number, used as the `-<epoch>` suffix of the path.

    Side effects:
        Deletes the oldest model file from disk once more than
        `config.max_to_keep` paths are tracked, and rewrites
        `config.path_checkpoint` with the pickled, updated list.
    """
    checkpoint.append(config.path_model + '-%i' % epoch)
    if len(checkpoint) > config.max_to_keep:
        path_model = checkpoint.pop(0)
        # glob returns an empty list when the file is already gone, so a missing
        # file is skipped instead of raising.
        for p in glob.glob(path_model):
            os.remove(p)
    # Context manager guarantees the handle is flushed and closed even if
    # pickling fails.
    with open(config.path_checkpoint, 'wb') as f_checkpoint:
        cPickle.dump(checkpoint, f_checkpoint)

## initialize data

In [8]:
# Root node of the topic tree: index '0', depth 0, no parent.
topic_root = Topic(idx='0', sibling_idx=0, parent=None, depth=0, config=config)

# Convert each data split into documents, in train -> dev -> test order.
train_docs, dev_docs, test_docs = (
    get_docs(instances, config)
    for instances in (instances_train, instances_dev, instances_test)
)

# Hand all three document sets and the root topic to ncrp's `init`.
init(train_docs, dev_docs, test_docs, topic_root)

0 10000 20000 30000 0 0 

## run

In [None]:
# Main loop: one sampling pass per epoch, then evaluation, checkpointing and logging.
while epoch < config.n_epochs:
    # Only the training pass is timed; this is the value logged under 'Time'.
    time_start = time.time()
    sample_each(train_docs, topic_root, train=True)
    time_log = float(time.time() - time_start)

    # Passes over the held-out splits with train=False.
    # NOTE(review): presumably this leaves the model statistics untouched;
    # confirm in ncrp.sample_each.
    sample_each(dev_docs, topic_root, train=False)
    sample_each(test_docs, topic_root, train=False)

    # Training perplexity is not computed here; 0 is logged as a placeholder.
    # Use get_perplexity(train_docs, topic_root) to re-enable it.
    ppl_train = 0
    ppl_dev = get_perplexity(dev_docs, topic_root)
    if ppl_dev < ppl_min:
        # New best validation perplexity: refresh the reported test perplexity and
        # save the model. `ppl_test` therefore always belongs to the best-dev epoch.
        ppl_min = ppl_dev
        ppl_test = get_perplexity(test_docs, topic_root)
        path_model_epoch = config.path_model + '-%i' % epoch
        # `with` closes the file each epoch instead of leaking one handle per save.
        with open(path_model_epoch, 'wb') as f_model:
            cPickle.dump([test_docs, topic_root], f_model)
        update_checkpoint(config, checkpoint, epoch)

    depth_spec = get_topic_specialization(test_docs, topic_root)
    hierarchical_affinities = get_hierarchical_affinities(topic_root)

    # Replace the previous epoch's output with the updated log table and topic listing.
    clear_output()
    log_series = pd.Series(
        [time_log, epoch, 0,
         '%.0f' % ppl_train, ppl_dev, ppl_test,
         '%.2f' % depth_spec[1], '%.2f' % depth_spec[2], '%.2f' % depth_spec[3],
         '%.2f' % hierarchical_affinities[0], '%.2f' % hierarchical_affinities[1]],
        index=log_df.columns)
    log_df.loc[epoch] = log_series
    display(log_df)
    get_freq_tokens_ncrp(topic_root, idx_to_word, bow_idxs)

    # Persist the log after every epoch so an interrupted run keeps its history.
    with open(config.path_log, 'wb') as f_log:
        cPickle.dump(log_df, f_log)
    epoch += 1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,VALID:,TEST:,SPEC:,Unnamed: 8_level_0,Unnamed: 9_level_0,HIER:,Unnamed: 11_level_0
Unnamed: 0_level_1,Time,Ep,Ct,PPL,PPL,PPL,1,2,3,CHILD,OTHER
0,62.808908,0,0,0,466.850415,458.731271,0.05,0.37,0.49,0.8,0.74
1,67.381428,1,0,0,461.414072,452.161808,0.05,0.43,0.51,0.73,0.65
2,73.186637,2,0,0,451.947371,443.510666,0.05,0.45,0.53,0.67,0.58
3,74.043717,3,0,0,432.369924,422.854989,0.05,0.47,0.54,0.61,0.53
4,77.117927,4,0,0,410.778835,401.867535,0.05,0.47,0.58,0.58,0.45
5,78.179409,5,0,0,389.556786,376.962946,0.06,0.47,0.59,0.67,0.52
6,75.579591,6,0,0,376.084767,364.519701,0.06,0.42,0.58,0.57,0.42
7,72.640869,7,0,0,367.069761,354.535213,0.06,0.46,0.6,0.51,0.35
8,79.506375,8,0,0,357.347831,345.405122,0.07,0.49,0.61,0.61,0.43
9,78.372508,9,0,0,347.331579,334.999707,0.07,0.51,0.62,0.54,0.38


 0 31943 299536.0 ! nice bought love price quality perfect recommend 'm -
   0-1 7494 54223.0 ! cover color keyboard love mac bottom pro apple hard
     0-1-1 148 266.0 bottom top part started snap minimal mini break useless corners
     0-1-2 7346 8821.0 bottom speck cracked corners piece corner part crack mcover finish
   0-2 1930 16008.0 carry pockets handle back pack comfortable books work compartments shoulder
     0-2-1 1731 2293.0 camera plenty compartment pack lenses bottle carry equipment room gear
     0-2-2 199 158.0 normal rip eventually broken compact seam fabric manufacturer tall falls
   0-3 5282 42447.0 pocket room power & sleeve mouse ; charger cord carry
     0-3-1 5060 6188.0 compartment camera phone plenty pens pockets cell accessories cables chargers
     0-3-2 222 219.0 ; & stiff padding falls classy free heat machine tsa
   0-4 6530 44723.0 sleeve & ; air protection pro cover inside inch zipper
     0-4-1 1861 2592.0 smell strong odor chemical smells bad days rev