In [1]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [2]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import _pickle as cPickle
import glob
import os
import pdb
import shutil
import subprocess
import time

import random
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import defaultdict, Counter
from ncrp import Topic, Doc, init, sample, sample_each, get_perplexity, get_topic_specialization, get_hierarchical_affinities, get_freq_tokens_ncrp, get_docs
from configure import get_config

# load config & data 

In [3]:
# Build the run configuration from the notebook name, then seed both the
# stdlib and NumPy global RNGs with the configured seed.
config = get_config(nb_name)
random.seed(config.seed)
np.random.seed(config.seed)

In [4]:
# Load the pickled dataset bundle: train/dev/test instances, the two vocabulary
# mappings and the bag-of-words index list.
# NOTE(security): unpickling executes arbitrary code - only load trusted files.
with open(config.path_data, 'rb') as f_data:  # closes the handle deterministically
    (instances_train_tmp, instances_dev, instances_test,
     word_to_idx, idx_to_word, bow_idxs) = cPickle.load(f_data)

In [5]:
# Cap the training set at config.size instances: draw a random subset without
# replacement when more are available, otherwise keep the loaded set unchanged.
instances_train = (
    np.random.choice(instances_train_tmp, config.size, replace=False)
    if len(instances_train_tmp) > config.size
    else instances_train_tmp
)

In [6]:
# Store the corpus dimensions on the config and echo them as the cell output.
config.n_vocab = len(bow_idxs)
config.n_doc = len(instances_train)
(config.n_doc, config.n_vocab)

(16000, 1035)

# run

## initialize log

In [7]:
# State carried across training epochs.
checkpoint = []    # model file paths registered by update_checkpoint, oldest first
ppl_min = np.inf   # lowest dev perplexity seen so far
epoch = 0

# Start from an empty model directory. Using shutil/os instead of shelling out
# to `rm -r` / `mkdir` keeps paths containing whitespace intact and does not
# depend on external commands; a missing directory is simply ignored.
shutil.rmtree(config.dir_model, ignore_errors=True)
os.makedirs(config.dir_model, exist_ok=True)

# Per-epoch log table with two-level column headers (group, metric).
log_df = pd.DataFrame(columns=pd.MultiIndex.from_tuples([
    ('', 'Time'), ('', 'Ep'), ('', 'Ct'),
    ('TRAIN:', 'PPL'), ('VALID:', 'PPL'), ('TEST:', 'PPL'),
    ('SPEC:', '1'), ('', '2'), ('', '3'),
    ('HIER:', 'CHILD'), ('', 'OTHER'),
]))

def update_checkpoint(config, checkpoint, epoch):
    """Register the model file for `epoch` and prune the oldest one if needed.

    Appends ``config.path_model + '-<epoch>'`` to `checkpoint`. If the list then
    holds more than ``config.max_to_keep`` entries, the oldest entry is popped
    and every file matching it is deleted. Finally the list is pickled to
    ``config.path_checkpoint``.

    Args:
        config: object providing ``path_model``, ``max_to_keep`` and
            ``path_checkpoint``.
        checkpoint: list of model file paths, oldest first; mutated in place.
        epoch: integer epoch number used as the file-name suffix.
    """
    checkpoint.append(config.path_model + '-%i' % epoch)
    if len(checkpoint) > config.max_to_keep:
        path_model = checkpoint.pop(0)
        # glob returns an empty list for a missing file, so nothing is removed
        # (and nothing raised) if the file is already gone.
        for p in glob.glob(path_model):
            os.remove(p)
    # Context manager so the checkpoint index is flushed and closed immediately.
    with open(config.path_checkpoint, 'wb') as f_checkpoint:
        cPickle.dump(checkpoint, f_checkpoint)

## initialize data

In [8]:
# Create the root topic, wrap each data split with get_docs (train, dev, test,
# in that order), then run init over all three splits and the root.
topic_root = Topic(
    idx='0',
    sibling_idx=0,
    parent=None,
    depth=0,
    config=config,
)
train_docs, dev_docs, test_docs = (
    get_docs(split_instances, config)
    for split_instances in (instances_train, instances_dev, instances_test)
)
init(train_docs, dev_docs, test_docs, topic_root)

0 10000 0 0 

## run

In [None]:
# Main loop: one sampling pass per epoch, followed by evaluation, checkpointing
# on dev-perplexity improvement, and logging.
while epoch < config.n_epochs:
    # Only the sampling pass over the training documents is timed.
    time_start = time.time()
    sample_each(train_docs, topic_root, train=True)
    time_log = float(time.time() - time_start)

    # Held-out splits are sampled with train=False.
    sample_each(dev_docs, topic_root, train=False)
    sample_each(test_docs, topic_root, train=False)

    # Training perplexity is not computed (the get_perplexity(train_docs, ...)
    # call is disabled); 0 is logged as a placeholder.
    ppl_train = 0
    ppl_dev = get_perplexity(dev_docs, topic_root)
    if ppl_dev < ppl_min:
        # New best dev perplexity: refresh the test score and snapshot the model.
        # ppl_test is only updated here, so the logged TEST PPL always belongs
        # to the best dev epoch so far.
        ppl_min = ppl_dev
        ppl_test = get_perplexity(test_docs, topic_root)
        # Context manager so the snapshot is fully written and closed before
        # update_checkpoint may prune older files.
        with open(config.path_model + '-%i' % epoch, 'wb') as f_model:
            cPickle.dump([test_docs, topic_root], f_model)
        update_checkpoint(config, checkpoint, epoch)

    depth_spec = get_topic_specialization(test_docs, topic_root)
    hierarchical_affinities = get_hierarchical_affinities(topic_root)

    # Replace the previous epoch's cell output with the refreshed log table.
    clear_output()
    log_series = pd.Series(
        [time_log, epoch, 0,  # the 'Ct' column is always logged as 0
         '%.0f' % ppl_train, ppl_dev, ppl_test,
         '%.2f' % depth_spec[1], '%.2f' % depth_spec[2], '%.2f' % depth_spec[3],
         '%.2f' % hierarchical_affinities[0], '%.2f' % hierarchical_affinities[1]],
        index=log_df.columns)
    log_df.loc[epoch] = log_series
    display(log_df)
    get_freq_tokens_ncrp(topic_root, idx_to_word, bow_idxs)

    # Persist the log after every epoch; the file is closed right away.
    with open(config.path_log, 'wb') as f_log:
        cPickle.dump(log_df, f_log)
    epoch += 1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,VALID:,TEST:,SPEC:,Unnamed: 8_level_0,Unnamed: 9_level_0,HIER:,Unnamed: 11_level_0
Unnamed: 0_level_1,Time,Ep,Ct,PPL,PPL,PPL,1,2,3,CHILD,OTHER
0,31.76331,0,0,0,468.740524,464.771365,0.08,0.09,0.45,0.6,0.0
1,25.434818,1,0,0,463.981695,459.972544,0.08,0.1,0.5,0.56,0.0
2,26.770624,2,0,0,463.436801,456.615157,0.08,0.31,0.52,0.77,0.75
3,33.217985,3,0,0,457.890237,451.517807,0.09,0.39,0.53,0.83,0.81
4,33.341108,4,0,0,448.125425,441.27832,0.09,0.34,0.54,0.72,0.69
5,35.522362,5,0,0,439.980861,432.56531,0.1,0.36,0.55,0.67,0.62
6,30.512976,6,0,0,429.159737,420.562958,0.11,0.38,0.58,0.61,0.55
7,34.596855,7,0,0,416.629805,408.689451,0.12,0.39,0.59,0.58,0.52
8,31.835475,8,0,0,403.828591,395.128497,0.14,0.37,0.6,0.53,0.46
9,32.193182,9,0,0,391.170559,380.748078,0.15,0.28,0.59,0.48,0.41


 0 16000 152193.0 ! nice ; bought & price love quality recommend perfect
   0-1 3856 27897.0 ! cover color keyboard love mac bottom pro apple hard
     0-1-2 31 19.0 pc fell offer organized broke swiss papers cheaply compartment weeks
     0-1-3 3825 3185.0 protector dirty pieces super black cracked hot buying ports typing
   0-2 3024 26366.0 carry pockets room comfortable plenty shoulder compartment pocket pack compartments
     0-2-1 8 2.0 worked wallet finally flexible flaw flat flash flap fix fitting
     0-2-2 3016 3417.0 camera lens lenses equipment canon tripod gear batteries flash body
   0-3 3016 23526.0 sleeve pocket inside power inch room netbook protection mouse zipper
     0-3-1 3016 2662.0 logic zippers loose compartment nicely barely slightly ipad basic vaio
   0-4 2586 18420.0 strap handle months zipper broke shoulder straps years ... year
     0-4-1 2534 2367.0 ... gift 've books higher compartment company entire pain roller
     0-4-2 52 63.0 description strap end ful