In [1]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [2]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
import _pickle as cPickle
import time
import subprocess
import glob

import random
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import defaultdict, Counter
from ncrp import Topic, Doc, init, sample, sample_each, get_perplexity, get_topic_specialization, get_hierarchical_affinities, get_freq_tokens_ncrp, get_docs
from configure import get_config

# load config & data 

In [3]:
# Build the run configuration from the notebook name.
# NOTE(review): `nb_name` is assigned by the %%javascript cell through a kernel
# request; confirm it is already defined when this cell runs under "Run All".
config = get_config(nb_name)

# Seed Python's and NumPy's global random generators with the same configured value.
random.seed(config.seed)
np.random.seed(config.seed)

In [4]:
# Load the six pickled objects (train/dev/test instances, the two vocabulary maps
# and the bag-of-words indices). The context manager closes the file once loaded.
# NOTE: unpickling can execute arbitrary code - only load a data file you trust.
with open(config.path_data, 'rb') as data_file:
    (instances_train_tmp, instances_dev, instances_test,
     word_to_idx, idx_to_word, bow_idxs) = cPickle.load(data_file)

In [5]:
# Cap the training set at `config.size` instances, drawn without replacement;
# a set that is already within the limit is used unchanged.
instances_train = (
    instances_train_tmp
    if len(instances_train_tmp) <= config.size
    else np.random.choice(instances_train_tmp, config.size, replace=False)
)

In [6]:
# Record the corpus dimensions on the config, then echo them as the cell output.
config.n_doc, config.n_vocab = len(instances_train), len(bow_idxs)
(config.n_doc, config.n_vocab)

(31943, 1035)

# run

## initialize log

In [7]:
# Fresh run state: saved checkpoint paths (oldest first), best validation
# perplexity seen so far, and the epoch counter.
checkpoint = []
ppl_min = np.inf
epoch = 0

# Recreate the model directory from scratch. The commands are argument lists, so
# a directory path containing whitespace reaches `rm`/`mkdir` as one argument
# instead of being split into several. Return codes are stored but not checked,
# so a directory that does not exist yet does not stop the cell.
cmd_rm = ['rm', '-r', config.dir_model]
res = subprocess.call(cmd_rm)
cmd_mk = ['mkdir', config.dir_model]
res = subprocess.call(cmd_mk)

# Empty log table with two-level column labels: a group header (top row) paired
# with a metric name (bottom row).
log_df = pd.DataFrame(columns=pd.MultiIndex.from_tuples(list(zip(
    ['', '', '', 'TRAIN:', 'VALID:', 'TEST:', 'SPEC:', '', '', 'HIER:', ''],
    ['Time', 'Ep', 'Ct', 'PPL', 'PPL', 'PPL', '1', '2', '3', 'CHILD', 'OTHER'],
))))

def update_checkpoint(config, checkpoint, epoch):
    """Register the checkpoint for `epoch` and prune the oldest one if needed.

    Args:
        config: object providing `path_model` (checkpoint path prefix),
            `max_to_keep` (maximum number of checkpoints retained) and
            `path_checkpoint` (where the list of kept paths is pickled).
        checkpoint: list of checkpoint paths, oldest first; mutated in place.
        epoch: integer epoch number, appended to `path_model` as '-<epoch>'.

    Side effects:
        Deletes the oldest checkpoint file once more than `max_to_keep` paths
        are tracked, and rewrites the pickle at `config.path_checkpoint`.
    """
    checkpoint.append(config.path_model + '-%i' % epoch)
    if len(checkpoint) > config.max_to_keep:
        path_model = checkpoint.pop(0)
        # glob only returns paths that exist, so an already-deleted file is skipped.
        for p in glob.glob(path_model):
            os.remove(p)
    # Close the index file deterministically so the list is fully written to disk.
    with open(config.path_checkpoint, 'wb') as f:
        cPickle.dump(checkpoint, f)

## initialize data

In [8]:
# Root node of the topic tree: no parent, depth 0.
topic_root = Topic(parent=None, depth=0, idx='0', sibling_idx=0, config=config)

# Convert each split with get_docs, in train / dev / test order.
train_docs, dev_docs, test_docs = (
    get_docs(split, config)
    for split in (instances_train, instances_dev, instances_test)
)

# Initialise all three document sets against the shared root topic.
init(train_docs, dev_docs, test_docs, topic_root)

0 10000 20000 30000 0 0 

## run

In [None]:
# Training loop: each epoch runs one sampling pass over train/dev/test, evaluates
# perplexity, checkpoints on a new best validation score, and appends a log row.
while epoch < config.n_epochs:
    # Only the training pass is timed.
    time_start = time.time()
    sample_each(train_docs, topic_root, train=True)
    time_log = float(time.time() - time_start)

    sample_each(dev_docs, topic_root, train=False)
    sample_each(test_docs, topic_root, train=False)

    # Training perplexity is not computed; 0 is logged as a placeholder.
    # ppl_train = get_perplexity(train_docs, topic_root)
    ppl_train = 0
    ppl_dev = get_perplexity(dev_docs, topic_root)
    # NOTE(review): ppl_test is assigned only inside this branch; if the first
    # ppl_dev is not below the initial ppl_min (e.g. NaN), the log row below
    # raises NameError - confirm whether that case can occur.
    if ppl_dev < ppl_min:
        ppl_min = ppl_dev
        ppl_test = get_perplexity(test_docs, topic_root)
        # Write and close the model file before it is registered as a checkpoint.
        with open(config.path_model + '-%i' % epoch, 'wb') as model_file:
            cPickle.dump([test_docs, topic_root], model_file)
        update_checkpoint(config, checkpoint, epoch)

    depth_spec = get_topic_specialization(test_docs, topic_root)
    hierarchical_affinities = get_hierarchical_affinities(topic_root)

    # Replace the previous epoch's cell output with the refreshed table.
    clear_output()
    log_series = pd.Series(
        [time_log, epoch, 0,
         '%.0f' % ppl_train, ppl_dev, ppl_test,
         '%.2f' % depth_spec[1], '%.2f' % depth_spec[2], '%.2f' % depth_spec[3],
         '%.2f' % hierarchical_affinities[0], '%.2f' % hierarchical_affinities[1]],
        index=log_df.columns)
    log_df.loc[epoch] = log_series
    display(log_df)
    get_freq_tokens_ncrp(topic_root, idx_to_word, bow_idxs)

    # Persist the log every epoch; the context manager closes the handle each time.
    with open(config.path_log, 'wb') as log_file:
        cPickle.dump(log_df, log_file)
    epoch += 1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,VALID:,TEST:,SPEC:,Unnamed: 8_level_0,Unnamed: 9_level_0,HIER:,Unnamed: 11_level_0
Unnamed: 0_level_1,Time,Ep,Ct,PPL,PPL,PPL,1,2,3,CHILD,OTHER
0,64.42286,0,0,0,462.274369,459.049237,0.07,0.39,0.5,0.85,0.8
1,67.418818,1,0,0,456.995343,453.780588,0.08,0.42,0.52,0.8,0.74
2,68.242956,2,0,0,447.621214,445.118812,0.08,0.45,0.54,0.74,0.68
3,68.304093,3,0,0,427.833432,422.794769,0.09,0.47,0.55,0.69,0.62
4,70.976106,4,0,0,409.561489,404.191544,0.09,0.48,0.57,0.65,0.55
5,73.107332,5,0,0,381.758598,375.817172,0.11,0.44,0.59,0.57,0.42
6,72.700796,6,0,0,366.365535,359.759718,0.12,0.41,0.59,0.66,0.46
7,76.889822,7,0,0,362.807127,354.590459,0.11,0.39,0.6,0.63,0.42
8,72.683087,8,0,0,355.392937,346.34462,0.11,0.39,0.61,0.56,0.38
9,69.449128,9,0,0,342.718361,334.453574,0.1,0.41,0.61,0.5,0.33


 0 31943 303736.0 ! nice bought ; & price quality love sleeve perfect
   0-1 4365 31134.0 ! cover color love keyboard mac perfectly pro apple pink
     0-1-1 3119 2345.0 cover keyboard bottom keys screen mac beautiful deal board stays
     0-1-2 1245 989.0 service gift pleased orange shipping person delivery told colors saved
     0-1-3 1 0.0 zips fast floor flimsy flexible flaw flat flash flap fix
   0-2 4104 30492.0 pocket carry sleeve room power mouse ipad charger cord netbook
     0-2-1 4099 3536.0 camera inch extra room protection pens compartment plenty accessories pockets
     0-2-2 5 8.0 materials protected accessible overhead corners feels rating true flap flash
   0-3 5365 40716.0 carry handle strap straps pockets back shoulder work pack years
     0-3-1 3384 3922.0 camera gear pack lenses equipment carry clothes plenty lens room
     0-3-2 1981 2803.0 warranty months broke service customer started replacement seam contacted fell
   0-4 7983 54436.0 cover color pro air keyboa