In [1]:
%%javascript
// Publish this notebook's file name to the Python kernel as the global `nb_name`.
// JSON.stringify produces a double-quoted, backslash-escaped literal, so a notebook name that
// contains quotes or backslashes can no longer break (or inject into) the generated statement.
// NOTE(review): this relies on the classic-Notebook `IPython.notebook` front-end object and on
// the kernel having processed the statement before a later cell reads `nb_name` — confirm.
IPython.notebook.kernel.execute('nb_name = ' + JSON.stringify(IPython.notebook.notebook_name))

<IPython.core.display.Javascript object>

In [2]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
import _pickle as cPickle
import time
import subprocess
import glob

import random
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import defaultdict, Counter
from ncrp import Topic, Doc, init, sample, sample_each, get_perplexity, get_topic_specialization, get_hierarchical_affinities, get_freq_tokens_ncrp, get_docs
from configure import get_config

# load config & data 

In [3]:
# Resolve the run configuration from the notebook name (`nb_name` is set by the JavaScript cell above).
config = get_config(nb_name)

# Seed Python's and NumPy's global RNGs from the same configured seed.
random.seed(config.seed)
np.random.seed(config.seed)

In [4]:
# Unpickle the six-element dataset bundle stored at config.path_data.
# The context manager closes the file as soon as loading finishes (it was previously left open).
# NOTE(review): unpickling can execute arbitrary code — only point config.path_data at trusted files.
with open(config.path_data, 'rb') as data_file:
    instances_train_tmp, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(data_file)

In [5]:
# Cap the training set at config.size instances, drawn without replacement;
# a set that is not larger than config.size is used unchanged.
instances_train = (
    np.random.choice(instances_train_tmp, config.size, replace=False)
    if len(instances_train_tmp) > config.size
    else instances_train_tmp
)

In [6]:
# Record the number of training documents and the size of bow_idxs on the config,
# then echo both values as the cell output.
config.n_doc, config.n_vocab = len(instances_train), len(bow_idxs)
(config.n_doc, config.n_vocab)

(16000, 1035)

# run

## initialize log

In [7]:
# State shared with the run cell: paths of the saved checkpoints, the best dev perplexity
# seen so far, and the epoch counter.
checkpoint = []
ppl_min = np.inf
epoch = 0

# Recreate the model directory from scratch. The commands are passed as argument lists rather
# than a formatted string split on whitespace, so a directory path containing spaces stays a
# single argument. Return codes are kept in `res` but not checked (e.g. `rm` fails harmlessly
# when the directory does not exist yet).
cmd_rm = ['rm', '-r', config.dir_model]
res = subprocess.call(cmd_rm)
cmd_mk = ['mkdir', config.dir_model]
res = subprocess.call(cmd_mk)

# Empty log table with a two-level column header: a group label on top, the metric name below.
log_df = pd.DataFrame(columns=pd.MultiIndex.from_arrays([
    ['', '', '', 'TRAIN:', 'VALID:', 'TEST:', 'SPEC:', '', '', 'HIER:', ''],
    ['Time', 'Ep', 'Ct', 'PPL', 'PPL', 'PPL', '1', '2', '3', 'CHILD', 'OTHER'],
]))

def update_checkpoint(config, checkpoint, epoch):
    """Register the checkpoint written for `epoch` and prune the oldest one if needed.

    Args:
        config: object providing `path_model` (checkpoint path prefix), `max_to_keep`
            (maximum number of checkpoints retained) and `path_checkpoint` (where the
            list of retained checkpoint paths is pickled).
        checkpoint: list of checkpoint paths, oldest first; mutated in place.
        epoch: integer epoch number, appended to `config.path_model` as '-<epoch>'.

    Side effects:
        Deletes the oldest checkpoint file from disk once more than `config.max_to_keep`
        paths are tracked, and rewrites `config.path_checkpoint` with the updated list.
    """
    checkpoint.append(config.path_model + '-%i' % epoch)
    if len(checkpoint) > config.max_to_keep:
        path_model = checkpoint.pop(0)
        # glob only returns paths that exist, so an already-missing file is skipped silently.
        for p in glob.glob(path_model):
            os.remove(p)
    # Close the file deterministically instead of leaving the handle to the garbage collector.
    with open(config.path_checkpoint, 'wb') as checkpoint_file:
        cPickle.dump(checkpoint, checkpoint_file)

## initialize data

In [8]:
# Create the root topic first, then pass each split through get_docs
# (train, dev, test — in that order) and hand everything to init.
topic_root = Topic(idx='0', sibling_idx=0, parent=None, depth=0, config=config)
train_docs, dev_docs, test_docs = (
    get_docs(split, config) for split in (instances_train, instances_dev, instances_test)
)
init(train_docs, dev_docs, test_docs, topic_root)

0 10000 0 0 

## run

In [None]:
# Main loop: one iteration per epoch until config.n_epochs is reached. `epoch`, `ppl_min`,
# `checkpoint` and `log_df` are created by the log-initialisation cell, so re-running this
# cell without re-running that one continues from the current epoch.
while epoch < config.n_epochs:
    # Only the training pass is timed.
    time_start = time.time()
    sample_each(train_docs, topic_root, train=True)
    time_log = float(time.time() - time_start)

    sample_each(dev_docs, topic_root, train=False)
    sample_each(test_docs, topic_root, train=False)

    # Training perplexity is disabled; 0 is logged as a placeholder.
    # ppl_train = get_perplexity(train_docs, topic_root)
    ppl_train = 0
    ppl_dev = get_perplexity(dev_docs, topic_root)
    if ppl_dev < ppl_min:
        # New best dev perplexity: refresh the reported test perplexity and snapshot the model.
        # NOTE(review): ppl_test is assigned only in this branch; if the very first dev
        # perplexity were NaN/inf the log line below would raise NameError — confirm.
        ppl_min = ppl_dev
        ppl_test = get_perplexity(test_docs, topic_root)
        # Context manager so the checkpoint file is flushed and closed before it is registered.
        with open(config.path_model + '-%i' % epoch, 'wb') as model_file:
            cPickle.dump([test_docs, topic_root], model_file)
        update_checkpoint(config, checkpoint, epoch)

    depth_spec = get_topic_specialization(test_docs, topic_root)
    hierarchical_affinities = get_hierarchical_affinities(topic_root)

    # Replace the previous epoch's output with the refreshed log table and topic listing.
    clear_output()
    log_series = pd.Series(
        [time_log, epoch, 0,
         '%.0f' % ppl_train, ppl_dev, ppl_test,
         '%.2f' % depth_spec[1], '%.2f' % depth_spec[2], '%.2f' % depth_spec[3],
         '%.2f' % hierarchical_affinities[0], '%.2f' % hierarchical_affinities[1]],
        index=log_df.columns)
    log_df.loc[epoch] = log_series
    display(log_df)
    get_freq_tokens_ncrp(topic_root, idx_to_word, bow_idxs)

    # Persist the log after every epoch; the file is closed as soon as the dump completes.
    with open(config.path_log, 'wb') as log_file:
        cPickle.dump(log_df, log_file)
    epoch += 1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,VALID:,TEST:,SPEC:,Unnamed: 8_level_0,Unnamed: 9_level_0,HIER:,Unnamed: 11_level_0
Unnamed: 0_level_1,Time,Ep,Ct,PPL,PPL,PPL,1,2,3,CHILD,OTHER
0,27.083441,0,0,0,464.759506,462.341135,0.05,0.34,0.48,0.84,0.76
1,26.972875,1,0,0,461.62692,458.39376,0.05,0.35,0.49,0.84,0.77
2,26.541026,2,0,0,458.313892,454.67946,0.05,0.36,0.5,0.82,0.76
3,26.835466,3,0,0,454.416582,449.97796,0.05,0.39,0.51,0.8,0.74
4,27.210857,4,0,0,451.349807,445.589286,0.05,0.4,0.52,0.76,0.72
5,27.530025,5,0,0,446.693861,442.760352,0.05,0.41,0.52,0.75,0.72
6,28.854706,6,0,0,441.46534,435.70628,0.05,0.44,0.53,0.72,0.69
7,28.808154,7,0,0,427.744377,419.566592,0.05,0.47,0.54,0.68,0.64
8,31.72002,8,0,0,416.807316,408.842215,0.05,0.49,0.55,0.63,0.57
9,29.332566,9,0,0,398.175745,389.695893,0.05,0.47,0.57,0.63,0.48


 0 16000 151421.0 ! nice bought love price quality perfect recommend made 'm
   0-1 3502 27114.0 cover ! color keyboard bottom mac apple love pro top
     0-1-2 3502 3131.0 mac purple scratch skin true scratches plastic dirty rubber protect
   0-2 1359 13844.0 carry pockets camera pack compartment straps shoulder comfortable compartments strap
     0-2-1 1359 1324.0 forward section zipper trips lens attached big compartment shoulders holder
   0-3 1879 16974.0 & ; pocket power mouse sleeve room cord netbook charger
     0-3-1 1879 1700.0 notebook chromebook mini touch port dvd inexpensive bulk surface storage
   0-4 1209 9401.0 months broke zipper strap handle started shoulder weeks year straps
     0-4-1 1209 1156.0 company lost years strong held files happen tear correctly month
   0-5 2751 18324.0 sleeve protection air pro inside inch smell zipper protect neoprene
     0-5-1 2751 2427.0 thick sleeve -- zippers tight star mind ; protective sleeves
   0-6 5300 33201.0 ! carry pockets 