In [1]:
%%javascript
// Sends a Python assignment to the kernel so that the Python variable `nb_name`
// holds this notebook's name (IPython.notebook.notebook_name).
// NOTE(review): relies on the front-end exposing a global `IPython.notebook`
// object; presumably only the classic Notebook UI does - confirm before running elsewhere.
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [2]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
import _pickle as cPickle
import time
import subprocess
import glob

import random
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import defaultdict, Counter
from ncrp import Topic, Doc, init, sample, get_perplexity, get_topic_specialization, get_hierarchical_affinities, get_freq_tokens_ncrp, get_docs
from configure import get_config

# load config & data 

In [3]:
# `nb_name` is assigned in the kernel by the %%javascript cell at the top of the notebook.
config = get_config(nb_name)

# Seed NumPy's global RNG first, then Python's `random`, both from config.seed.
for seed_rng in (np.random.seed, random.seed):
    seed_rng(config.seed)

In [4]:
# Load the pickled dataset: train/dev/test instances, the two vocabulary mappings and bow_idxs.
# NOTE(review): unpickling can execute arbitrary code; only load config.path_data from a trusted source.
# The `with` block closes the file handle as soon as loading finishes.
with open(config.path_data, 'rb') as f_data:
    instances_train, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(f_data)

In [5]:
# Store the number of training instances and the length of bow_idxs on the config,
# then echo both values as the cell output.
config.n_doc, config.n_vocab = len(instances_train), len(bow_idxs)
(config.n_doc, config.n_vocab)

(31943, 1035)

# run

## initialize log

In [6]:
# Run state shared with the training loop below.
checkpoint = []   # paths of the model snapshots currently kept on disk
ppl_min = np.inf  # best (lowest) dev perplexity seen so far
epoch = 0

# Start from an empty model directory: remove any previous one, then recreate it.
# The commands are passed as argument lists (not a split string) so that a
# config.dir_model containing whitespace is handled as a single path.
res = subprocess.call(['rm', '-r', config.dir_model])
res = subprocess.call(['mkdir', config.dir_model])

# Two-level column header for the per-epoch log table: (group, metric).
log_groups = ['', '', '', 'TRAIN:', 'VALID:', 'TEST:', 'SPEC:', '', '', 'HIER:', '']
log_metrics = ['Time', 'Ep', 'Ct', 'PPL', 'PPL', 'PPL', '1', '2', '3', 'CHILD', 'OTHER']
log_df = pd.DataFrame(columns=pd.MultiIndex.from_tuples(list(zip(log_groups, log_metrics))))

def update_checkpoint(config, checkpoint, epoch):
    """Register the snapshot for `epoch` and prune the oldest one if needed.

    Appends ``config.path_model + '-<epoch>'`` to `checkpoint` (mutated in
    place). When the list grows beyond ``config.max_to_keep`` the oldest entry
    is popped and every file matching it is deleted. The updated list is then
    pickled to ``config.path_checkpoint``.

    Args:
        config: object providing `path_model`, `max_to_keep` and `path_checkpoint`.
        checkpoint: list of snapshot paths, oldest first.
        epoch: integer epoch index used as the snapshot suffix.
    """
    checkpoint.append(config.path_model + '-%i' % epoch)
    if len(checkpoint) > config.max_to_keep:
        path_model = checkpoint.pop(0)
        for p in glob.glob(path_model):
            os.remove(p)
    # Use a context manager so the checkpoint index is flushed and closed deterministically.
    with open(config.path_checkpoint, 'wb') as f_checkpoint:
        cPickle.dump(checkpoint, f_checkpoint)

## initialize data

In [7]:
# Root topic of the hierarchy (depth 0, no parent).
topic_root = Topic(
    idx='0',
    sibling_idx=0,
    parent=None,
    depth=0,
    config=config,
)

# Build the document collections in train -> dev -> test order.
train_docs, dev_docs, test_docs = [
    get_docs(instances, config)
    for instances in (instances_train, instances_dev, instances_test)
]

init(train_docs, dev_docs, test_docs, topic_root)

0 10000 20000 30000 0 0 

## run training loop

In [None]:
# Training loop: continues from the current value of `epoch` until config.n_epochs.
time_start = time.time()
while epoch < config.n_epochs:
    sample(train_docs, dev_docs, test_docs, topic_root)
    ppl_train = get_perplexity(train_docs, topic_root)
    ppl_dev = get_perplexity(dev_docs, topic_root)

    # Only when dev perplexity improves: refresh the test perplexity and save a snapshot.
    # The logged TEST PPL therefore always belongs to the best-dev model so far.
    if ppl_dev < ppl_min:
        ppl_min = ppl_dev
        ppl_test = get_perplexity(test_docs, topic_root)
        # Context manager guarantees the snapshot file is flushed and closed.
        with open(config.path_model + '-%i' % epoch, 'wb') as f_model:
            cPickle.dump([test_docs, topic_root], f_model)
        update_checkpoint(config, checkpoint, epoch)

    depth_spec = get_topic_specialization(test_docs, topic_root)
    hierarchical_affinities = get_hierarchical_affinities(topic_root)

    clear_output()
    # Whole seconds spent on this epoch; the timer restarts for the next one.
    time_log = int(time.time() - time_start)
    time_start = time.time()
    log_series = pd.Series(
        [time_log, epoch, 0,
         '%.0f' % ppl_train, '%.0f' % ppl_dev, '%.0f' % ppl_test,
         '%.2f' % depth_spec[1], '%.2f' % depth_spec[2], '%.2f' % depth_spec[3],
         '%.2f' % hierarchical_affinities[0], '%.2f' % hierarchical_affinities[1]],
        index=log_df.columns)
    log_df.loc[epoch] = log_series
    display(log_df)
    get_freq_tokens_ncrp(topic_root, idx_to_word, bow_idxs)

    # Persist the full log table after every epoch.
    with open(config.path_log, 'wb') as f_log:
        cPickle.dump(log_df, f_log)
    epoch += 1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,VALID:,TEST:,SPEC:,Unnamed: 8_level_0,Unnamed: 9_level_0,HIER:,Unnamed: 11_level_0
Unnamed: 0_level_1,Time,Ep,Ct,PPL,PPL,PPL,PPL,1,2,3,CHILD
0,199,0,0,367,349,348,0.09,0.5,0.64,0.47,0.34
1,242,1,0,328,320,322,0.1,0.53,0.66,0.44,0.29
2,266,2,0,317,315,312,0.1,0.52,0.67,0.46,0.29
3,286,3,0,306,305,304,0.1,0.54,0.65,0.44,0.29
4,339,4,0,291,293,288,0.11,0.56,0.66,0.4,0.28
5,360,5,0,288,292,286,0.11,0.55,0.66,0.35,0.25
6,371,6,0,290,296,286,0.12,0.56,0.65,0.35,0.25
7,416,7,0,286,287,287,0.12,0.57,0.64,0.35,0.26
8,442,8,0,280,281,279,0.12,0.57,0.63,0.34,0.25
9,458,9,0,279,282,279,0.12,0.58,0.62,0.36,0.28


 0 31943 272017.0 ! nice bought price quality love made 'm put recommend
   0-1 1357 9752.0 ! love color cover perfect mac recommend perfectly pro easy
     0-1-2 185 132.0 wait sleek card doubt acer put ! print couple aspire
     0-1-4 535 334.0 compliments green picture glove sony clear cool quick retina professional
     0-1-5 637 618.0 keyboard cover mcover highly scratches smell blue wireless condition protector
   0-2 714 6344.0 travel carry handle security easy work airport trip compartments business
     0-2-1 123 100.0 deep paper class business fix walk entire dimensions easier week
     0-2-2 523 496.0 paperwork trips short luggage wheels worked airports system everyday open
     0-2-3 68 70.0 difference point storage doesnt main couple convenient protect extremely pens
   0-3 700 6375.0 pocket netbook power mouse cord room small charger perfect sleeve
     0-3-2 655 630.0 dvd player inside portable plenty adapter drive ipad chargers size
     0-3-3 41 31.0 gray seat width or

0 10000 20000 30000 0 0 