In [1]:
%%javascript
// Executes `nb_name = "<notebook file name>"` in the Python kernel so that later
// cells can read the notebook's name from the `nb_name` variable.
// NOTE(review): the `IPython.notebook` JS object is presumably only provided by the
// classic Notebook front end, and the assignment is issued from the browser, so
// `nb_name` may not exist yet when the following cells run — confirm.
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [2]:
%load_ext autoreload
%autoreload
import glob
import os
import pdb
import _pickle as cPickle
import random
import shutil
import subprocess
import time
from collections import defaultdict, Counter

import numpy as np
import pandas as pd
import tensorflow as tf
from IPython.display import clear_output

from ncrp import Topic, Doc, init, sample, sample_each, get_perplexity, get_topic_specialization, get_hierarchical_affinities, get_freq_tokens_ncrp, get_docs
from configure import get_config

# load config & data 

In [3]:
# `nb_name` is not assigned by any Python cell; it is injected into the kernel by the
# %%javascript cell at the top of the notebook, which must have run first.
config = get_config(nb_name)
# Seed both NumPy's global RNG and the stdlib `random` module from the same config value.
np.random.seed(config.seed)
random.seed(config.seed)

In [4]:
# Load the preprocessed dataset: a 6-tuple of train/dev/test instances, the two
# vocabulary mappings and the bag-of-words index list.
# NOTE: unpickling executes arbitrary code — only point config.path_data at trusted files.
# A context manager is used so the file handle is closed as soon as loading finishes.
with open(config.path_data, 'rb') as data_file:
    instances_train_tmp, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(data_file)

In [5]:
# Cap the training set at config.size instances: draw that many without replacement
# when more are available, otherwise keep the loaded training instances unchanged.
instances_train = (
    np.random.choice(instances_train_tmp, config.size, replace=False)
    if len(instances_train_tmp) > config.size
    else instances_train_tmp
)

In [6]:
# Record the (possibly subsampled) training-set size and the vocabulary size on the
# shared config object, which is later handed to Topic and get_docs.
config.n_doc = len(instances_train)
config.n_vocab = len(bow_idxs)
# Trailing expression: displays both sizes as the cell output.
config.n_doc, config.n_vocab

(8000, 1035)

# run

## initialize log

In [7]:
# Fresh run state: paths of the checkpoints currently kept (oldest first), the best
# validation perplexity seen so far, and the epoch counter used by the main loop.
checkpoint = []
ppl_min = np.inf
epoch = 0

# Start from an empty model directory.
# WARNING: this deletes every file left in config.dir_model by a previous run.
# shutil/os are used instead of shelling out to `rm -r` / `mkdir` so that paths with
# spaces are handled, the cell works on platforms without those commands, and missing
# parent directories are created. Like the shell version, removal is best-effort.
shutil.rmtree(config.dir_model, ignore_errors=True)
os.makedirs(config.dir_model, exist_ok=True)

# Per-epoch log table with two-level column headers (group label / metric name).
log_df = pd.DataFrame(columns=pd.MultiIndex.from_arrays([
    ['', '', '', 'TRAIN:', 'VALID:', 'TEST:', 'SPEC:', '', '', 'HIER:', ''],
    ['Time', 'Ep', 'Ct', 'PPL', 'PPL', 'PPL', '1', '2', '3', 'CHILD', 'OTHER'],
]))

def update_checkpoint(config, checkpoint, epoch):
    """Register the checkpoint for `epoch` and prune the oldest one past the keep limit.

    Args:
        config: object providing `path_model` (checkpoint path prefix), `max_to_keep`
            (maximum number of checkpoints retained) and `path_checkpoint` (where the
            list of retained checkpoint paths is pickled).
        checkpoint: list of retained checkpoint paths, oldest first. Mutated in place.
        epoch: integer epoch number; the checkpoint path is `path_model + '-<epoch>'`.

    Side effects:
        Deletes the oldest checkpoint from disk when the list grows beyond
        `config.max_to_keep`, and rewrites `config.path_checkpoint` on every call.
    """
    checkpoint.append(config.path_model + '-%i' % epoch)
    if len(checkpoint) > config.max_to_keep:
        path_stale = checkpoint.pop(0)
        # glob returns nothing for a path that no longer exists, so an already-removed
        # checkpoint is skipped instead of making os.remove raise.
        for p in glob.glob(path_stale):
            os.remove(p)
    # Context manager guarantees the index file is flushed and closed before returning.
    with open(config.path_checkpoint, 'wb') as f_index:
        cPickle.dump(checkpoint, f_index)

## initialize data

In [8]:
# Root of the topic tree: index '0', depth 0, no parent.
topic_root = Topic(idx='0', sibling_idx=0, parent=None, depth=0, config=config)
# Build one document collection per split from the raw instances.
train_docs = get_docs(instances_train, config)
dev_docs = get_docs(instances_dev, config)
test_docs = get_docs(instances_test, config)
# Initialise all three collections against the root topic.
# NOTE(review): presumably assigns each document an initial path in the tree — confirm in ncrp.init.
init(train_docs, dev_docs, test_docs, topic_root)

0 0 0 

## run sampling epochs

In [9]:
# Main loop, one iteration per epoch: sample on the training documents, then on the
# dev/test documents with train=False, checkpoint whenever validation perplexity
# improves, and append one row to the log table.
# `epoch`, `ppl_min` and `checkpoint` come from the "initialize log" cell, so re-running
# only this cell resumes from the current epoch instead of restarting.
while epoch < config.n_epochs:
    # Only the training sweep is timed.
    time_start = time.time()
    sample_each(train_docs, topic_root, train=True)
    time_log = float(time.time() - time_start)

    sample_each(dev_docs, topic_root, train=False)
    sample_each(test_docs, topic_root, train=False)

    # Training perplexity is not computed (the get_perplexity(train_docs, topic_root)
    # call is disabled); 0 is logged as a placeholder in the TRAIN column.
    ppl_train = 0
    ppl_dev = get_perplexity(dev_docs, topic_root)
    if ppl_dev < ppl_min:
        # New best validation perplexity: refresh the reported test perplexity and save
        # a checkpoint. `ppl_test` is only updated here, so the TEST column always shows
        # the test perplexity at the best validation epoch so far.
        ppl_min = ppl_dev
        ppl_test = get_perplexity(test_docs, topic_root)
        # Context manager so the checkpoint is fully written and closed before
        # update_checkpoint may prune older files.
        with open(config.path_model + '-%i' % epoch, 'wb') as f_model:
            cPickle.dump([test_docs, topic_root], f_model)
        update_checkpoint(config, checkpoint, epoch)

    depth_spec = get_topic_specialization(test_docs, topic_root)
    hierarchical_affinities = get_hierarchical_affinities(topic_root)

    # Replace the previous epoch's output with the updated table and topic listing.
    clear_output()
    log_series = pd.Series(
        [time_log, epoch, 0,
         '%.0f' % ppl_train, ppl_dev, ppl_test,
         '%.2f' % depth_spec[1], '%.2f' % depth_spec[2], '%.2f' % depth_spec[3],
         '%.2f' % hierarchical_affinities[0], '%.2f' % hierarchical_affinities[1]],
        index=log_df.columns)
    log_df.loc[epoch] = log_series
    display(log_df)
    get_freq_tokens_ncrp(topic_root, idx_to_word, bow_idxs)

    # Persist the log after every epoch so an interrupted run keeps its history.
    with open(config.path_log, 'wb') as f_log:
        cPickle.dump(log_df, f_log)
    epoch += 1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,VALID:,TEST:,SPEC:,Unnamed: 8_level_0,Unnamed: 9_level_0,HIER:,Unnamed: 11_level_0
Unnamed: 0_level_1,Time,Ep,Ct,PPL,PPL,PPL,1,2,3,CHILD,OTHER
0,14.114480,0,0,0,467.514593,462.777199,0.05,0.24,0.47,0.73,0.00
1,13.732121,1,0,0,463.018875,458.481271,0.05,0.26,0.51,0.70,0.00
2,15.588953,2,0,0,459.324331,452.719528,0.05,0.40,0.52,0.85,0.83
3,15.223833,3,0,0,456.796927,450.221118,0.05,0.41,0.52,0.84,0.82
4,15.619980,4,0,0,453.925747,446.784091,0.05,0.41,0.52,0.84,0.82
5,15.027381,5,0,0,450.553226,445.293741,0.05,0.42,0.53,0.83,0.81
6,16.093080,6,0,0,443.973233,439.227389,0.06,0.42,0.54,0.82,0.79
7,15.232101,7,0,0,438.381552,431.743910,0.06,0.43,0.55,0.81,0.77
8,16.199103,8,0,0,432.013519,427.885273,0.06,0.45,0.55,0.78,0.74
9,17.844921,9,0,0,425.166255,420.737426,0.06,0.48,0.55,0.75,0.69


 0 8000 79247.0 ! nice bought price love quality perfect recommend 'm made
   0-1 1750 14743.0 carry pockets room shoulder comfortable pocket camera compartments plenty strap
     0-1-2 1750 1509.0 luggage camera items larger travel things wear chargers tablet netbook
   0-2 2364 16160.0 ! cover color keyboard mac love bottom apple scratches pro
     0-2-1 2364 1778.0 edges easy glad days protector months keys complain fine cracked
   0-3 1686 11001.0 & ; strap zipper handle broke months shoulder return straps
     0-3-1 1686 1527.0 bags zippers ended swissgear professional daughter pull zipper shell cheap
   0-4 2096 14251.0 sleeve pocket inch protection power netbook inside mouse charger room
     0-4-1 2096 1750.0 cords - charger protected tight keyboard expect usb external pattern
   0-5 104 855.0 cards memory sd card slots video cloth kind pictures tab
     0-5-1 104 110.0 slots fact future half flash heavy devices ordering organized grab
