In [1]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [2]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
import _pickle as cPickle
import time
import subprocess
import glob

import random
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import defaultdict, Counter
from ncrp import Topic, Doc, init, sample, get_perplexity, get_topic_specialization, get_hierarchical_affinities, get_freq_tokens_ncrp, get_docs
from configure import get_config

# load config & data 

In [3]:
# Build the run configuration from the notebook name set by the %%javascript cell.
config = get_config(nb_name)
# Seed both NumPy's and the stdlib's global RNGs from the configured seed.
np.random.seed(config.seed)
random.seed(config.seed)

In [4]:
# Load the pickled dataset: train/dev/test instances, the two vocabulary maps and bow_idxs.
# NOTE: unpickling can execute arbitrary code; only load config.path_data from a trusted source.
# The context manager closes the file handle as soon as loading finishes.
with open(config.path_data, 'rb') as f_data:
    (instances_train, instances_dev, instances_test,
     word_to_idx, idx_to_word, bow_idxs) = cPickle.load(f_data)

In [5]:
# Record dataset dimensions on the config object.
config.n_doc = len(instances_train)  # number of training instances
config.n_vocab = len(bow_idxs)  # number of entries in bow_idxs
# Bare last expression: displayed as the cell output.
config.n_doc, config.n_vocab

(9006, 1995)

# run

## initialize log

In [6]:
# Run state that the sampling loop reads and updates.
checkpoint = []    # paths of the model snapshots currently kept on disk, oldest first
ppl_min = np.inf   # lowest dev perplexity seen so far
epoch = 0

# Recreate the model directory from scratch. The commands are given as argument
# lists rather than built with str.split(), so a directory path that contains
# whitespace is passed to rm/mkdir as a single argument.
cmd_rm = ['rm', '-r', config.dir_model]
res = subprocess.call(cmd_rm)
cmd_mk = ['mkdir', config.dir_model]
res = subprocess.call(cmd_mk)

# Empty log table with two-level column headers: (group label, metric name).
log_df = pd.DataFrame(columns=pd.MultiIndex.from_tuples(list(zip(
    ['', '', '', 'TRAIN:', 'VALID:', 'TEST:', 'SPEC:', '', '', 'HIER:', ''],
    ['Time', 'Ep', 'Ct', 'PPL', 'PPL', 'PPL', '1', '2', '3', 'CHILD', 'OTHER']))))

def update_checkpoint(config, checkpoint, epoch):
    """Register the snapshot for `epoch` and prune the oldest one when over the limit.

    Args:
        config: object providing `path_model` (snapshot path prefix), `max_to_keep`
            (maximum number of snapshots retained) and `path_checkpoint` (file that
            receives the pickled list of retained snapshot paths).
        checkpoint: list of retained snapshot paths, oldest first; mutated in place.
        epoch: integer epoch number appended to `path_model` to form the snapshot path.
    """
    checkpoint.append(config.path_model + '-%i' % epoch)
    if len(checkpoint) > config.max_to_keep:
        stale_path = checkpoint.pop(0)
        # glob returns an empty list when nothing matches, so an already-missing
        # snapshot is skipped instead of raising from os.remove.
        for p in glob.glob(stale_path):
            os.remove(p)
    # The context manager flushes and closes the file rather than leaking the handle.
    with open(config.path_checkpoint, 'wb') as f_ckpt:
        cPickle.dump(checkpoint, f_ckpt)

## initialize data

In [7]:
# Root topic node: no parent, depth 0.
topic_root = Topic(config=config, idx='0', sibling_idx=0, parent=None, depth=0)

# Convert each split's instances to docs, in train -> dev -> test order.
train_docs, dev_docs, test_docs = [
    get_docs(split_instances, config)
    for split_instances in (instances_train, instances_dev, instances_test)
]

init(train_docs, dev_docs, test_docs, topic_root)

0 0 0 

## run sampling loop

In [8]:
time_start = time.time()
while epoch < config.n_epochs:
    # Call the sampler with all three splits, then score train and dev.
    sample(train_docs, dev_docs, test_docs, topic_root)
    ppl_train = get_perplexity(train_docs, topic_root)
    ppl_dev = get_perplexity(dev_docs, topic_root)

    # Test perplexity is recomputed (and a snapshot written) only when dev perplexity
    # improves, so the logged TEST PPL is the value from the best dev epoch so far.
    if ppl_dev < ppl_min:
        ppl_min = ppl_dev
        ppl_test = get_perplexity(test_docs, topic_root)
        # Close the snapshot file before the checkpoint list is updated.
        with open(config.path_model + '-%i' % epoch, 'wb') as f_model:
            cPickle.dump([test_docs, topic_root], f_model)
        update_checkpoint(config, checkpoint, epoch)

    depth_spec = get_topic_specialization(test_docs, topic_root)
    hierarchical_affinities = get_hierarchical_affinities(topic_root)

    clear_output()
    # Wall-clock seconds since the previous log row (or since the cell started).
    time_log = int(time.time() - time_start)
    time_start = time.time()
    log_series = pd.Series(
        [time_log, epoch, 0,
         '%.0f' % ppl_train, '%.0f' % ppl_dev, '%.0f' % ppl_test,
         '%.2f' % depth_spec[1], '%.2f' % depth_spec[2], '%.2f' % depth_spec[3],
         '%.2f' % hierarchical_affinities[0], '%.2f' % hierarchical_affinities[1]],
        index=log_df.columns)
    log_df.loc[epoch] = log_series
    display(log_df)
    get_freq_tokens_ncrp(topic_root, idx_to_word, bow_idxs)

    # Persist the log every epoch; the context manager closes the handle each time.
    with open(config.path_log, 'wb') as f_log:
        cPickle.dump(log_df, f_log)
    epoch += 1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,VALID:,TEST:,SPEC:,Unnamed: 8_level_0,Unnamed: 9_level_0,HIER:,Unnamed: 11_level_0
Unnamed: 0_level_1,Time,Ep,Ct,PPL,PPL,PPL,1,2,3,CHILD,OTHER
0,690,0,0,973,1236,1020,0.02,0.46,0.45,0.7,0.7
1,786,1,0,924,1169,969,0.02,0.47,0.46,0.74,0.74
2,846,2,0,877,1096,917,0.02,0.49,0.46,0.73,0.72
3,820,3,0,836,1045,876,0.02,0.5,0.46,0.7,0.7
4,792,4,0,807,1014,847,0.03,0.52,0.45,0.65,0.64
5,870,5,0,783,992,825,0.03,0.52,0.46,0.62,0.62
6,877,6,0,763,974,805,0.04,0.52,0.46,0.61,0.6
7,928,7,0,747,957,791,0.04,0.52,0.45,0.64,0.64
8,906,8,0,736,944,779,0.04,0.51,0.45,0.64,0.63
9,932,9,0,727,935,772,0.05,0.51,0.46,0.63,0.63


 0 9006 250438.0 write get article one like know use go think make
   0-1 2148 77758.0 game team year go play player get win think season
     0-1-1 6 1.0 remind bhj series dave knowledge prevent james throw washington legal
     0-1-2 1018 1011.0 dod motorcycle annual double sin ride wave bruce club pin
     0-1-3 1124 488.0 converter tank suppose copyright newspaper two day light auto united
   0-2 1275 116763.0 use file key program system window available image information encryption
     0-2-1 574 922.0 criminal steal brad drug warrant threat pen wiretap phone installation
     0-2-2 645 273.0 ask greece flight rob behavior chapter georgia fax pop uucp
     0-2-3 56 10.0 far display bunch somewhat indeed weight reference argue leave meg
   0-3 9 982.0 rg oo eus mr mw excellent mi mg module ax
     0-3-1 9 0.0 bhj series ram dave knowledge prevent james throw washington legal
   0-4 54 5637.0 pt la period vs pp van gm det play appear
     0-4-1 54 29.0 army make re et jon port imho 