In [2]:
%%javascript
// Expose this notebook's file name to the Python kernel as `nb_name`.
// JSON.stringify emits a quoted, escaped literal, so names containing quotes or
// backslashes still form a valid Python string assignment.
IPython.notebook.kernel.execute('nb_name = ' + JSON.stringify(IPython.notebook.notebook_name))

<IPython.core.display.Javascript object>

In [12]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
import _pickle as cPickle
import time
import subprocess
import glob

import random
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import defaultdict, Counter
from ncrp import Topic, Doc, init, sample, get_perplexity, get_topic_specialization, get_hierarchical_affinities, get_freq_tokens_ncrp, get_docs, get_sum_cnt_words
from configure import get_config

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# load config & data 

In [4]:
# Build the run configuration from the notebook's file name; `nb_name` is set in the
# kernel by the %%javascript cell at the top of the notebook.
config = get_config(nb_name)
# Seed both NumPy's and Python's global RNGs with the configured seed.
np.random.seed(config.seed)
random.seed(config.seed)

In [5]:
# Load the pickled dataset tuple: train/dev/test instances, the word<->index maps and
# `bow_idxs`. The context manager closes the file handle once loading is done.
# NOTE: unpickling executes code embedded in the file; only load data you produced yourself.
with open(config.path_data, 'rb') as f_data:
    instances_train, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(f_data)

In [6]:
# Record corpus dimensions on the config: the number of training instances and the
# length of `bow_idxs`, which is used as the vocabulary size.
config.n_doc = len(instances_train)
config.n_vocab = len(bow_idxs)
# Show both values as the cell output.
config.n_doc, config.n_vocab

(31943, 1035)

In [7]:
# Per-document `bow` vectors of the training instances.
docs_bow = [instance.bow for instance in instances_train]
# For every document: one list per index with a positive count, repeating that index `count` times.
docs_raw = [[[bow_index]*int(doc_bow[bow_index]) for bow_index in np.where(doc_bow > 0)[0]] for doc_bow in docs_bow]
# Flatten the per-index lists into a single list of indices per document.
docs_words = [[idx for idxs in doc for idx in idxs] for doc in docs_raw]
# Total number of tokens across all training documents (shown as the cell output).
np.sum([len(doc_words) for doc_words in docs_words])

568401

# run

## initialize log

In [8]:
# Fresh run state: saved checkpoint paths, best validation perplexity so far, epoch counter.
checkpoint = []
ppl_min = np.inf
epoch = 0

# Recreate the model directory from scratch. Arguments are passed as argv lists rather
# than a whitespace-split command string, so a directory path containing spaces is
# removed/created as a single path instead of being split into several.
res = subprocess.call(['rm', '-r', config.dir_model])
res = subprocess.call(['mkdir', config.dir_model])

# Empty log table with a two-level column header (group label, metric name).
log_df = pd.DataFrame(columns=pd.MultiIndex.from_tuples(
                    list(zip(['','','','TRAIN:','VALID:','TEST:','SPEC:', '', '', 'HIER:', ''],
                             ['Time','Ep','Ct','PPL','PPL', 'PPL','1', '2', '3', 'CHILD', 'OTHER']))))

def update_checkpoint(config, checkpoint, epoch):
    """Register the model file saved for `epoch` and prune the oldest one if needed.

    Appends ``config.path_model + '-<epoch>'`` to `checkpoint` (mutated in place).
    When the list grows beyond ``config.max_to_keep``, the oldest entry is popped and
    every file matching it is deleted. The resulting list is pickled to
    ``config.path_checkpoint``.

    Args:
        config: object providing `path_model`, `max_to_keep` and `path_checkpoint`.
        checkpoint: list of saved model paths, oldest first.
        epoch: integer epoch index used as the file-name suffix.
    """
    checkpoint.append(config.path_model + '-%i' % epoch)
    if len(checkpoint) > config.max_to_keep:
        path_model = checkpoint.pop(0)
        # glob returns nothing for a file that is already gone, so a missing file is not an error.
        for p in glob.glob(path_model):
            os.remove(p)
    # The context manager guarantees the index file is flushed and closed.
    with open(config.path_checkpoint, 'wb') as f_checkpoint:
        cPickle.dump(checkpoint, f_checkpoint)

## initialize data

In [9]:
# Root of the topic tree: index '0', depth 0, no parent.
topic_root = Topic(idx='0', sibling_idx=0, parent=None, depth=0, config=config)
# Convert each data split with ncrp.get_docs.
train_docs = get_docs(instances_train, config)
dev_docs = get_docs(instances_dev, config)
test_docs = get_docs(instances_test, config)
# Initialise the tree with all three splits.
# NOTE(review): presumably assigns every document an initial path in the tree - confirm in ncrp.init.
init(train_docs, dev_docs, test_docs, topic_root)

0 10000 20000 30000 0 0 

## run

In [10]:
# Main sampling loop. `epoch`, `ppl_min` and `checkpoint` are not reset here, so
# re-running this cell after an interrupt continues from the current `epoch`.
time_start = time.time()
while epoch < config.n_epochs:
    sample(train_docs, dev_docs, test_docs, topic_root)
    ppl_train = get_perplexity(train_docs, topic_root)
    ppl_dev = get_perplexity(dev_docs, topic_root)

    # Test perplexity is refreshed and a checkpoint written only when validation
    # perplexity improves; otherwise the previous `ppl_test` is logged again.
    if ppl_dev < ppl_min:
        ppl_min = ppl_dev
        ppl_test = get_perplexity(test_docs, topic_root)
        # Context manager closes the model file instead of leaking one handle per save.
        with open(config.path_model + '-%i'%epoch, 'wb') as f_model:
            cPickle.dump([test_docs, topic_root], f_model)
        update_checkpoint(config, checkpoint, epoch)

    depth_spec = get_topic_specialization(test_docs, topic_root)
    hierarchical_affinities = get_hierarchical_affinities(topic_root)

    clear_output()
    # Seconds spent on this epoch; the timer restarts right after it is read.
    time_log = int(time.time() - time_start)
    time_start = time.time()
    log_series = pd.Series([time_log, epoch, 0,
            '%.0f'%ppl_train, '%.0f'%ppl_dev, '%.0f'%ppl_test,
            '%.2f'%depth_spec[1], '%.2f'%depth_spec[2], '%.2f'%depth_spec[3],
            '%.2f'%hierarchical_affinities[0], '%.2f'%hierarchical_affinities[1]],
            index=log_df.columns)
    log_df.loc[epoch] = log_series
    display(log_df)
    get_freq_tokens_ncrp(topic_root, idx_to_word, bow_idxs)

    # Persist the log after every epoch so an interrupted run keeps its history.
    with open(config.path_log, 'wb') as f_log:
        cPickle.dump(log_df, f_log)
    epoch += 1

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,VALID:,TEST:,SPEC:,Unnamed: 8_level_0,Unnamed: 9_level_0,HIER:,Unnamed: 11_level_0
Unnamed: 0_level_1,Time,Ep,Ct,PPL,PPL,PPL,1,2,3,CHILD,OTHER
0,222,0,0,367,349,348,0.09,0.5,0.64,0.47,0.34
1,269,1,0,328,320,322,0.1,0.53,0.66,0.44,0.29
2,302,2,0,317,315,312,0.1,0.52,0.67,0.46,0.29
3,324,3,0,306,305,304,0.1,0.54,0.65,0.44,0.29
4,342,4,0,298,294,296,0.1,0.54,0.65,0.37,0.27
5,356,5,0,291,293,288,0.11,0.56,0.66,0.4,0.28


 0 31943 280996.0 ! nice bought price quality love made perfect recommend put
   0-1 2021 14305.0 ! love color cover mac perfectly recommend pro apple easy
     0-1-2 162 322.0 keyboard cover pink green purple blue lighter darker hot picture
     0-1-4 65 53.0 dark gift negative blue black file complaint beautiful husband sizes
     0-1-5 1794 1753.0 cover keyboard perfectly smell purple kuzy typing packaging texture keys
   0-2 778 6616.0 carry work travel back airport pack handle seat security easy
     0-2-1 17 17.0 falls comfortably update purple wheels binder rain returned cost highly
     0-2-2 747 1044.0 camera roll equipment lenses trip compartments wheels travelling friendly compact
     0-2-3 14 9.0 sit sits floor messenger higher overnight stay recently personal feeling
   0-3 1763 13307.0 pocket sleeve power mouse netbook charger room small inch extra
     0-3-1 66 67.0 travelling usb accessory items suggest personally drives safe nice part
     0-3-2 178 225.0 inch toshiba

KeyboardInterrupt: 

In [13]:
# Sanity check: the value returned for the whole tree should equal the total number of
# training tokens computed from `docs_words` earlier in the notebook.
get_sum_cnt_words(topic_root)

568401.0

In [15]:
# Scratch cell: work through the child-selection computation by hand for the root topic
# and the first training document. The same steps are wrapped into functions further down.
topic = topic_root
doc = train_docs[0]
from scipy.special import gammaln

# Stack the word counts of every existing child and append one all-zero row.
if len(topic.children) > 0:
    children_cnt_words = np.concatenate([np.array([child.cnt_words for child in topic.children]), np.zeros([1, topic.config.n_vocab])], 0) # (Children+1) x Vocabulary
else:
    children_cnt_words = np.zeros([1, topic.config.n_vocab]) # (Children+1) x Vocabulary

cnt_words_doc = doc.cnt_words[None, :] # 1 x Vocabulary

# Per row: log-gamma terms with the row's counts alone minus/plus the same terms with
# the document's counts added, using `eta` as the additive smoothing constant.
logits_child_likelihood = gammaln(np.sum(children_cnt_words, -1) + topic.config.n_vocab*topic.config.eta) \
                    - np.sum(gammaln(children_cnt_words + topic.config.eta), -1) \
                    - gammaln(np.sum(children_cnt_words + cnt_words_doc, -1) + topic.config.n_vocab*topic.config.eta) \
                    + np.sum(gammaln(children_cnt_words + cnt_words_doc + topic.config.eta), -1)
# Exponentiating directly can underflow to 0 for very negative logits.
s_child_likelihood = np.exp(logits_child_likelihood)

# Prior weights: each child's document count, then `gam` for the appended row.
s_child_prior = [child.cnt_doc for child in topic.children]
s_child_prior += [topic.config.gam]
logits_child_prior = np.log(s_child_prior)

In [17]:
# Normalised likelihood * prior, computed in probability space.
s_child_likelihood * s_child_prior / np.sum(s_child_likelihood * s_child_prior)

array([8.24841793e-22, 4.82464953e-14, 7.66076645e-11, 1.71026880e-12,
       1.60369016e-18, 3.16964611e-16, 1.40075187e-13, 2.67633165e-14,
       3.85705196e-11, 8.59807887e-07, 1.79931112e-03, 6.24280982e-06,
       5.12144522e-11, 9.97551412e-01, 6.38559096e-04, 4.15368429e-08,
       3.57330384e-06])

In [18]:
# Same quantity obtained by summing the log-likelihood and log-prior first, then
# exponentiating and normalising.
logits_child = logits_child_likelihood + logits_child_prior
s_child = np.exp(logits_child)
s_child/np.sum(s_child)

array([8.24841793e-22, 4.82464953e-14, 7.66076645e-11, 1.71026880e-12,
       1.60369016e-18, 3.16964611e-16, 1.40075187e-13, 2.67633165e-14,
       3.85705196e-11, 8.59807887e-07, 1.79931112e-03, 6.24280982e-06,
       5.12144522e-11, 9.97551412e-01, 6.38559096e-04, 4.15368429e-08,
       3.57330384e-06])

In [20]:
# Hard assignment: one-hot vector with a 1 at the highest-scoring entry.
p_child = np.zeros_like(logits_child)
p_child[np.argmax(s_child)] = 1
p_child

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.])

In [None]:
def get_logits_child_prior(topic):
    """Return the log prior weight of each candidate child of `topic`.

    The weights are the `cnt_doc` of every existing child, in order, followed by
    ``topic.config.gam`` as the final entry; the result therefore has
    ``len(topic.children) + 1`` elements.

    The logarithm is taken in NumPy's extended-precision type. `np.longdouble` is used
    instead of `np.float128` because the latter name is missing on some platforms,
    while both refer to the same type wherever `np.float128` exists.
    """
    s_child_prior = [child.cnt_doc for child in topic.children] + [topic.config.gam]
    return np.log(s_child_prior, dtype=np.longdouble)

def get_logits_child_likelihood(topic, doc):
    """Score `doc`'s word counts against every candidate child of `topic`, in log space.

    One row of counts is built per existing child (its `cnt_words`), and an all-zero row
    is appended last, so the result has ``len(topic.children) + 1`` entries. With row
    counts n, document counts d, V = config.n_vocab and eta = config.eta, each entry is

        lgamma(sum(n) + V*eta) - sum(lgamma(n + eta))
          - lgamma(sum(n + d) + V*eta) + sum(lgamma(n + d + eta))
    """
    cfg = topic.config
    empty_row = np.zeros([1, cfg.n_vocab])  # the appended all-zero row
    if len(topic.children) == 0:
        children_cnt_words = empty_row  # (Children+1) x Vocabulary
    else:
        stacked = np.array([child.cnt_words for child in topic.children])
        children_cnt_words = np.concatenate([stacked, empty_row], 0)  # (Children+1) x Vocabulary

    cnt_words_doc = doc.cnt_words[None, :]  # 1 x Vocabulary
    smoothing_total = cfg.n_vocab * cfg.eta

    # Terms using the row counts alone.
    log_norm_before = gammaln(np.sum(children_cnt_words, -1) + smoothing_total)
    log_terms_before = np.sum(gammaln(children_cnt_words + cfg.eta), -1)
    # Terms after adding the document's counts to each row.
    log_norm_after = gammaln(np.sum(children_cnt_words + cnt_words_doc, -1) + smoothing_total)
    log_terms_after = np.sum(gammaln(children_cnt_words + cnt_words_doc + cfg.eta), -1)

    return log_norm_before - log_terms_before - log_norm_after + log_terms_after

def get_probs_child(topic, doc):
    """Return normalised probabilities over the candidate children of `topic` for `doc`.

    Adds the log prior (`get_logits_child_prior`) to the log likelihood
    (`get_logits_child_likelihood`) and applies a softmax. The result has one entry per
    element of those logit vectors and sums to 1.
    """
    logits_child_prior = get_logits_child_prior(topic)
    logits_child_likelihood = get_logits_child_likelihood(topic, doc)
    logits_child = logits_child_prior + logits_child_likelihood

    # Shift by the maximum, not the minimum: every exponent is then <= 0, so exp() cannot
    # overflow to inf (which would turn the normalisation into inf/inf = nan), and the
    # largest term is exactly 1, so the sum can never be zero.
    logits_child = logits_child - np.max(logits_child)
    s_child = np.exp(logits_child)

    p_child = s_child/np.sum(s_child)
    return p_child