In [3]:
%%javascript
IPython.notebook.kernel.execute('nb_name = "' + IPython.notebook.notebook_name + '"')

<IPython.core.display.Javascript object>

In [4]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
import _pickle as cPickle
import time
import subprocess
import glob

import random
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import defaultdict, Counter
from ncrp import Topic, Doc, init, sample, get_perplexity, get_topic_specialization, get_hierarchical_affinities, get_freq_tokens_ncrp, get_docs
from configure import get_config

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# load config & data 

In [5]:
config = get_config(nb_name)
np.random.seed(config.seed)
random.seed(config.seed)

In [6]:
instances_train, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(open(config.path_data,'rb'))

In [7]:
config.n_doc = len(instances_train)
config.n_vocab = len(bow_idxs)
config.n_doc, config.n_vocab

(31943, 1035)

# run

## initialize log

In [8]:
checkpoint = []
ppl_min = np.inf
epoch = 0

cmd_rm = 'rm -r %s' % config.dir_model
res = subprocess.call(cmd_rm.split())
cmd_mk = 'mkdir %s' % config.dir_model
res = subprocess.call(cmd_mk.split())

log_df = pd.DataFrame(columns=pd.MultiIndex.from_tuples(
                    list(zip(*[['','','','TRAIN:','VALID:','TEST:','SPEC:', '', '', 'HIER:', ''],
                            ['Time','Ep','Ct','PPL','PPL', 'PPL','1', '2', '3', 'CHILD', 'OTHER']]))))

def update_checkpoint(config, checkpoint, epoch):
    checkpoint.append(config.path_model + '-%i' % epoch)
    if len(checkpoint) > config.max_to_keep:
        path_model = checkpoint.pop(0)
        for p in glob.glob(path_model):
            os.remove(p)
    cPickle.dump(checkpoint, open(config.path_checkpoint, 'wb'))

## initialize data

In [9]:
topic_root = Topic(idx='0', sibling_idx=0, parent=None, depth=0, config=config)
train_docs = get_docs(instances_train, config)
dev_docs = get_docs(instances_dev, config)
test_docs = get_docs(instances_test, config)
init(train_docs, dev_docs, test_docs, topic_root)

0 10000 20000 30000 0 0 

## run

In [10]:
time_start = time.time()
while epoch < config.n_epochs:
    sample(train_docs, dev_docs, test_docs, topic_root)
    ppl_train = get_perplexity(train_docs, topic_root)
    ppl_dev = get_perplexity(dev_docs, topic_root)
    
    if ppl_dev < ppl_min:
        ppl_min = ppl_dev
        ppl_test = get_perplexity(test_docs, topic_root)
        cPickle.dump([test_docs, topic_root], open(config.path_model + '-%i'%epoch, 'wb'))
        update_checkpoint(config, checkpoint, epoch)
        
    depth_spec = get_topic_specialization(test_docs, topic_root)
    hierarchical_affinities = get_hierarchical_affinities(topic_root)
    
    clear_output()
    time_log = int(time.time() - time_start)
    time_start = time.time()
    log_series = pd.Series([time_log, epoch, 0, \
            '%.0f'%ppl_train, '%.0f'%ppl_dev, '%.0f'%ppl_test, \
            '%.2f'%depth_spec[1], '%.2f'%depth_spec[2], '%.2f'%depth_spec[3], \
            '%.2f'%hierarchical_affinities[0], '%.2f'%hierarchical_affinities[1]],
            index=log_df.columns)
    log_df.loc[epoch] = log_series    
    display(log_df)
    get_freq_tokens_ncrp(topic_root, idx_to_word, bow_idxs)
    
    cPickle.dump(log_df, open(config.path_log, 'wb'))
    epoch += 1

0 10000 20000 30000 0 0 

ValueError: shapes (1035,) and (0,) not aligned: 1035 (dim 0) != 0 (dim 0)

In [12]:
def get_topics(topic, topics=None):
    if topics is None: topics=[]
    topics.append(topic)
    for child in topic.children:
        topics = get_topics(child, topics)
    return topics

def get_cos_sim(parent_childs):
    parent_child_bows = {parent: np.array([child.prob_words/np.linalg.norm(child.prob_words) for child in childs]) for parent, childs in parent_childs.items()}
    cos_sim = np.mean([np.mean((parent.prob_words/np.linalg.norm(parent.prob_words)).dot(child_bows.T)) for parent, child_bows in parent_child_bows.items()])
    return cos_sim    

topics = get_topics(topic_root)
second_parents = [topic for topic in topics if topic.depth==1]
third_childs = [topic for topic in topics if topic.depth==2]
second_parent_childs = {parent: parent.children for parent in second_parents}
second_parent_unchilds = {parent: [child for child in third_childs if child not in second_parent_childs[parent]] for parent in second_parents}
if sum(len(unchilds) for unchilds in second_parent_unchilds.values()) > 0:
    child_cos_sim = get_cos_sim(second_parent_childs)
    unchild_cos_sim = get_cos_sim(second_parent_unchilds)
else:
    child_cos_sim = get_cos_sim(second_parent_childs)
    unchild_cos_sim = 0
return child_cos_sim, unchild_cos_sim

ValueError: shapes (1035,) and (0,) not aligned: 1035 (dim 0) != 0 (dim 0)

In [13]:
second_parent_childs

{<ncrp.Topic at 0x7f7d32752f60>: [<ncrp.Topic at 0x7f7d43cd9fd0>]}

0