In [38]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
# Environment knobs for reproducibility; they are assigned before numpy /
# tensorflow are imported further down in this cell.
# NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
# assigning it here presumably only affects child processes - confirm.
os.environ['PYTHONHASHSEED'] = '0'
# Request a single thread from the numexpr / MKL / OpenMP back ends.
os.environ["NUMEXPR_NUM_THREADS"] = "1"
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

import sys
import argparse
import subprocess
import pdb
import time
import random
import _pickle as cPickle
import matplotlib.pyplot as plt

%matplotlib inline

import numpy as np
import pandas as pd
import tensorflow as tf

from data_structure import get_batches
from hntm import HierarchicalNeuralTopicModel
from tree import get_descendant_idxs
from evaluation import validate, get_topic_specialization, get_hierarchical_affinity, print_topic_sample
from coherence import compute_word_count, compute_coherence
from configure import get_config
from ncrp import get_docs, get_freq_tokens_ncrp

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Expose only device '0' through the CUDA_VISIBLE_DEVICES environment variable.
os.environ.update(CUDA_VISIBLE_DEVICES='0')

In [90]:
def load_model(config, name_model, nb_name, index=-1):
    """Restore a saved model and its configuration from the 'model' directory.

    The model directory is ``model/<config.data>/<name_model>/<run>``, where
    ``<run>`` is every whitespace-separated token of ``nb_name`` except the
    first one, concatenated without separators. The pickled ``checkpoint``
    file in that directory is indexed with ``index`` to obtain the path of
    the snapshot to restore.

    Args:
        config: object with a ``data`` attribute naming the dataset directory.
        name_model: ``'hntm'`` or ``'ncrp'``.
        nb_name: notebook/run name, e.g. ``'2 bags -tree 33 -temp 10 -seed 0'``.
        index: position in the checkpoint list (default ``-1``, the last entry).

    Returns:
        For ``'hntm'``: ``(sess, model, config)`` where ``config`` is the one
        unpickled from ``<snapshot>.config`` (it replaces the ``config``
        argument) and ``sess`` is an open ``tf.Session`` the caller must close.
        For ``'ncrp'``: ``(test_docs, topic_root, config)`` where ``config``
        comes from ``get_config(nb_name)``.

    Raises:
        ValueError: if ``name_model`` is not one of the supported names.
    """
    # Fail early with a clear message instead of silently returning None,
    # which would only surface later as an unpacking error at the call site.
    if name_model not in ('hntm', 'ncrp'):
        raise ValueError("unknown name_model %r (expected 'hntm' or 'ncrp')" % (name_model,))

    dir_model = os.path.join('model', config.data, name_model, ''.join(nb_name.split()[1:]))

    # NOTE: pickle can execute arbitrary code; only load files produced by
    # trusted training runs.
    with open(os.path.join(dir_model, 'checkpoint'), 'rb') as f_ckpt:
        ckpt = cPickle.load(f_ckpt)
    path_restore = ckpt[index]
    print('loading %s...' % path_restore)

    if name_model == 'hntm':
        path_config = path_restore + '.config'
        with open(path_config, 'rb') as f_config:
            config = cPickle.load(f_config)
        # NOTE(review): the model is built in the current default graph; if the
        # constructor does not reset the graph, loading several models in one
        # kernel may need a graph reset first - confirm.
        model = HierarchicalNeuralTopicModel(config)
        sess = tf.Session()
        try:
            saver = tf.train.Saver()
            saver.restore(sess, path_restore)
        except Exception:
            # Do not leak the session when the restore fails.
            sess.close()
            raise
        return sess, model, config

    # name_model == 'ncrp'
    with open(path_restore, 'rb') as f_model:
        test_docs, topic_root = cPickle.load(f_model)
    config = get_config(nb_name)
    return test_docs, topic_root, config

In [81]:
def get_freq_tokens(sess, model, bow_idxs, idx_to_word, topic_freq_tokens=None, parent_idx=0, depth=0, topn=10):
    """Print the topic tree with each topic's top words and return those words.

    At ``depth == 0`` the function evaluates ``model.topic_bow`` with
    ``sess.run``, takes the ``topn`` highest-scoring columns of every row,
    maps them through ``bow_idxs`` and ``idx_to_word`` and builds a dict
    keyed by the entries of ``model.topic_idxs`` (rows and topic indices are
    paired positionally). It then prints the root topic and walks
    ``model.tree_idxs`` depth-first, printing every child indented by depth.

    Args:
        sess: object whose ``run`` evaluates ``model.topic_bow``.
        model: object exposing ``topic_bow``, ``topic_idxs`` and ``tree_idxs``
            (a mapping from parent topic index to its child indices).
        bow_idxs: array mapping a column position to a vocabulary index.
        idx_to_word: mapping from vocabulary index to token.
        topic_freq_tokens: precomputed dict, used by the recursive calls;
            ignored and recomputed when ``depth == 0``.
        parent_idx: topic whose children are printed (default ``0``, the root).
        depth: current recursion depth; callers normally leave it at ``0``.
        topn: number of top words kept per topic (default ``10``).

    Returns:
        dict mapping topic index to the list of its ``topn`` top tokens.
    """
    if depth == 0:
        topic_bow = sess.run(model.topic_bow)
        # Sort every row ascending, reverse to descending, keep the best topn.
        topics_freq_indices = np.argsort(topic_bow, 1)[:, ::-1][:, :topn]
        topics_freq_idxs = bow_idxs[topics_freq_indices]
        topic_freq_tokens = {
            topic_idx: [idx_to_word[idx] for idx in topic_freq_idxs]
            for topic_idx, topic_freq_idxs in zip(model.topic_idxs, topics_freq_idxs)
        }

        # print root
        freq_tokens = topic_freq_tokens[parent_idx]
        print(parent_idx, ' '.join(freq_tokens))

    child_idxs = model.tree_idxs[parent_idx]
    depth += 1
    for child_idx in child_idxs:
        freq_tokens = topic_freq_tokens[child_idx]
        print('  '*depth, child_idx, ' '.join(freq_tokens))

        # Only topics that are themselves parents have an entry in tree_idxs.
        if child_idx in model.tree_idxs:
            get_freq_tokens(sess, model, bow_idxs, idx_to_word, topic_freq_tokens=topic_freq_tokens,
                            parent_idx=child_idx, depth=depth, topn=topn)

    return topic_freq_tokens

# bags

## load data

In [34]:
# Load the pickled 'bags' dataset located at config_bags.path_data.
nb_name_base = '0 bags'
config_bags = get_config(nb_name_base)
# Use a context manager so the file handle is closed after loading.
# NOTE: pickle can execute arbitrary code; only load trusted data files.
with open(config_bags.path_data, 'rb') as data_file:
    # The first two elements of the pickled tuple are not used in this notebook.
    _, _, instances_bags, word_to_idx_bags, idx_to_word_bags, bow_idxs_bags = cPickle.load(data_file)

## restore hntm

In [75]:
# Close the session left over from a previous run of a restore cell, if any.
if 'sess' in globals(): sess.close()
# Alternative runs (swap the active load_model line to inspect them):
# sess, model_bags_hntm, config_bags_hntm = load_model(config=config_bags, name_model = 'hntm', nb_name = '1  bags -tree 33 -temp 1 -seed 0', index=-1)
sess, model_bags_hntm, config_bags_hntm = load_model(config=config_bags, name_model = 'hntm', nb_name = '2 bags -tree 33 -temp 10 -seed 0', index=-1)
# sess, model_bags_hntm, config_bags_hntm = load_model(config=config_bags, name_model = 'hntm', nb_name = '3 bags -tree 33 -temp 10 -seed 0 -min', index=-1)
# Read the training log with a context manager so the handle is closed.
with open(model_bags_hntm.config.path_log, 'rb') as log_file:
    log_bags_hntm = cPickle.load(log_file)
display(log_bags_hntm[-10:])
# get_freq_tokens requires the vocabulary mappings; the original call omitted them.
freq_tokens_bags_hntm = get_freq_tokens(sess, model_bags_hntm, bow_idxs_bags, idx_to_word_bags)
coherence_bags_hntm = compute_coherence(freq_tokens_bags_hntm.values(), config_bags.dir_corpus, topns=[5, 10])

INFO:tensorflow:Restoring parameters from model/bags/hntm/bags-tree33-temp10-seed0/model-465000


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,VALID:,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,TEST:,Unnamed: 15_level_0,SPEC:,Unnamed: 17_level_0,Unnamed: 18_level_0,HIER:,Unnamed: 20_level_0
Unnamed: 0_level_1,Time,Ep,Ct,LOSS,PPL,NLL,KL,REG,LOSS,PPL,NLL,KL,REG,LOSS,PPL,1,2,3,CHILD,OTHER
450000,51,901,400,109.79,413,107.52,2.27,0.01,103.06,404,100.78,2.28,0.0,101.08,397,0.33,0.62,0.61,0.24,0.05
455000,59,911,410,109.79,412,107.52,2.27,0.01,103.03,402,100.72,2.31,0.0,101.08,397,0.33,0.62,0.64,0.23,0.06
460000,53,921,420,109.79,412,107.51,2.27,0.01,103.04,403,100.75,2.29,0.0,101.08,397,0.33,0.62,0.61,0.24,0.05
465000,57,931,430,109.78,412,107.51,2.27,0.01,103.0,401,100.68,2.32,0.0,101.07,396,0.33,0.62,0.64,0.23,0.06
470000,52,941,440,109.78,412,107.51,2.27,0.01,103.03,403,100.74,2.29,0.0,101.07,396,0.33,0.62,0.61,0.24,0.05
475000,55,951,450,109.78,412,107.5,2.28,0.01,102.99,402,100.69,2.3,0.0,101.07,396,0.33,0.61,0.64,0.23,0.06
480000,68,961,460,109.78,412,107.5,2.28,0.01,103.01,403,100.72,2.28,0.0,101.07,396,0.33,0.61,0.61,0.23,0.05
485000,63,971,470,109.78,412,107.5,2.28,0.0,103.0,402,100.68,2.32,0.0,101.07,396,0.33,0.61,0.64,0.21,0.06
490000,54,981,480,109.77,412,107.49,2.28,0.0,103.03,404,100.75,2.29,0.0,101.07,396,0.33,0.62,0.62,0.23,0.05
495000,57,991,490,109.77,412,107.49,2.28,0.0,103.0,402,100.69,2.32,0.0,101.07,396,0.33,0.61,0.64,0.22,0.06


0 quality price bought 'm ... time - made buy nice
   1 pocket small inside nice ipad side pockets carry inch strap
     11 carry pockets shoulder compartments straps room strap comfortable back pack
     14 room pocket mouse carry power charger cords phone cord cables
     12 mouse netbook power cord drive usb charger adapter acer sleeve
   4 sleeve protection air pro protect neoprene smell inside snug inch
     42 ; & perfectly perfect love inside hp dell ! big
   5 mac pro perfectly air protect recommend protects book love cover
     52 ! love perfect recommend absolutely ... buy highly awesome loves
   2 cover bottom keyboard top hard screen plastic speck easily scratches
     21 color cover keyboard love picture pink blue perfectly purple ordered
Average Topic Coherence = 0.121
Median Topic Coherence = 0.125


## restore ncrp

In [76]:
# Restore the nCRP baseline for 'bags', show the last 10 log rows and score its topics.
docs_bags, topic_bags, config_bags_ncrp = load_model(config=config_bags, name_model = 'ncrp', nb_name = '0 bags -m ncrp -alp 1 0.5 0.1 -eta 5 -gam 0.01 -epoch 30', index=-1)
# Read the training log with a context manager so the handle is closed.
with open(config_bags_ncrp.path_log, 'rb') as log_file:
    log_bags_ncrp = cPickle.load(log_file)
display(log_bags_ncrp[-10:])
freq_tokens_bags_ncrp = get_freq_tokens_ncrp(topic_bags, idx_to_word_bags, bow_idxs_bags)
coherence_bags_ncrp = compute_coherence(freq_tokens_bags_ncrp.values(), config_bags.dir_corpus, topns=[5, 10])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,VALID:,TEST:,SPEC:,Unnamed: 8_level_0,Unnamed: 9_level_0,HIER:,Unnamed: 11_level_0
Unnamed: 0_level_1,Time,Ep,Ct,PPL,PPL,PPL,1,2,3,CHILD,OTHER
20,188,20,0,381,376,373,0.11,0.38,0.55,0.54,0.44
21,187,21,0,380,377,373,0.11,0.37,0.55,0.52,0.43
22,194,22,0,378,375,368,0.11,0.36,0.54,0.5,0.42
23,190,23,0,373,372,364,0.11,0.36,0.55,0.48,0.38
24,203,24,0,369,369,359,0.11,0.4,0.56,0.56,0.45
25,207,25,0,367,366,356,0.11,0.4,0.55,0.56,0.45
26,207,26,0,365,364,355,0.11,0.41,0.56,0.54,0.44
27,207,27,0,364,364,354,0.11,0.41,0.56,0.52,0.43
28,208,28,0,363,362,355,0.11,0.41,0.56,0.51,0.42
29,207,29,0,361,360,355,0.11,0.42,0.56,0.49,0.4


 0 31943 236572.0 ! ; & nice price quality bought - inside perfect
   0-1 3563 34282.0 ! love color cover price recommend perfect buy ... ordered
     0-1-1 3563 1524.0 cards memory arrived sd easy highly months black card pink
   0-2 9873 89119.0 carry pockets room ! plenty back compartment pocket shoulder strap
     0-2-1 9844 7884.0 cards camera small memory canon lenses lens sd top space
     0-2-2 29 9.0 waterproof recommend snaps portable roomy caught carry-on found fine finger
   0-3 6338 69343.0 cover ! color keyboard love bottom mac pro hard easy
     0-3-1 5115 10211.0 bottom top feet part plastic months corners speck piece cracked
     0-3-2 1223 502.0 dont brown reasonable clean items ordering hands leaves middle material
   0-4 6985 51662.0 sleeve pro air smell cover zipper protection protect inch hard
     0-4-1 1501 2329.0 bottom plastic corners rubber review top months week corner speck
     0-4-2 5484 5874.0 foam protection memory sleeve neoprene netbook zipper cards b

# 20news

## load data

In [79]:
# Load the pickled '20news' dataset located at config_20news.path_data.
nb_name_base = '0 20news'
config_20news = get_config(nb_name_base)
# Use a context manager so the file handle is closed after loading.
# NOTE: pickle can execute arbitrary code; only load trusted data files.
with open(config_20news.path_data, 'rb') as data_file:
    # The first two elements of the pickled tuple are not used in this notebook.
    _, _, instances_20news, word_to_idx_20news, idx_to_word_20news, bow_idxs_20news = cPickle.load(data_file)

## restore hntm

In [87]:
# Close the session left over from a previous run of a restore cell, if any.
if 'sess' in globals(): sess.close()
# index=-2 selects the second-to-last entry of the run's checkpoint list.
sess, model_20news_hntm, config_20news_hntm = load_model(config=config_20news, name_model = 'hntm', nb_name = '1  20news -tree 33 -temp 1 -seed 0', index=-2)
# Alternative runs (swap the active load_model line to inspect them):
# sess, model_20news_hntm, config_20news_hntm = load_model(config=config_20news, name_model = 'hntm', nb_name = '2 20news -tree 33 -temp 10 -seed 0', index=-1)
# sess, model_20news_hntm, config_20news_hntm = load_model(config=config_20news, name_model = 'hntm', nb_name = '3 20news -tree 33 -temp 1 -seed 0 -min', index=-1)
# Read the training log with a context manager so the handle is closed.
with open(model_20news_hntm.config.path_log, 'rb') as log_file:
    log_20news_hntm = cPickle.load(log_file)
display(log_20news_hntm[-10:])
freq_tokens_20news_hntm = get_freq_tokens(sess, model_20news_hntm, bow_idxs_20news, idx_to_word_20news)
coherence_20news_hntm = compute_coherence(freq_tokens_20news_hntm.values(), config_20news.dir_corpus, topns=[5, 10])

INFO:tensorflow:Restoring parameters from model/20news/hntm/20news-tree33-temp1-seed0/model-135000


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,VALID:,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,TEST:,Unnamed: 15_level_0,SPEC:,Unnamed: 17_level_0,Unnamed: 18_level_0,HIER:,Unnamed: 20_level_0
Unnamed: 0_level_1,Time,Ep,Ct,LOSS,PPL,NLL,KL,REG,LOSS,PPL,NLL,KL,REG,LOSS,PPL,1,2,3,CHILD,OTHER
130000,60,742,149,590.12,791,585.52,4.5,0.06,566.38,818,562.0,4.37,0.02,566.11,817,0.11,0.46,0.47,0.35,0.26
135000,71,771,74,590.06,790,585.46,4.51,0.06,566.06,814,561.51,4.46,0.09,566.04,815,0.11,0.47,0.48,0.36,0.28
140000,60,799,174,590.09,790,585.49,4.51,0.06,566.23,818,561.85,4.36,0.01,566.04,815,0.12,0.44,0.48,0.37,0.27
145000,70,828,99,589.96,789,585.36,4.52,0.06,566.12,815,561.61,4.42,0.09,566.04,815,0.12,0.46,0.48,0.36,0.28
150000,66,857,24,589.98,789,585.39,4.52,0.06,566.23,815,561.76,4.41,0.06,566.04,815,0.13,0.47,0.46,0.38,0.29
155000,64,885,124,589.86,788,585.27,4.52,0.06,566.02,814,561.55,4.43,0.04,566.04,815,0.12,0.43,0.48,0.37,0.27
160000,60,914,49,589.88,788,585.29,4.53,0.06,566.19,816,561.76,4.42,0.01,566.04,815,0.13,0.46,0.48,0.34,0.26
165000,70,942,149,589.81,788,585.22,4.53,0.06,566.13,816,561.62,4.44,0.08,566.04,815,0.11,0.46,0.47,0.37,0.28
170000,62,971,74,589.77,787,585.18,4.53,0.06,566.13,814,561.72,4.4,0.01,566.2,815,0.11,0.47,0.47,0.35,0.26
175000,70,999,174,589.78,787,585.19,4.54,0.06,565.91,814,561.38,4.46,0.08,566.2,815,0.12,0.45,0.47,0.37,0.29


0 write article get like one think know go make good
   1 god one say people jesus christian believe think write make
     11 say go people one come see know kill woman tell
     13 game team play player win year season hockey go league
     14 people gun israel state law government write article israeli right
   2 space launch use nasa satellite research new orbit system science
     23 turkish armenian people armenians say stephanopoulos president turkey armenia turks
     21 tax people president say go government year money pay clinton
   5 available include software send list use mail image information file
     53 key use chip one encryption system phone clipper government number
   3 use file window program windows server application run display set
     31 use system drive disk one work computer need chip card
     32 file use encryption device information technology law new government protect
   4 window use file program windows run application display server set
     41 drive 

## restore ncrp

In [93]:
# Restore the nCRP baseline for '20news', show the last 10 log rows and score its topics.
# Alternative run (gam 0.01, checkpoint index -5):
# docs_20news, topic_20news, config_20news_ncrp = load_model(config=config_20news, name_model = 'ncrp', nb_name = '0 20news -m ncrp -alp 1 0.5 0.1 -eta 10 -gam 0.01 -epoch 50', index=-5)
docs_20news, topic_20news, config_20news_ncrp = load_model(config=config_20news, name_model = 'ncrp', nb_name = '0 20news -m ncrp -alp 1 0.5 0.1 -eta 10 -gam 0.001 -epoch 50', index=-1)

# Read the training log with a context manager so the handle is closed.
with open(config_20news_ncrp.path_log, 'rb') as log_file:
    log_20news_ncrp = cPickle.load(log_file)
display(log_20news_ncrp[-10:])
freq_tokens_20news_ncrp = get_freq_tokens_ncrp(topic_20news, idx_to_word_20news, bow_idxs_20news)
coherence_20news_ncrp = compute_coherence(freq_tokens_20news_ncrp.values(), config_20news.dir_corpus, topns=[5, 10])

loading model/20news/ncrp/20news-mncrp-alp10.50.1-eta10-gam0.001-epoch50/model-49...


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,VALID:,TEST:,SPEC:,Unnamed: 8_level_0,Unnamed: 9_level_0,HIER:,Unnamed: 11_level_0
Unnamed: 0_level_1,Time,Ep,Ct,PPL,PPL,PPL,1,2,3,CHILD,OTHER
40,633,40,0,768,784,784,0.06,0.41,0.46,0.66,0.65
41,626,41,0,768,784,783,0.07,0.41,0.46,0.66,0.65
42,501,42,0,768,784,783,0.07,0.41,0.46,0.66,0.64
43,617,43,0,768,784,782,0.07,0.41,0.46,0.66,0.64
44,628,44,0,768,783,782,0.07,0.41,0.46,0.66,0.64
45,628,45,0,767,782,781,0.07,0.41,0.46,0.66,0.64
46,507,46,0,767,783,781,0.07,0.4,0.46,0.66,0.64
47,506,47,0,768,783,781,0.07,0.4,0.46,0.66,0.64
48,525,48,0,767,783,781,0.07,0.41,0.46,0.69,0.68
49,650,49,0,767,782,781,0.07,0.41,0.46,0.69,0.68


 0 11258 367392.0 write one article say know think get people like make
   0-1 3435 179119.0 people say go government one get gun make state think
     0-1-1 899 1941.0 dog bike ride motorcycle car road moon traffic speed screw
     0-1-2 1328 5024.0 jews jewish nazi german history islamic document jew germany islam
     0-1-3 975 1398.0 nsa satellite helmet stone warrant environment per insurance average sp
     0-1-4 45 81.0 gay director leader free bar recall tie ignorance distribute along
     0-1-5 13 60.0 installation secret village island south america north british regular port
     0-1-6 78 48.0 resource switch parent lewis toward refer threat house answer suggestion
     0-1-7 97 93.0 thought cancer discussion flight stage institute charge highly consider hope
   0-2 3160 172764.0 use file get program drive window system write run problem
     0-2-1 3075 1407.0 ted mon university account date apr chicago engineer id satellite
     0-2-2 85 22.0 update allen even manufacturer 