In [6]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
# Export the hash-seed variable and the single-thread settings for
# NumExpr / MKL / OpenMP in one call.
# NOTE(review): PYTHONHASHSEED is read when an interpreter starts, so assigning
# it here presumably only affects child processes, not this kernel -- confirm.
os.environ.update({
    'PYTHONHASHSEED': '0',
    "NUMEXPR_NUM_THREADS": "1",
    "MKL_NUM_THREADS": "1",
    "OMP_NUM_THREADS": "1",
})

import sys
import argparse
import subprocess
import pdb
import time
import random
import _pickle as cPickle
import matplotlib.pyplot as plt

%matplotlib inline

import numpy as np
import pandas as pd
import tensorflow as tf

from data_structure import get_batches
from hntm import HierarchicalNeuralTopicModel
from tree import get_descendant_idxs
from evaluation import validate, get_topic_specialization, get_hierarchical_affinity, print_topic_sample
from coherence import compute_word_count, compute_coherence
from configure import get_config
from ncrp import get_docs

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Set CUDA_VISIBLE_DEVICES to '0' for this process.
os.environ.update(CUDA_VISIBLE_DEVICES='0')

In [16]:
def load_model(config, name_model, nb_name):
    """Restore the most recently checkpointed model of a given notebook run.

    The model directory is ``model/<config.data>/<name_model>/<run>``, where
    ``<run>`` is ``nb_name`` with its first whitespace-separated token dropped
    and the remaining tokens concatenated. The pickled ``checkpoint`` file in
    that directory holds a sequence of checkpoint paths; the last one is used.

    Args:
        config: object exposing a ``data`` attribute (dataset directory name).
            Only used to locate the model directory; the configuration pickled
            next to the checkpoint (``<checkpoint>.config``) is what the model
            is built from.
        name_model: ``'hntm'`` or ``'ncrp'``.
        nb_name: notebook/run name, e.g. ``'1 bags -tree 33 -temp 1 -seed 0'``.

    Returns:
        ``(sess, model)`` for ``'hntm'``: a ``tf.Session`` with the restored
        variables and the ``HierarchicalNeuralTopicModel`` instance.
        ``(test_docs, topic_root)`` for ``'ncrp'``: the two objects unpickled
        from the checkpoint file.

    Raises:
        ValueError: if ``name_model`` is neither ``'hntm'`` nor ``'ncrp'``.
    """
    # NOTE: pickle can execute arbitrary code on load; only point this at
    # checkpoint files you produced yourself.
    dir_model = os.path.join('model', config.data, name_model, ''.join(nb_name.split()[1:]))
    with open(os.path.join(dir_model, 'checkpoint'), 'rb') as f_ckpt:
        ckpt = cPickle.load(f_ckpt)
    path_restore = ckpt[-1]

    # Kept under a separate name so the caller-supplied `config` is not shadowed.
    path_config = path_restore + '.config'
    with open(path_config, 'rb') as f_config:
        saved_config = cPickle.load(f_config)

    if name_model == 'hntm':
        # NOTE(review): the model is built in the current default graph; calling
        # this repeatedly in one kernel may accumulate variables -- confirm
        # whether a graph reset is needed before rebuilding.
        model = HierarchicalNeuralTopicModel(saved_config)
        sess = tf.Session()
        saver = tf.train.Saver()
        saver.restore(sess, path_restore)
        return sess, model

    if name_model == 'ncrp':
        # Pickle data must be read in binary mode; text mode fails on Python 3.
        with open(path_restore, 'rb') as f_model:
            test_docs, topic_root = cPickle.load(f_model)
        return test_docs, topic_root

    raise ValueError("unknown name_model %r; expected 'hntm' or 'ncrp'" % (name_model,))

In [None]:
def load_ncrp(config, nb_name):
    """Load the latest pickled nCRP result for a notebook run.

    Follows the same on-disk layout as the ``'ncrp'`` branch of ``load_model``:
    ``model/<config.data>/ncrp/<run>/checkpoint`` is a pickled sequence of
    checkpoint paths, and the last path is a pickle of
    ``(test_docs, topic_root)``.

    Args:
        config: object exposing a ``data`` attribute (dataset directory name).
        nb_name: notebook/run name; its first whitespace-separated token is
            dropped and the remaining tokens are concatenated to form the
            run directory name.

    Returns:
        ``(test_docs, topic_root)`` as unpickled from the checkpoint file.
    """
    name_model = 'ncrp'
    dir_model = os.path.join('model', config.data, name_model, ''.join(nb_name.split()[1:]))
    # NOTE: pickle can execute arbitrary code on load; only use trusted files.
    with open(os.path.join(dir_model, 'checkpoint'), 'rb') as f_ckpt:
        ckpt = cPickle.load(f_ckpt)
    path_restore = ckpt[-1]
    with open(path_restore, 'rb') as f_model:
        test_docs, topic_root = cPickle.load(f_model)
    return test_docs, topic_root

In [4]:
def print_freq_tokens(sess, model, topic_freq_tokens=None, parent_idx=0, depth=0, topn=10):
    """Print the topic tree with the top tokens of every topic, depth-first.

    Relies on the notebook-level globals ``bow_idxs`` (indexed with the ranked
    column indices) and ``idx_to_word`` (maps those ids to token strings).

    Args:
        sess: object whose ``run`` evaluates ``model.topic_bow``.
        model: object exposing ``topic_bow``, ``topic_idxs`` and ``tree_idxs``
            (``tree_idxs[parent]`` yields the child topic indices; topics
            without children are absent from it).
        topic_freq_tokens: mapping topic index -> list of tokens. Rebuilt on
            the top-level call (``depth == 0``) and built on demand when
            missing; passed down unchanged on recursive calls.
        parent_idx: index of the topic whose children are printed.
        depth: current indentation level; ``0`` marks the top-level call,
            which also prints the ``parent_idx`` line itself.
        topn: number of highest-scoring tokens kept per topic.

    Returns:
        The mapping topic index -> list of its ``topn`` tokens.
    """
    if depth == 0 or topic_freq_tokens is None:
        # Column indices sorted by descending score, truncated to `topn`.
        topics_freq_indices = np.argsort(sess.run(model.topic_bow), 1)[:, ::-1][:, :topn]
        topics_freq_idxs = bow_idxs[topics_freq_indices]
        topic_freq_tokens = {
            topic_idx: [idx_to_word[idx] for idx in topic_freq_idxs]
            for topic_idx, topic_freq_idxs in zip(model.topic_idxs, topics_freq_idxs)
        }

    if depth == 0:
        # print root
        print(parent_idx, ' '.join(topic_freq_tokens[parent_idx]))

    depth += 1
    for child_idx in model.tree_idxs[parent_idx]:
        print('  '*depth, child_idx, ' '.join(topic_freq_tokens[child_idx]))

        # Only topics that have children appear as keys of tree_idxs.
        if child_idx in model.tree_idxs:
            print_freq_tokens(sess, model, topic_freq_tokens=topic_freq_tokens,
                              parent_idx=child_idx, depth=depth, topn=topn)

    return topic_freq_tokens

# bags

## load data

In [8]:
# Load the pickled 'bags' dataset and batch its instances.
nb_name_base = '0 bags'
config_bags = get_config(nb_name_base)
# Use a context manager so the data file handle is closed after unpickling.
with open(config_bags.path_data, 'rb') as f_data:
    _, _, instances_bags, word_to_idx, idx_to_word, bow_idxs = cPickle.load(f_data)
bags_batches = get_batches(instances_bags, batch_size=config_bags.batch_size)

## restore hntm

In [14]:
# Close a session left over from a previous run of this cell. The hasattr guard
# keeps the cell re-runnable even if `sess` was rebound to a non-session object.
if hasattr(globals().get('sess'), 'close'): sess.close()
sess, model_bags = load_model(config=config_bags, name_model = 'hntm', nb_name = '1  bags -tree 33 -temp 1 -seed 0')
# Read the training log through a context manager so the file handle is closed.
with open(model_bags.config.path_log, 'rb') as f_log:
    log_bags_hntm = cPickle.load(f_log)
log_bags_hntm[-10:]

INFO:tensorflow:Restoring parameters from model/bags/hntm/bags-tree33-temp1-seed0/model-475000


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,TRAIN:,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Unnamed: 8_level_0,VALID:,Unnamed: 10_level_0,Unnamed: 11_level_0,Unnamed: 12_level_0,Unnamed: 13_level_0,TEST:,Unnamed: 15_level_0,SPEC:,Unnamed: 17_level_0,Unnamed: 18_level_0,HIER:,Unnamed: 20_level_0
Unnamed: 0_level_1,Time,Ep,Ct,LOSS,PPL,NLL,KL,REG,LOSS,PPL,NLL,KL,REG,LOSS,PPL,1,2,3,CHILD,OTHER
450000,54,901,400,109.66,406,107.24,2.41,0.0,103.0,402,100.66,2.34,0.0,101.2,402,0.27,0.61,0.61,0.14,0.09
455000,59,911,410,109.66,406,107.23,2.42,0.0,102.93,400,100.53,2.4,0.0,101.2,402,0.27,0.61,0.58,0.16,0.11
460000,52,921,420,109.66,406,107.23,2.42,0.0,103.0,402,100.65,2.35,0.0,101.2,402,0.27,0.6,0.6,0.15,0.09
465000,58,931,430,109.66,406,107.23,2.42,0.0,102.86,398,100.46,2.4,0.0,101.2,402,0.26,0.61,0.58,0.15,0.11
470000,53,941,440,109.65,406,107.22,2.42,0.0,102.92,400,100.57,2.36,0.0,101.2,402,0.27,0.61,0.61,0.14,0.09
475000,58,951,450,109.65,406,107.22,2.42,0.0,102.85,397,100.45,2.4,0.0,101.14,400,0.25,0.61,0.58,0.15,0.1
480000,50,961,460,109.65,406,107.21,2.42,0.0,102.91,400,100.56,2.35,0.0,101.14,400,0.26,0.6,0.61,0.13,0.09
485000,59,971,470,109.65,406,107.21,2.43,0.0,102.84,398,100.44,2.41,0.0,101.14,400,0.26,0.61,0.58,0.14,0.1
490000,53,981,480,109.65,406,107.21,2.43,0.0,102.93,401,100.57,2.36,0.0,101.14,400,0.26,0.61,0.61,0.13,0.09
495000,60,991,490,109.65,406,107.2,2.43,0.0,102.87,398,100.46,2.41,0.0,101.14,400,0.25,0.6,0.58,0.16,0.11


In [15]:
# Print the topic tree, then score the per-topic token lists against the corpus.
freq_tokens_bags_nhtm = print_freq_tokens(sess=sess, model=model_bags)
coherence_bags_nhtm = compute_coherence(
    freq_tokens_bags_nhtm.values(),
    config_bags.dir_corpus,
    topns=[5, 10],
)

0 carry pockets ! room back nice love work perfect lot
   1 pocket mouse power netbook small charger inch room cord tablet
     11 & ; pro size quality bought price ... big perfectly
     12 amazon item price $ ordered ... quality buy received bought
   2 sleeve inside protection ipad padding zipper neoprene soft protect nice
     23 ! love color perfect recommend ... cute absolutely awesome compliments
     21 ! pro perfectly love air mac smell perfect recommend color
   4 protection protect - air : scratches hard bit nice pro
     41 cover color apple keyboard love pro mac easy pink purple
     43 bottom cover mac top air rubberized feel rubber easily feet
     42 bottom top cracked corners cover plastic speck piece months part
   5 zipper strap - : open flap side zippers shoulder velcro
     51 months broke years year bought started weeks ago quality handle
Average Topic Coherence = 0.095
Median Topic Coherence = 0.078


## restore ncrp

In [None]:
# With name_model='ncrp', load_model returns (test_docs, topic_root), not a
# (session, model) pair. Bind them to their own names so the live TF session
# `sess` and the HNTM `model_bags` are not overwritten.
test_docs_bags_ncrp, topic_root_bags_ncrp = load_model(config=config_bags, name_model = 'ncrp', nb_name = '0 bags -m ncrp -alp 10 5 1 -eta 1 -gam 0.01')