In [1]:
import os
import pdb
import _pickle as cPickle
import time

import numpy as np
import tensorflow as tf

from collections import defaultdict, Counter
from scipy.special import gammaln

import warnings
warnings.filterwarnings('error')

# data 

In [2]:
def del_all_flags(FLAGS):
    flags_dict = FLAGS._flags()    
    keys_list = [keys for keys in flags_dict]    
    for keys in keys_list:
        FLAGS.__delattr__(keys)

del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

flags.DEFINE_string('data_path', 'data/bags/instances.pkl', 'path of data')

flags.DEFINE_integer('n_depth', 3, 'depth of tree')

flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

In [3]:
instances_train, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(open(config.data_path,'rb'))
docs_bow = [instance.bow for instance in instances_train]
docs_raw = [[[bow_index]*int(doc_bow[bow_index]) for bow_index in np.where(doc_bow > 0)[0]] for doc_bow in docs_bow]
docs_words = [[idx for idxs in doc for idx in idxs] for doc in docs_raw]
words = [word for doc_words in docs_words for word in doc_words]

Exception ignored in: <_io.FileIO name='data/bags/instances.pkl' mode='rb' closefd=True>


In [4]:
test_docs_bow = [instance.bow for instance in instances_test]
test_docs_raw = [[[bow_index]*int(test_doc_bow[bow_index]) for bow_index in np.where(test_doc_bow > 0)[0]] for test_doc_bow in test_docs_bow]
test_docs_words = [[idx for idxs in doc for idx in idxs] for doc in test_docs_raw][:5000]

In [5]:
n_doc = len(docs_words)
n_vocab = len(np.unique(words))
n_words = len(words)
assert n_vocab == len(bow_idxs)
n_doc, n_vocab, n_words

(31943, 1035, 568401)

# method

## assign docs to tree

In [6]:
class Topic:
    def __init__(self, idx, sibling_idx, parent, depth, n_doc, n_vocab):
        self.idx = idx
        self.sibling_idx = sibling_idx
        self.parent = parent
        self.children = []
        self.depth = depth
        self.cnt_doc = 0
        self.n_doc = n_doc
        self.n_vocab = n_vocab
        self.cnt_words = np.zeros(n_vocab) # Number of Words over Documents
        self.set_prob_words()
        self.verbose = verbose
    
    def sample_child(self, doc, train=True):
        s_child_prior = self.get_s_child_prior(gam)
        s_child_likelihood = self.get_s_child_likelihood(doc, eta)
        p_child = np.array(s_child_prior * s_child_likelihood) / np.sum(s_child_prior * s_child_likelihood)
        
        child_index = np.random.multinomial(1, p_child).argmax()
        if verbose: print('Depth: ', self.depth, 'p_child: ', p_child, 'selected:', child_index)
        
        if child_index < len(self.children):
            child = self.children[child_index]
        else:
            child = self.get_new_child()
            if train: self.children += [child]
        return child
    
    def init_sample_child(self, train=True):
        s_child_prior = self.get_s_child_prior(gam)
        p_child = np.array(s_child_prior) / np.sum(s_child_prior)
        
        child_index = np.random.multinomial(1, p_child).argmax()
        if verbose: print('Depth: ', self.depth, 'p_child: ', p_child, 'selected:', child_index)

        if child_index < len(self.children):
            child = self.children[child_index]
        else:
            child = self.get_new_child()
            if train: self.children += [child]
        return child
    
    def get_probs_child(self, doc):
        s_child_prior = self.get_s_child_prior(gam)
        s_child_likelihood = self.get_s_child_likelihood(doc, eta)
        p_child = np.array(s_child_prior * s_child_likelihood) / np.sum(s_child_prior * s_child_likelihood)
        return p_child
    
    def get_s_child_prior(self, gam):
        s_child_prior = [child.cnt_doc for child in self.children]
        s_child_prior += [gam]
        return s_child_prior
    
    def get_s_child_likelihood(self, doc, eta):
        if len(self.children) > 0:
            children_cnt_words = np.concatenate([np.array([child.cnt_words for child in self.children]), np.zeros([1, self.n_vocab])], 0) # (Children+1) x Vocabulary
        else:
            children_cnt_words = np.zeros([1, self.n_vocab]) # (Children+1) x Vocabulary
        
        cnt_words_doc = doc.cnt_words[None, :] # 1 x Vocabulary

        logits_likelihood = gammaln(np.sum(children_cnt_words, -1) + n_vocab*eta) \
                            - np.sum(gammaln(children_cnt_words + eta), -1) \
                            - gammaln(np.sum(children_cnt_words + cnt_words_doc, -1) + n_vocab*eta) \
                            + np.sum(gammaln(children_cnt_words + cnt_words_doc + eta), -1)
        s_child_likelihood = np.exp(logits_likelihood)
        return s_child_likelihood
    
    def get_new_child(self):
        sibling_idx = max([child.sibling_idx for child in self.children]) + 1 if len(self.children) > 0 else 1
        idx = self.idx + '-' + str(sibling_idx)
        depth = self.depth+1
        child = Topic(idx=idx, sibling_idx=sibling_idx, parent=self, depth=depth, n_doc=self.n_doc, n_vocab=self.n_vocab)        
        return child
        
    def get_children(self):
        child = self.get_new_child()
        children = self.children + [child]
        return children
    
    def delete_topic(self):
        self.parent.children.remove(self)
        
    def set_prob_words(self):
        cnt_words = self.cnt_words + eta
        self.prob_words = cnt_words / np.sum(cnt_words)

In [7]:
class Doc:
    def __init__(self, idx, words, bow, n_depth):
        self.idx = idx
        self.words = words
        self.cnt_words = bow
        assert len(words) == np.sum(bow)
        
        self.topics = [] # Depth
        self.word_depths = [] # Word Indices
        self.depth_cnt_words = np.zeros([n_depth, n_vocab])
                
    def get_probs_depth(self, word_idx):
        s_docs = np.sum(self.depth_cnt_words, -1) + alpha # Depth
        s_words = np.array([topic.cnt_words[word_idx] for topic in self.topics]) + eta # Depth
        z_words = np.array([np.sum(topic.cnt_words) for topic in self.topics]) + n_vocab*eta # Depth
        assert s_docs.shape == s_words.shape == z_words.shape

        s_depths = s_docs*s_words/z_words
        p_depths = s_depths/np.sum(s_depths) # Depth
        return p_depths
    
    def sample_depth(self, word_idx):
        prob_depths = self.get_probs_depth(word_idx)
        word_depth = np.argmax(np.random.multinomial(1, prob_depths))
        return word_depth

## sample doc path

$$p({\bf c}_{m}\hspace{0.5ex}|\hspace{0.5ex}{\bf w}, {\bf c}_{-m}, {\bf z})\propto p({\bf w}_{m}\hspace{0.5ex}|\hspace{0.5ex}{\bf c}, {\bf w}_{-m}, {\bf z})\cdot p({\bf c}_{m}\hspace{0.5ex}|\hspace{0.5ex}{\bf c}_{-m})$$

$$p({\bf w}_{m}\hspace{0.5ex}|\hspace{0.5ex}{\bf c}, {\bf w}_{-m}, {\bf z})=\prod_{\ell=1}^{L}\left(\frac{\Gamma(n_{c_{m,\ell},-m}^{(\cdot)}+W\eta)}{\prod_{w}\Gamma(n_{c_{m,\ell},-m}^{(w)}+\eta)}\frac{\prod_{w}\Gamma(n_{c_{m,\ell},-m}^{(w)}+n_{c_{m,\ell},m}^{(w)}+\eta)}{\Gamma(n_{c_{m,\ell},-m}^{(\cdot)}+n_{c_{m,\ell},m}^{(\cdot)}+W\eta)}\right)$$

In [8]:
def init_doc_topics(docs, topic_root, train=True):
    for doc in docs:
        topic = topic_root
        doc.topics = [topic]
        if train: topic.cnt_doc += 1 # increment count of docs

        for depth in range(1, n_depth):
            topic = topic.init_sample_child(train=train)
            doc.topics += [topic]
            if train: topic.cnt_doc += 1 # increment count of docs

In [9]:
def sample_doc_topics(docs, topic_root, train=True):
    for doc in docs:
        if train:
            for depth in range(1, n_depth):
                topic = doc.topics[depth]
                topic.cnt_doc -= 1 # decrement count of docs
                assert topic.cnt_doc >= 0
                topic.cnt_words -= doc.depth_cnt_words[depth] # decrement count of words
                assert np.min(topic.cnt_words) >= 0

                if topic.cnt_doc == 0: 
                    topic.delete_topic()
                    assert np.sum(topic.cnt_words) == 0

        topic = topic_root
        doc.topics = [topic]
        for depth in range(1, n_depth):
            topic = topic.sample_child(doc, train=train)
            doc.topics += [topic]
            if train: topic.cnt_doc += 1 # increment count of docs
            if train: topic.cnt_words += doc.depth_cnt_words[depth] # increment count of words

## assign words to topics

\begin{align*}
p(z_{i}=j\hspace{0.5ex}|\hspace{0.5ex}{\bf z}_{-i},{\bf w})\propto (n_{-i,j}^{(d_{i})}+\alpha)\frac{n_{-i,j}^{(w_{i})}+\eta}{n_{-i,j}^{(\cdot)}+W\eta}
\end{align*}

In [10]:
def init_word_topics(docs, train=True):
    for doc in docs:
        if doc.idx % 10000 == 0: print(doc.idx, end=' ')
        for word_index, word_idx in enumerate(doc.words):
            # sample depth of word
            new_depth = doc.sample_depth(word_idx)
            new_topic = doc.topics[new_depth]
            
            # increment count of words
            doc.depth_cnt_words[new_depth, word_idx] += 1
            if train: new_topic.cnt_words[word_idx] += 1
            doc.word_depths.append(new_depth) # for reference when sampling
            
        assert len(doc.word_depths) == len(doc.words) == np.sum(doc.depth_cnt_words)

In [11]:
def sample_word_topics(docs, train=True):
    for doc in docs:
        if doc.idx % 10000 == 0: print(doc.idx, end=' ')
        for word_index, word_idx in enumerate(doc.words):
            # refer depth of word
            old_depth = doc.word_depths[word_index]
            old_topic = doc.topics[old_depth]
            
            # decrement count of words
            doc.depth_cnt_words[old_depth, word_idx] -= 1
            if train: old_topic.cnt_words[word_idx] -= 1            
            
            # sample depth of word
            new_depth = doc.sample_depth(word_idx)
            new_topic = doc.topics[new_depth]
            
            # increment count of words
            doc.depth_cnt_words[new_depth, word_idx] += 1
            if train: new_topic.cnt_words[word_idx] += 1
            doc.word_depths[word_index] = new_depth # for sample
            
        assert len(doc.word_depths) == len(doc.words) == np.sum(doc.depth_cnt_words)

## check

In [12]:
def recur_cnt_words(topic):
    cnt_words = np.sum(topic.cnt_words)
    for child in topic.children:
        cnt_words += recur_cnt_words(child)
    return cnt_words
    
def assert_sum_cnt_words(topic_root):
    sum_cnt_words = recur_cnt_words(topic_root)
    assert sum_cnt_words == sum([len(doc.words) for doc in docs])
    
def nearly_equal(val, thre):
    return (val > thre-1e-5) and (val < thre+1e-5)

### init

In [13]:
alpha = np.array([10., 5., 1.])
gam = 0.01
eta = 1.
n_depth = 3
verbose = False

In [14]:
n_sample = 100
docs = [Doc(idx=doc_idx, words=doc_words, bow=doc_bow, n_depth=config.n_depth) for doc_idx, (doc_words, doc_bow) in enumerate(zip(docs_words, docs_bow))]
topic_root = Topic(idx='0', sibling_idx=0, parent=None, depth=0, n_doc=n_doc, n_vocab=n_vocab)

### train

In [24]:
def init_train(docs, topic_root):
    init_doc_topics(docs=docs, topic_root=topic_root)
    init_word_topics(docs=docs)
    assert_sum_cnt_words(topic_root)

In [25]:
def sample_train(docs, topic_root):
    sample_doc_topics(docs=docs, topic_root=topic_root)
    sample_word_topics(docs=docs)
    assert_sum_cnt_words(topic_root)

### test

In [26]:
def eval_perplexity(docs, topic_root):
    def set_prob_words(topic):
        topic.set_prob_words()
        for topic_child in topic.children:
            set_prob_words(topic_child)
            
    # set Probabilty of Words
    set_prob_words(topic_root)
    
    logit_docs, n_words = 0, 0
    for doc in docs:
        # Path Probability for each document
        topic = topic_root
        probs_paths= [{topic: 1.}]
        for depth in range(1, n_depth):
            probs_path = {}
            for topic, prob_path in probs_paths[-1].items():
                topics_child = topic.get_children()
                probs_child = topic.get_probs_child(doc)
                probs_path_child = prob_path * probs_child
                for topic_child, prob_path_child in zip(topics_child, probs_path_child):
                    probs_path[topic_child] = prob_path_child
            probs_paths.append(probs_path)    
            
        assert nearly_equal(np.sum([sum(probs_path.values()) for probs_path in probs_paths]), n_depth)        

        # Depth Probability for Each Word
        probs_depths = []
        for word_index, word_idx in enumerate(doc.words):
            probs_depth = doc.get_probs_depth(word_idx)
            probs_depths.append(probs_depth)
            
        assert nearly_equal(np.sum(probs_depths), len(doc.words))
    
        # Likelihood of Doc
        assert len(probs_depths) == len(doc.words)
        logit_doc = 0
        for prob_depths, word_idx in zip(probs_depths, doc.words):
#             prob_topics, prob_word_topics = [], []
            prob_word = 0
            for prob_paths, prob_depth in zip(probs_paths, prob_depths):
                for topic, prob_path in prob_paths.items():
                    prob_topic = prob_path * prob_depth # scalar
                    prob_word_topic = topic.prob_words[word_idx] # scalar
#                     prob_topics.append(prob_topic)
#                     prob_word_topics.append(prob_word_topic)
                    prob_word += prob_topic * prob_word_topic
            logit_word = np.log(prob_word)
            logit_doc += logit_word
        logit_docs += logit_doc
        n_words += len(doc.words)
#         assert nearly_equal(sum(prob_topics), 1.)
        
    perplexity = np.exp(-logit_docs/n_words)
    return perplexity

In [27]:
def init_test(test_docs, topic_root):
    init_doc_topics(docs=test_docs, topic_root=topic_root, train=False)
    init_word_topics(docs=test_docs, train=False)

In [28]:
def sample_test(test_docs, topic_root):
    sample_doc_topics(docs=test_docs, topic_root=topic_root, train=False)
    sample_word_topics(docs=test_docs, train=False)
    assert_sum_cnt_words(topic_root)

### print tree

In [32]:
def print_child_idxs(topic):
    topic_freq_words = [idx_to_word[bow_idxs[bow_index]] for bow_index in np.argsort(topic.cnt_words)[::-1][:10]]
    print('  '*topic.depth, topic.idx, ':', [child.idx for child in topic.children], topic.cnt_doc, np.sum(topic.cnt_words), topic_freq_words)
    for topic in topic.children:
        print_child_idxs(topic)
print_child_idxs(topic_root)

 0 : ['0-1', '0-2', '0-3', '0-4', '0-5', '0-6', '0-7', '0-8', '0-9', '0-10', '0-11', '0-12', '0-13', '0-14', '0-15', '0-16', '0-17', '0-18', '0-19', '0-20', '0-21', '0-22', '0-23', '0-24', '0-25', '0-26', '0-27', '0-28', '0-29', '0-30', '0-31', '0-32', '0-33', '0-34', '0-35', '0-36', '0-37', '0-38', '0-39', '0-40', '0-41', '0-42', '0-43', '0-44', '0-45', '0-46', '0-47', '0-48', '0-49', '0-50', '0-51', '0-52', '0-53', '0-54', '0-55', '0-56', '0-57', '0-58', '0-59', '0-60', '0-61', '0-62', '0-63', '0-64'] 31943 261116.0 ['!', 'nice', 'bought', 'price', 'quality', 'love', "'m", 'made', 'perfect', 'recommend']
   0-1 : ['0-1-4', '0-1-6'] 356 2940.0 ['color', 'picture', 'blue', 'pink', 'green', 'purple', 'black', 'shown', 'received', 'red']
     0-1-4 : [] 222 141.0 ['packaging', 'fingerprints', 'knew', 'weird', 'expecting', '-', 'cut', 'wont', 'spot', 'correctly']
     0-1-6 : [] 134 90.0 ['darker', 'package', 'version', 'file', 'tend', 'tight', 'blue', 'weird', 'important', 'picked']
   0

## run

In [30]:
n_sample = 100
alpha = np.array([10., 5., 1.])
gam = 0.01
eta = 1.
n_depth = 3
verbose = False
topic_root = Topic(idx='0', sibling_idx=0, parent=None, depth=0, n_doc=n_doc, n_vocab=n_vocab)
docs = [Doc(idx=doc_idx, words=doc_words, bow=doc_bow, n_depth=config.n_depth) for doc_idx, (doc_words, doc_bow) in enumerate(zip(docs_words, docs_bow))]
test_docs = [Doc(idx=doc_idx, words=doc_words, bow=doc_bow, n_depth=config.n_depth) for doc_idx, (doc_words, doc_bow) in enumerate(zip(test_docs_words, test_docs_bow))]

In [31]:
for i in range(n_sample):
    if i == 0:
        init_train(docs, topic_root)
        init_test(test_docs, topic_root)        
        assert_sum_cnt_words(topic_root)
    else:
        sample_train(docs, topic_root)
        sample_test(test_docs, topic_root)                
        assert_sum_cnt_words(topic_root)
        
    perplexity = eval_perplexity(test_docs, topic_root)
    print('Perplexity: %.2f' % perplexity)

0 10000 20000 30000 0 Perplexity: 384.68
0 10000 20000 30000 0 Perplexity: 333.69
0 10000 20000 30000 0 Perplexity: 313.95
0 10000 20000 30000 0 Perplexity: 302.79
0 10000 20000 30000 0 Perplexity: 302.68
0 10000 20000 30000 0 Perplexity: 299.52
0 10000 20000 30000 0 Perplexity: 297.26
0 10000 20000 30000 0 Perplexity: 300.96
0 10000 20000 30000 0 Perplexity: 301.02
0 10000 20000 30000 0 Perplexity: 296.26
0 10000 20000 30000 0 Perplexity: 288.02
0 10000 20000 30000 0 Perplexity: 283.37
0 10000 20000 30000 0 Perplexity: 291.71
0 10000 20000 30000 0 Perplexity: 284.27
0 10000 20000 30000 0 Perplexity: 283.77
0 10000 20000 30000 0 Perplexity: 282.31
0 10000 20000 30000 0 Perplexity: 279.27
0 10000 20000 30000 0 Perplexity: 278.99
0 10000 20000 30000 0 Perplexity: 274.78
0 10000 20000 30000 0 Perplexity: 284.89
0 10000 20000 30000 0 Perplexity: 286.40
0 10000 20000 30000 0 Perplexity: 278.99
0 10000 20000 30000 0 Perplexity: 273.92
0 10000 20000 30000 0 Perplexity: 273.73
0 10000 20000 30

## get coherence score

In [33]:
def add_freq_tokens(topic, topics_freq_tokens):
    topic_freq_tokens = ' '.join([idx_to_word[bow_idxs[bow_index]] for bow_index in np.argsort(topic.cnt_words)[::-1][:10]])
    topics_freq_tokens.append(topic_freq_tokens)
    for child in topic.children:
        add_freq_tokens(child, topics_freq_tokens)

topics_freq_tokens = []
add_freq_tokens(topic_root, topics_freq_tokens)

In [34]:
path_coherence = 'npmi/data/bags/cgs.txt'
with open(path_coherence, 'w') as f:
    f.write('\n'.join(topics_freq_tokens))

## get specialization score

In [37]:
norm_bow = np.sum([instance.bow for instance in instances_train], 0)
norm_vec = norm_bow / np.linalg.norm(norm_bow)

In [44]:
def add_spec(topic, depth_specs=None):
    if depth_specs is None: depth_specs = defaultdict(list)
    topic_vec = topic.prob_words / np.linalg.norm(topic.prob_words)
    topic_spec = 1 - topic_vec.dot(norm_vec)
    depth_specs[topic.depth].append(topic_spec)
    for child in topic.children:
        depth_specs = add_spec(child, depth_specs)
    return depth_specs

depth_specs = add_spec(topic_root)

In [51]:
depth_specs[0]

[0.0749372385946312]

In [50]:
for depth, specs in depth_specs.items():
    spec = np.mean(specs)
    print(depth, spec)

0 0.0749372385946312
1 0.6226005207697187
2 0.49158613401802365
3 nan


  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [52]:
def add_arc(topic, depth_arcs=None):
    if depth_arcs is None: depth_arcs = defaultdict(list)
    topic_vec = topic.prob_words / np.linalg.norm(topic.prob_words)
    topic_arc = np.arccos(topic_vec.dot(norm_vec))
    depth_arcs[topic.depth].append(topic_arc)
    for child in topic.children:
        depth_arcs = add_arc(child, depth_arcs)
    return depth_arcs

depth_arcs = add_arc(topic_root)

In [54]:
for depth, arcs in depth_arcs.items():
    spec = 1 - np.cos(np.mean(arcs))
    print(depth, spec)

0 0.0749372385946312
1 0.6196820668592284
2 0.4913526867429402
