In [1]:
%load_ext autoreload
%autoreload
from IPython.display import clear_output

import os
import pdb
import _pickle as cPickle
import time
import subprocess
import glob

import random
import numpy as np
import pandas as pd
import tensorflow as tf

from collections import defaultdict, Counter
from configure import get_config

#coding: utf-8
import os
import pdb
import time
from collections import defaultdict, Counter

import numpy as np
from scipy.special import gammaln

# load data

In [2]:
nb_name = '0 bags -m rcrp'

In [3]:
config = get_config(nb_name)
np.random.seed(config.seed)
random.seed(config.seed)

In [4]:
config.alp = 1
config.gam = 0.1

In [5]:
instances_train, instances_dev, instances_test, word_to_idx, idx_to_word, bow_idxs = cPickle.load(open(config.path_data,'rb'))

In [6]:
config.n_doc = len(instances_train)
config.n_vocab = len(bow_idxs)
config.n_doc, config.n_vocab

(31943, 1035)

In [7]:
config.alp, config.gam, config.eta

(1, 0.1, 1)

# model

In [8]:
class Topic:
    def __init__(self, idx, sibling_idx, parent, depth, config):
        self.idx = idx
        self.sibling_idx = sibling_idx
        self.parent = parent
        self.children = []
        self.depth = depth
        
        self.cnt_doc_topic = 0
        self.cum_doc_topic = 0
        
        self.cnt_words = np.zeros(config.n_vocab)
        self.cum_words = np.zeros(config.n_vocab)
        
        self.config=config
    
    def sample_child(self, cnt_words=None, init=False, train=True):
        if self.cnt_doc_topic == 0:
            return self
        else:
            probs_child = self.get_probs_child(cnt_words, init=init)
            child_index = np.random.multinomial(1, probs_child).argmax()

            if child_index < len(self.children):
                child = self.children[child_index]
                return child.sample_child(init, train)
            elif child_index == len(self.children):
                return self
            else:
                assert child_index == len(self.children) + 1
                child = self.get_new_child(train)
                return child.sample_child(init, train)
        
    def get_probs_child(self, cnt_words, init=False, train=True):
        if init:
            logits_child = self.get_logits_child_prior()
        else:
            logits_child_prior = self.get_logits_child_prior()
            logits_child_likelihood = self.get_logits_child_likelihood(cnt_words)
            logits_child = logits_child_prior + logits_child_likelihood

        logits_child -= np.max(logits_child)
        s_child = np.exp(logits_child)
        if np.sum(s_child) > 0:
            probs_child = s_child/np.sum(s_child)
            probs_child = probs_child.astype(np.float64)
        else:
            probs_child = np.zeros_like(logits_child, dtype=np.float64)
            probs_child[np.argmax(s_child)] = 1.
            
        return probs_child
        
    def get_logits_child_prior(self):
        s_child_prior = [child.cum_doc_topic for child in self.children]
        s_child_prior += [self.cnt_doc_topic]
        s_child_prior += [self.config.gam**(self.depth+1)]

        logits_child_prior = np.log(s_child_prior)
        return logits_child_prior
    
    def get_logits_child_likelihood(self, cnt_words_doc_topic):
        if len(self.children) > 0:
            children_cnt_words = np.array([child.cum_words for child in self.children]) # (Children) x Vocabulary
            children_cnt_words = np.concatenate([children_cnt_words, self.cnt_words[None, :], np.zeros([1, self.config.n_vocab])], 0) # (Children+Self+NewChildren) x Vocabulary
        else:
            children_cnt_words = np.concatenate([self.cnt_words[None, :], np.zeros([1, self.config.n_vocab])], 0) # (Self+NewChildren) x Vocabulary

        logits_child_likelihood = gammaln(np.sum(children_cnt_words, -1) + self.config.n_vocab*self.config.eta**(self.depth+1)) \
                            - np.sum(gammaln(children_cnt_words + self.config.eta**(self.depth+1)), -1) \
                            - gammaln(np.sum(children_cnt_words + cnt_words_doc_topic, -1) + self.config.n_vocab*self.config.eta**(self.depth+1)) \
                            + np.sum(gammaln(children_cnt_words + cnt_words_doc_topic + self.config.eta**(self.depth+1)), -1)
        return logits_child_likelihood
    
    def get_new_child(self, train=True):
        sibling_idx = max([child.sibling_idx for child in self.children]) + 1 if len(self.children) > 0 else 1
        idx = self.idx + '-' + str(sibling_idx)
        depth = self.depth+1
        child = Topic(idx=idx, sibling_idx=sibling_idx, parent=self, depth=depth, config=self.config)        
        if train: self.children += [child]
        return child    
    
    def increment_cnt(self, cnt_words):
        def increment_cum(topic, cnt_words):
            topic.cum_doc_topic += 1
            topic.cum_words += cnt_words
            if topic.parent is not None: increment_cum(topic.parent, cnt_words=cnt_words)
        
        self.cnt_doc_topic += 1
        self.cnt_words += cnt_words
        increment_cum(self, cnt_words=cnt_words)
            
    def decrement_cnt(self, cnt_words):
        def decrement_cum(topic, cnt_words):
            topic.cum_doc_topic -= 1
            topic.cum_words -= cnt_words
            if topic.parent is not None: decrement_cum(topic.parent, cnt_words=cnt_words)
                
        self.cnt_doc_topic -= 1
        self.cnt_words -= cnt_words
        decrement_cum(self, cnt_words=cnt_words)
            
    def increment_cnt_words(self, word_idx):
        def increment_cum_words(topic, word_idx):
            topic.cum_words[word_idx] += 1
            if topic.parent is not None: increment_cum_words(topic.parent, word_idx=word_idx)
                
        self.cnt_words[word_idx] += 1
        increment_cum_words(self, word_idx=word_idx)
        
    def decrement_cnt_words(self, word_idx):
        def decrement_cum_words(topic, word_idx):
            topic.cum_words[word_idx] -= 1
            if topic.parent is not None: decrement_cum_words(topic.parent, word_idx=word_idx)
                
        self.cnt_words[word_idx] -= 1
        decrement_cum_words(self, word_idx=word_idx)
        
    def increment_cnt_doc(self):
        def increment_cum_doc(topic):
            topic.cum_doc_topic += 1
            if topic.parent is not None: increment_cum_doc(topic.parent)
        
        self.cnt_doc_topic += 1
        increment_cum_doc(self)
        
    def delete_topic(self):
        self.parent.children.remove(self)        

In [9]:
class Doc:
    def __init__(self, idx, words, bow, config):
        self.idx = idx
        self.words = words
        self.cnt_words = bow
        self.config = config
        assert len(words) == np.sum(bow)
        
        self.index_cnt_words = [] # n_indices_topic x n_vocab
        self.word_indices = [] # n_words
        self.topics = [] # n_indices_topic

    def sample_index(self, topic_root, init=False, train=True):
        probs_index = self.get_probs_index(topic_root, init=init)
        index = np.random.multinomial(1, probs_index).argmax()
        return index
    
    def get_probs_index(self, topic_root, init=False):
        if init:
            logits_index = self.get_logits_index_prior()
        else:
            logits_index_prior = self.get_logits_index_prior()
            logits_index_likelihood = self.get_logits_index_likelihood(topic_root)
            logits_index = logits_index_prior + logits_index_likelihood
        
        logits_index -= np.max(logits_index)
        s_index = np.exp(logits_index)
        
        if np.sum(s_index) > 0:
            probs_index = s_index/np.sum(s_index)
            probs_index = probs_index.astype(np.float64)
        else:
            probs_index = np.zeros_like(logits_index, dtype=np.float64)
            probs_index[np.argmax(s_index)] = 1.
        
        return probs_index
                
    def get_logits_index_prior(self):
        if len(self.index_cnt_words) == 0:
            s_index_prior = [self.config.alp]
        else:
            s_index_prior = np.sum(self.index_cnt_words, 1)
            s_index_prior = np.append(s_index_prior, self.config.alp)
        logits_index_prior = np.log(s_index_prior)
        return logits_index_prior
    
    def get_logits_index_likelihood(self, topic_root):
        def get_logit_new_index_likelihood(topic_root):
            def get_all_topics(topic):
                topics = [topic]
                for child in topic.children:
                    topics += get_all_topics(child)
                return topics

            all_topics = get_all_topics(topic_root)

            s_child_prior = [topic.cnt_doc_topic for topic in all_topics]
            s_child_prior += [topic_root.config.gam]
            p_child_prior = s_child_prior / np.sum(s_child_prior)

            s_child_likelihood = np.array([topic.cum_words[word_idx] + topic.config.eta**(topic.depth) for topic in all_topics] + [0.])
            z_child_likelihood = np.array([np.sum(topic.cum_words) + topic.config.n_vocab*(topic.config.eta**(topic.depth)) for topic in all_topics] + [0.])

            p_child_likelihood = s_child_likelihood / z_child_likelihood
            logit_new_index_likelihood = np.log(p_child_prior.dot(p_child_likelihood))

            return logit_new_index_likelihood
    
        s_index_likelihood = np.array([topic.cum_words[word_idx] + topic.config.eta**(topic.depth) for topic in self.topics])
        z_index_likelihood = np.array([np.sum(topic.cum_words) + topic.config.n_vocab*(topic.config.eta**(topic.depth)) for topic in self.topics])

        logits_index_likelihood = np.log(s_index_likelihood/z_index_likelihood)
        logit_new_index_likelihood = get_logit_new_index_likelihood(topic_root)
        logits_index_likelihood = np.append(logits_index_likelihood, logit_new_index_likelihood)

        return logits_index_likelihood

In [10]:
def print_child_idxs(topic):
    topic_freq_words = [idx_to_word[bow_idxs[bow_index]] for bow_index in np.argsort(topic.cum_words)[::-1][:10]]
    print('  '*topic.depth, topic.idx, ':', [child.idx for child in topic.children], topic.cnt_doc_topic, np.sum(topic.cnt_words), topic_freq_words)
    for topic in topic.children:
        print_child_idxs(topic)

In [11]:
def get_topic_cnt(docs):
    topic_cnt_words = {}
    topic_cnt_doc_topic = {}
    for doc in docs:
        for index, topic in enumerate(doc.topics):
            if topic not in topic_cnt_words:
                topic_cnt_words[topic] = np.zeros(config.n_vocab)
                topic_cnt_doc_topic[topic] = 0
            topic_cnt_words[topic] += doc.index_cnt_words[index]
            topic_cnt_doc_topic[topic] += 1
    return topic_cnt_words, topic_cnt_doc_topic

def assert_cnt(topic, topic_cnt_words, topic_cnt_doc_topic):
    assert all(topic_cnt_words[topic] == topic.cnt_words)
    assert topic_cnt_doc_topic[topic] == topic.cnt_doc_topic
    for child in topic.children:
        assert_cnt(child, topic_cnt_words, topic_cnt_doc_topic)

def get_cum_doc_topic(topic):
    cum_doc_topic = np.sum(topic.cnt_doc_topic)
    for child in topic.children:
        cum_doc_topic += get_cum_doc_topic(child)
    return cum_doc_topic

def get_cum_words(topic):
    cum_words = np.zeros_like(topic.cnt_words)
    cum_words += topic.cnt_words
    for child in topic.children:
        cum_words += get_cum_words(child)
    return cum_words

def assert_cum(topic_root, cum_doc_topic, cum_words):
    assert topic_root.cum_doc_topic == cum_doc_topic
    assert all(topic_root.cum_words == cum_words)

# run

In [12]:
def get_docs(instances, config):
    docs_bow = [instance.bow for instance in instances]
    docs_raw = [[[bow_index]*int(doc_bow[bow_index]) for bow_index in np.where(doc_bow > 0)[0]] for doc_bow in docs_bow]
    docs_words = [[idx for idxs in doc for idx in idxs] for doc in docs_raw]
    docs = [Doc(idx=doc_idx, words=doc_words, bow=doc_bow, config=config) for doc_idx, (doc_words, doc_bow) in enumerate(zip(docs_words, docs_bow)) if len(doc_words) > 0]
    return docs

train_docs = get_docs(instances_train, config)[:5000]
topic_root = Topic(idx='0', sibling_idx=0, parent=None, depth=1, config=config)

## init

### init t (assign doc to doc_topic_index)

In [13]:
for train_doc in train_docs:
    for word_index, word_idx in enumerate(train_doc.words):
        new_index = train_doc.sample_index(topic_root, init=True, train=True)
        
        if new_index == len(train_doc.index_cnt_words):
            if len(train_doc.index_cnt_words) == 0:
                train_doc.index_cnt_words = np.zeros([1, train_doc.config.n_vocab])
            else:
                train_doc.index_cnt_words = np.concatenate([train_doc.index_cnt_words, np.zeros([1, train_doc.config.n_vocab])], 0)
            
        train_doc.index_cnt_words[new_index, word_idx] += 1
        train_doc.word_indices.append(new_index)
        
    assert len(train_doc.words) == np.sum(train_doc.index_cnt_words) == len(train_doc.word_indices)
    assert len(train_doc.index_cnt_words) == np.max(train_doc.word_indices) + 1

### init k (assign doc_topic_index to topic

In [14]:
for train_doc in train_docs:
    for index, cnt_words in enumerate(train_doc.index_cnt_words):
        new_topic = topic_root.sample_child(init=True, train=True)
        train_doc.topics.append(new_topic)
        
        new_topic.increment_cnt(cnt_words=cnt_words) # increment count of doc_topic & words
            
    assert len(train_doc.topics) == len(train_doc.index_cnt_words)
assert np.sum([len(train_doc.index_cnt_words) for train_doc in train_docs]) == topic_root.cum_doc_topic == get_cum_doc_topic(topic_root)

In [15]:
topic_cnt_words, topic_cnt_doc_topic = get_topic_cnt(train_docs)
assert_cnt(topic_root, topic_cnt_words, topic_cnt_doc_topic)
cum_doc_topic = get_cum_doc_topic(topic_root)
cum_words = get_cum_words(topic_root)
assert_cum(topic_root, cum_doc_topic, cum_words)

In [16]:
print_child_idxs(topic_root)

   0 : [] 17111 92009.0 ['!', 'sleeve', 'carry', 'bought', 'nice', 'room', 'pockets', 'pocket', 'quality', 'price']


### sample t (assign doc to doc_topic_index)

In [23]:
for train_doc in train_docs:
    for word_index, word_idx in enumerate(train_doc.words):
        # refer index of word
        old_index = train_doc.word_indices[word_index]
        old_topic = train_doc.topics[old_index]
        
        # decrement count of words
        train_doc.index_cnt_words[old_index, word_idx] -= 1
        old_topic.decrement_cnt_words(word_idx=word_idx)
        assert train_doc.index_cnt_words[old_index, word_idx] >= 0
        assert old_topic.cnt_words[word_idx] >= 0
        
        # sample topic_index of word
        new_index = train_doc.sample_index(topic_root, init=False, train=True)
        
        if new_index == len(train_doc.index_cnt_words):
            cnt_words = np.zeros([1, train_doc.config.n_vocab])
            train_doc.index_cnt_words = np.concatenate([train_doc.index_cnt_words, cnt_words], 0)
            new_topic = topic_root.sample_child(cnt_words, init=False, train=True)
            new_topic.increment_cnt_doc()
            train_doc.topics.append(new_topic)
                
        new_topic = train_doc.topics[new_index]
        
        # increment count of words
        train_doc.index_cnt_words[new_index, word_idx] += 1
        new_topic.increment_cnt_words(word_idx=word_idx)
        train_doc.word_indices[word_index] = new_index
        
    assert len(train_doc.words) == np.sum(train_doc.index_cnt_words) == len(train_doc.word_indices)
    assert np.sum(np.sum(train_doc.index_cnt_words, 1) > 0) == len(np.unique(train_doc.word_indices))



In [24]:
topic_cnt_words, topic_cnt_doc_topic = get_topic_cnt(train_docs)
assert_cnt(topic_root, topic_cnt_words, topic_cnt_doc_topic)
cum_doc_topic = get_cum_doc_topic(topic_root)
cum_words = get_cum_words(topic_root)
assert_cum(topic_root, cum_doc_topic, cum_words)

In [25]:
print_child_idxs(topic_root)

   0 : ['0-1'] 17066 91226.0 ['!', 'sleeve', 'carry', 'bought', 'nice', 'room', 'pockets', 'pocket', 'quality', 'price']
     0-1 : [] 45 783.0 ['usb', 'card', 'power', 'ports', 'external', 'slot', 'works', 'drive', 'port', 'needed']


### sample k (assign doc to doc_topic_index)

In [26]:
for train_doc in train_docs:
    for index, cnt_words in enumerate(train_doc.index_cnt_words):
        old_topic = train_doc.topics[index]
        
        old_topic.decrement_cnt(cnt_words)
        assert old_topic.cnt_doc_topic >= 0
        assert np.min(old_topic.cnt_words) >= 0
        
        if old_topic.cnt_doc_topic == 0:
            assert np.sum(old_topic.cnt_words) == 0
            old_topic.delete_topic()
        
        new_topic = topic_root.sample_child(cnt_words, init=False, train=True)
        new_topic.increment_cnt(cnt_words) # increment count of doc_topic & words
        train_doc.topics[index] = new_topic
            
    assert len(train_doc.topics) == len(train_doc.index_cnt_words)
    
assert np.sum([len(train_doc.index_cnt_words) for train_doc in train_docs]) == topic_root.cum_doc_topic == get_cum_doc_topic(topic_root)    

In [27]:
topic_cnt_words, topic_cnt_doc_topic = get_topic_cnt(train_docs)
assert_cnt(topic_root, topic_cnt_words, topic_cnt_doc_topic)
cum_doc_topic = get_cum_doc_topic(topic_root)
cum_words = get_cum_words(topic_root)
assert_cum(topic_root, cum_doc_topic, cum_words)

In [28]:
print_child_idxs(topic_root)

   0 : ['0-1'] 16969 90351.0 ['!', 'sleeve', 'carry', 'bought', 'nice', 'room', 'pockets', 'pocket', 'quality', 'price']
     0-1 : [] 142 1658.0 ['usb', 'card', 'drive', 'power', 'ports', 'works', 'drives', 'slot', 'external', 'work']
