In [2]:
import os
import re
import _pickle as cPickle
from collections import OrderedDict, defaultdict, Counter

import matplotlib
%matplotlib inline
import matplotlib.pyplot as plt

import numpy as np
import pandas as pd
import tensorflow as tf
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk import word_tokenize

from data_structure import Instance

# configure

In [37]:
def del_all_flags(FLAGS):
    """Delete, via ``FLAGS.__delattr__``, every name listed by ``FLAGS._flags()``.

    The names are copied into a list before anything is deleted, so the
    mapping returned by ``FLAGS._flags()`` is never iterated while it is
    being modified.
    """
    for flag_name in list(FLAGS._flags()):
        FLAGS.__delattr__(flag_name)

# Drop any flags that are already registered before defining them below;
# presumably this lets the cell be re-run in a live kernel without
# "flag already defined" errors -- confirm against the flags library in use.
del_all_flags(tf.flags.FLAGS)

flags = tf.app.flags

flags.DEFINE_string('output_path', 'data/synthetic/instances.pkl', 'path of output data')

flags.DEFINE_integer('n_vocab', 9, 'size of vocab')
# Help text previously duplicated the n_vocab description.
flags.DEFINE_integer('n_doc', 100, 'number of documents')

# Dummy string flag named 'f' (help text: 'kernel'). Presumably it absorbs the
# `-f <connection file>` argument Jupyter passes to the kernel process so that
# flag parsing does not reject it -- TODO confirm.
flags.DEFINE_string('f', '', 'kernel')

config = flags.FLAGS

# sample docs

In [4]:
# Topic tree as an adjacency mapping: parent topic index -> ordered list of
# child topic indices. Topic 0 has children 1-3; each of those has three
# children of its own (10-12, 20-22, 30-32), which have no entry here.
tree_idxs = {
    0: [1, 2, 3],
    1: [10, 11, 12],
    2: [20, 21, 22],
    3: [30, 31, 32],
}

In [35]:
# Unnormalized word weights per topic over a 9-word vocabulary that is split
# into three consecutive groups of three words.
tree_topic_bow_raw = {0: np.ones(9, dtype=np.float32)}  # topic 0: every word

# Topics 1-3: topic g puts weight 1 on each word of the g-th group.
tree_topic_bow_raw.update({
    group: np.repeat(np.eye(3, dtype=np.float32)[group - 1], 3)
    for group in (1, 2, 3)
})

# Topics 10*g + w (w = 0..2): weight 1 on a single word, the w-th word of group g.
tree_topic_bow_raw.update({
    10 * group + word: np.eye(9, dtype=np.float32)[3 * (group - 1) + word].copy()
    for group in (1, 2, 3)
    for word in range(3)
})

# Divide each topic's weights by their total so every vector sums to 1.
tree_topic_bow = {topic_idx: weights / weights.sum() for topic_idx, weights in tree_topic_bow_raw.items()}

In [36]:
# Display the normalized per-topic word weights for a visual check.
tree_topic_bow

{0: array([0.11111111, 0.11111111, 0.11111111, 0.11111111, 0.11111111,
        0.11111111, 0.11111111, 0.11111111, 0.11111111], dtype=float32),
 1: array([0.33333334, 0.33333334, 0.33333334, 0.        , 0.        ,
        0.        , 0.        , 0.        , 0.        ], dtype=float32),
 2: array([0.        , 0.        , 0.        , 0.33333334, 0.33333334,
        0.33333334, 0.        , 0.        , 0.        ], dtype=float32),
 3: array([0.        , 0.        , 0.        , 0.        , 0.        ,
        0.        , 0.33333334, 0.33333334, 0.33333334], dtype=float32),
 10: array([1., 0., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 11: array([0., 1., 0., 0., 0., 0., 0., 0., 0.], dtype=float32),
 12: array([0., 0., 1., 0., 0., 0., 0., 0., 0.], dtype=float32),
 20: array([0., 0., 0., 1., 0., 0., 0., 0., 0.], dtype=float32),
 21: array([0., 0., 0., 0., 1., 0., 0., 0., 0.], dtype=float32),
 22: array([0., 0., 0., 0., 0., 1., 0., 0., 0.], dtype=float32),
 30: array([0., 0., 0., 0., 0., 0.

In [38]:
def hierarchical_sbp(tree_sticks_topic, tree_sticks_branch, tree=None):
    """Turn per-node stick proportions into a probability for every topic in the tree.

    Topic 0 keeps ``tree_sticks_topic[0]`` of the unit mass and hands the rest
    down to its children. Among the children of one parent, the handed-down
    mass is split by stick breaking: each child except the last takes the
    fraction ``tree_sticks_branch[child]`` of what its earlier siblings left
    over, and the last child takes the whole remainder. A child that has
    children of its own keeps the fraction ``tree_sticks_topic[child]`` of its
    share and hands the rest further down; a child without children keeps its
    entire share. Consequently the returned probabilities add up to 1.

    Args:
        tree_sticks_topic: indexable by topic index; must provide a value for
            topic 0 and for every topic that has children.
        tree_sticks_branch: indexable by topic index; must provide a value for
            every child that is not the last child of its parent.
        tree: optional mapping ``parent index -> ordered list of child
            indices``. Defaults to the notebook-level ``tree_idxs``. The
            mapping must be iterated parents-first (a parent's entry has to
            come before the entries of its children), and the root must be 0.

    Returns:
        dict mapping every topic index to its probability.

    Raises:
        KeyError: if a required stick is missing, or if a parent is visited
            before the mass handed down to it has been computed.
    """
    if tree is None:
        tree = tree_idxs

    tree_prob_topic = {}
    rest_topics = {}  # mass each internal topic passes on to its children

    # root topic
    stick_topic = tree_sticks_topic[0]
    tree_prob_topic[0] = stick_topic
    rest_topics[0] = 1. - stick_topic

    for parent_idx, child_idxs in tree.items():
        rest_topic = rest_topics[parent_idx]
        rest_branch = 1.
        last_pos = len(child_idxs) - 1
        for pos, child_idx in enumerate(child_idxs):
            if pos == last_pos:
                # The last child takes everything its siblings left over. No
                # branch stick is read here, so a parent with a single child
                # works and no stale stick from an earlier sibling is reused.
                prob_branch = rest_branch  # phi
                rest_branch = 0.
            else:
                stick_branch = tree_sticks_branch[child_idx]  # psi
                prob_branch = stick_branch * rest_branch  # phi
                rest_branch = (1. - stick_branch) * rest_branch

            if child_idx in tree:
                # Internal topic: keep a fraction, pass the rest to its children.
                stick_topic = tree_sticks_topic[child_idx]  # upsilon
                tree_prob_topic[child_idx] = stick_topic * prob_branch * rest_topic  # pi
                rest_topics[child_idx] = (1. - stick_topic) * prob_branch * rest_topic
            else:
                # Topic without children: keeps its whole share, passes nothing on.
                tree_prob_topic[child_idx] = prob_branch * rest_topic  # pi

    return tree_prob_topic

In [None]:
# Scratch check: draw 5 samples from a Beta(10, 15) distribution.
# NOTE(review): no random seed is set in the visible cells, so the values differ between runs.
np.random.beta(10, 15, 5)

In [None]:
# TODO: the per-document step has not been written yet (presumably one
# synthetic document is to be sampled per iteration -- confirm intent).
# `pass` keeps the cell syntactically valid; an empty loop body is a SyntaxError.
for i_doc in range(config.n_doc):
    pass

# write out

In [342]:
def prepare_instances(data_df, word_to_idx, bow_list, item_idx_summaries=None):
    """Build one ``Instance`` per row of ``data_df``.

    Args:
        data_df: frame iterated with ``iterrows()``; each row must expose
            ``review_idx``, ``item_idx``, ``score``, ``tokens`` (an iterable of
            token sequences, one per sentence), ``doc_l`` and ``max_sent_l``.
        word_to_idx: mapping from token to index. A token that is not in the
            mapping is looked up under the ``UNK`` key instead.
        bow_list: object with a ``toarray()`` method (presumably a sparse
            bag-of-words matrix -- confirm) yielding one row per row of
            ``data_df``.
        item_idx_summaries: optional mapping ``item_idx -> summaries``. When it
            is truthy, every instance also gets a ``summaries`` attribute.

    Returns:
        list of populated ``Instance`` objects, in row order.

    Raises:
        AssertionError: if the number of bag-of-words rows differs from the
            number of rows in ``data_df``.
    """
    dense_bows = bow_list.toarray()
    assert len(dense_bows) == len(data_df)

    prepared = []
    for dense_bow, (row_label, row) in zip(dense_bows, data_df.iterrows()):
        inst = Instance()
        inst.idx = row_label
        inst.review_idx = row.review_idx
        inst.item_idx = row.item_idx
        inst.score = row.score
        # One list of indices per sentence; UNK is only looked up when an
        # out-of-vocabulary token is actually encountered.
        inst.token_idxs = [
            [word_to_idx[tok if tok in word_to_idx else UNK] for tok in sentence]
            for sentence in row.tokens
        ]
        inst.doc_l = row.doc_l
        inst.max_sent_l = row.max_sent_l
        inst.bow = dense_bow
        if item_idx_summaries:
            inst.summaries = item_idx_summaries[inst.item_idx]
        prepared.append(inst)
    return prepared

In [343]:
# Build the train / validation / test instance lists.
# Only the test split is given item_idx_summaries, so only test instances can
# receive a `summaries` attribute.
# NOTE(review): train_df, valid_df, test_df, word_to_idx, the *_bow_list
# matrices and item_idx_summaries are not defined in the cells visible here --
# confirm where they come from before relying on Restart & Run All.
instances_train = prepare_instances(train_df, word_to_idx, train_bow_list)
instances_valid = prepare_instances(valid_df, word_to_idx, valid_bow_list)
instances_test = prepare_instances(test_df, word_to_idx, test_bow_list, item_idx_summaries)

In [344]:
print('saving preprocessed instances...')
# Write through a context manager so the file is flushed and closed as soon as
# the dump finishes, even if pickling raises; the bare open() call left the
# handle open.
with open(config.output_path, 'wb') as f_out:
    cPickle.dump((instances_train, instances_valid, instances_test, word_to_idx, idx_to_word, bow_idxs), f_out)

saving preprocessed instances...
