In [63]:
import os
import pandas as pd
import numpy as np

# Project directories, resolved relative to the current user's home folder.
# Both directory strings end with a separator because later cells append
# file names to them by plain string concatenation.
HOME_PATH = os.path.expanduser('~')
DATA_PATH = HOME_PATH + '/Projects/ssmsi/pickles/corpora/a-to-h/'
OUT_PATH = HOME_PATH + '/Projects/ssmsi/pickles/corpora/synthetic/'

# Read the pickled vocabulary and turn it into a plain Python list so that a
# word can be looked up by its integer index.
vocab_path = os.path.join(DATA_PATH, 'a-to-h_vocab.pkl')
vocab_pickle = pd.read_pickle(vocab_path)
vocab = vocab_pickle.tolist()

In [64]:
def mean_parameter(arr):
    """Map natural parameters to a probability vector (softmax).

    The normalisation runs over *all* elements of ``arr``, so the result has
    the same shape as the input and sums to 1.

    Parameters
    ----------
    arr : array_like
        Unnormalised log-weights.

    Returns
    -------
    numpy.ndarray
        ``exp(arr) / sum(exp(arr))``, evaluated in a numerically stable way.
    """
    arr = np.asarray(arr)
    if arr.size == 0:
        # Nothing to normalise; mirror the naive formula, which yields an
        # empty array for empty input.
        return np.exp(arr)
    # Subtracting the maximum leaves the softmax unchanged mathematically but
    # keeps exp() from overflowing to inf (large inputs) or underflowing to an
    # all-zero vector (very negative inputs), both of which would produce NaN.
    weights = np.exp(arr - np.max(arr))
    return weights / np.sum(weights)

In [65]:
# Model dimensions and generation settings.
K = 10           # number of topics
V = len(vocab)   # vocabulary size
xi = 1000        # Poisson rate used for the number of words per document
T = 50           # number of time steps in the random walks below

# Seed NumPy's global generator so that re-running the notebook reproduces
# the same parameters and corpus. Set SEED = None to draw fresh OS entropy
# on every run instead.
SEED = 0
np.random.seed(SEED)

alphas = np.zeros((T, K))
betas = np.zeros((T, K, V))
# NOTE(review): despite the "var_" prefix these values are passed as the
# `scale` argument of np.random.normal, i.e. they act as standard deviations.
var_init = 2
var_basic = 0.5

# Initial topic-word parameters: zero-mean Gaussian noise for every (k, v).
beta_0 = np.random.normal(0.0, var_init, size=(K, V))

# Gaussian random walk over time for the topic-word parameters.
# `range` is used instead of `xrange` so the cell runs on Python 2 and 3.
betas[0] = beta_0
for t in range(1, T):
    betas[t] = np.random.normal(betas[t - 1], var_basic)

# Same construction for the topic-proportion parameters.
alphas[0] = np.random.normal(alphas[0], var_init)
for t in range(1, T):
    alphas[t] = np.random.normal(alphas[t - 1], var_basic)

### Option: Dynamic Generative Process

In [66]:
# Build one bag-of-words document per time step d:
#   corpus_dyn[d] maps each sampled word to the number of times it was drawn.
corpus_dyn = {}
for d in range(T):
    word_counts = {}
    N = np.random.poisson(xi)  # document length

    # The topic proportions depend only on d, so compute them once per
    # document rather than once per word.
    eta = mean_parameter(alphas[d])

    # The word distribution of a topic depends only on (d, z). Cache it the
    # first time topic z is drawn so the softmax over the whole vocabulary is
    # not recomputed for every single word.
    topic_word_probs = {}

    for n in range(N):
        # One-hot multinomial draw; argmax recovers the sampled index.
        z = int(np.argmax(np.random.multinomial(1, eta)))
        if z not in topic_word_probs:
            topic_word_probs[z] = mean_parameter(betas[d][z])
        w = int(np.argmax(np.random.multinomial(1, topic_word_probs[z])))

        w_key = vocab[w]
        word_counts[w_key] = word_counts.get(w_key, 0) + 1

    corpus_dyn[d] = word_counts

### Dump the corpus

In [68]:
import pickle
# Serialise the dynamic corpus; the number of time steps T is part of the
# file name.
out_file = OUT_PATH + 'd-corpus_' + str(T) + '.pkl'
with open(out_file, 'wb') as handle:
    pickle.dump(corpus_dyn, handle)

### Option: Static Generative Process

In [7]:
# NOTE(review): `alpha`, `D` and `beta` are not assigned in any cell visible
# here, so this cell presumably relies on state from a cell that is missing or
# was run out of order -- confirm before using it on a fresh kernel.
# `beta[z]` is handed to np.random.multinomial as-is, so each row is assumed
# to already be a probability vector -- TODO confirm.
theta = np.random.dirichlet(alpha)

# generated_corpus[i] maps each sampled word of document i to its count.
generated_corpus = {}
for i in range(D):
    doc_counts = {}
    generated_corpus[i] = doc_counts
    N = np.random.poisson(xi)
    for j in range(N):
        # Each draw is a one-hot vector; argmax gives the sampled index.
        z = np.argmax(np.random.multinomial(1, theta))
        w = np.argmax(np.random.multinomial(1, beta[z]))
        w_key = vocab[w]
        doc_counts[w_key] = doc_counts.get(w_key, 0) + 1