In [11]:
import os
import pandas as pd
import numpy as np

# Filesystem layout, rooted at the current user's home directory.
HOME_PATH = os.path.expanduser('~')
# Input directory: the vocabulary pickle is read from here.
DATA_PATH = ''.join((HOME_PATH, '/Projects/ssmsi/pickles/corpora/a-to-h/'))
# Output directory: the generated corpora are written here.
OUT_PATH = ''.join((HOME_PATH, '/Projects/ssmsi/pickles/corpora/synthetic/'))

In [2]:
# Load the pickled vocabulary from the input directory.
# NOTE(review): unpickling can execute arbitrary code -- only load this file
# from a trusted source.
vocab_pickle = pd.read_pickle(DATA_PATH + 'a-to-h_vocab.pickle')
# Convert to a plain Python list so that vocabulary entries can be looked up
# by integer position (the generative cells index it as vocab[w]).
vocab = vocab_pickle.tolist()

In [3]:
# Dimensions of the synthetic corpus.
K = 10           # number of topics (length of alpha, rows of beta)
V = len(vocab)   # vocabulary size (columns of beta)
D = 500          # number of documents generated by each process
xi = 500         # Poisson rate used to draw each document's word count

# Topic-level parameter vector: K independent uniform draws on [0, 1).
alpha = np.random.rand(K)

# Topic-word matrix of shape (K, V): every row is an independent draw from a
# symmetric Dirichlet over the vocabulary with concentration 1/V per entry.
# Rows are drawn one after another, after alpha, to keep the draw order.
beta = np.array([np.random.dirichlet(np.full(V, 1./V)) for _ in range(K)])

In [4]:
def mean_parameter(arr):
    """Map a vector of natural parameters to probabilities (softmax).

    Computes ``exp(arr) / sum(exp(arr))`` over all entries of ``arr``.
    The maximum entry is subtracted before exponentiating; this leaves the
    result mathematically unchanged but keeps ``np.exp`` from overflowing to
    ``inf`` (and the ratio from becoming ``nan``) when entries are large.

    Parameters
    ----------
    arr : array_like
        Real-valued scores. Normalisation is over the whole array.

    Returns
    -------
    numpy.ndarray
        Array of the same shape as ``arr`` whose entries are non-negative and
        sum to 1. An empty input yields an empty array.
    """
    arr = np.asarray(arr)
    if arr.size == 0:
        # Nothing to normalise; np.max would raise on an empty array.
        return np.exp(arr)
    # Shift so the largest exponent is exp(0) == 1.
    weights = np.exp(arr - np.max(arr))
    return weights / np.sum(weights)

### Option: Static Generative Process

In [5]:
# Static process: one topic mixture `theta` is drawn from Dirichlet(alpha)
# and shared by every document; `beta` is used as-is for the word probabilities.
theta = np.random.dirichlet(alpha)
generated_corpus = {}
for doc_id in range(D):
    # Bag of words for this document: vocabulary entry -> occurrence count.
    word_counts = generated_corpus[doc_id] = {}
    # Document length.
    N = np.random.poisson(xi)
    for _ in range(N):
        # A single-trial multinomial draw is one-hot; the position of the 1
        # is the sampled topic.
        topic_draw = np.random.multinomial(1, theta)
        z = np.flatnonzero(topic_draw == 1)[0]
        # Same trick to sample a word index from the chosen topic's row.
        word_draw = np.random.multinomial(1, beta[z])
        w = np.flatnonzero(word_draw == 1)[0]
        w_key = vocab[w]
        word_counts[w_key] = word_counts.get(w_key, 0) + 1

# Store the corpus as a pandas Series (document id -> word-count dict).
corpus_panda = pd.Series(generated_corpus)
corpus_panda.to_pickle(OUT_PATH + 'naive_corpus.pickle')

### Option: Dynamic Generative Process

In [5]:
# Dynamic process: documents are generated in sequence, and after each
# document both `alpha` and `beta` take one Gaussian random-walk step
# (np.random.normal centred on the current value, default scale).
# Real-valued parameters are mapped to probabilities with mean_parameter().
seq_generated_corpus = {}
for i in range(D):
    seq_generated_corpus[i] = {}
    # Document length.
    N = np.random.poisson(xi)
    # Per-document topic scores, centred on the current alpha.
    eta = np.random.normal(alpha)
    # eta does not change within a document, so its probabilities are
    # computed once here instead of once per word.
    mp_eta = mean_parameter(eta)
    # beta is also fixed within a document: cache each topic's word
    # probabilities the first time that topic is drawn.
    mp_beta_by_topic = {}
    for j in range(N):
        # A single-trial multinomial draw is one-hot; locate the 1.
        topic_distrib = np.random.multinomial(1, mp_eta)
        z = np.where(topic_distrib == 1)[0][0]
        if z not in mp_beta_by_topic:
            mp_beta_by_topic[z] = mean_parameter(beta[z])
        mp_beta = mp_beta_by_topic[z]
        word_distrib = np.random.multinomial(1, mp_beta)
        w = np.where(word_distrib == 1)[0][0]
        w_key = vocab[w]
        seq_generated_corpus[i][w_key] = seq_generated_corpus[i].get(w_key, 0) + 1
    # NOTE(review): these two lines rebind the notebook-level `beta` and
    # `alpha`. Re-running this cell continues the walk from the drifted values,
    # and any cell run afterwards that reads `alpha`/`beta` sees them too.
    beta = np.random.normal(beta)
    alpha = np.random.normal(alpha)

# seq_corpus_panda = pd.Series(seq_generated_corpus)
# seq_corpus_panda.to_pickle(OUT_PATH + 'seq_corpus.pickle')

In [20]:
# Disabled alternative: save through pandas as a Series with pickle protocol 2.
# seq_corpus_panda = pd.Series(seq_generated_corpus)
# seq_corpus_panda.to_pickle(OUT_PATH + 'seq-corpus_500docs.pickle', protocol=2)

# Active path: write the plain dict with the standard-library pickler,
# using protocol 4.
import pickle
with open(OUT_PATH + 'seq-corpus_500docs.pickle', 'wb') as out_file:
    pickle.Pickler(out_file, protocol=4).dump(seq_generated_corpus)
