In [16]:
%matplotlib inline
import os
import sys

from collections import OrderedDict
from copy import deepcopy
from time import time

import matplotlib.pyplot as plt
import numpy as np
import scipy.special as sc

import pymc3 as pm
import seaborn as sns
import theano
import theano.tensor as tt

from pymc3 import Dirichlet, Poisson, Gamma, Normal, Bernoulli
from pymc3 import math as pmmath
# from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from theano import shared
from theano.sandbox.rng_mrg import MRG_RandomStreams

%env THEANO_FLAGS=device=cpu,floatX=float64


from data_prep import prepare_sparse_matrix

env: THEANO_FLAGS=device=cpu,floatX=float64


In [8]:
n_words = 3000
n_samples = 10000

tf_vectorizer, docs_tr = prepare_sparse_matrix(n_samples, n_words)

In [9]:
feature_names = tf_vectorizer.get_feature_names()

In [10]:
n_tokens = np.sum(docs_tr[docs_tr.nonzero()])
n_tokens

808287

In [11]:
def logp_prob(beta, theta, h):
    """Returns the log-likelihood function for given documents.

    K : number of topics in the model
    V : number of words (size of vocabulary)
    D : number of documents (in a mini-batch)

    Parameters
    ----------
    beta : tensor (K x V)
        Word distributions.
    theta : tensor (D x K)
        Topic distributions for documents.
    """

    """
    \log p(d | theta, beta) = \sum_w \log p(w | theta, beta)
                            = \sum_w \log Poisson(w | theta @ beta) 
                            = \sum_w \log Poisson(w | \sum_k theta_k * beta_k)
                            = \sum_w \log Poisson(w | \sum_k \exp( \log theta_k + \log beta_k ))
                            = \sum_w - \sum_k \exp( \log theta_k + \log beta_k ) + w * \log \sum_k \exp( \log theta_k + \log beta_k ) - \log gamma(w + 1)
    """

    def ll_docs_f(docs):
        dixs, vixs = docs.nonzero()
        vfreqs = docs[dixs, vixs]

        ll_docs = (
            ((theta * h)[dixs] + beta.T[vixs]).sum(1) + 
            vfreqs * pmmath.logsumexp(tt.log((theta * h)[dixs]) + tt.log(beta.T[vixs]), axis=1).ravel() - \
            pm.distributions.special.gammaln(vfreqs + 1)
        )
        # Per-word log-likelihood times num of tokens in the whole dataset
        return tt.sum(ll_docs) / (tt.sum(vfreqs) + 1e-9) * n_tokens

    return ll_docs_f

In [17]:
n_topics = 20
n_subtopics = n_topics // 2
minibatch_size = 128

avg_len = docs_tr.sum(1).mean()

doc_t_minibatch = pm.Minibatch(docs_tr.toarray(), minibatch_size)
doc_t = shared(docs_tr.toarray()[:minibatch_size])

e0 = c0 = 1.
f0 = .01
pn = .5

with pm.Model() as model:
    beta = Dirichlet(
        "beta",
        a=pm.floatX((1.0 / n_topics) * np.ones((n_topics, n_words))),
        shape=(n_topics, n_words),
    )

    gamma0 = Gamma(
        "gamma0", 
        alpha=pm.floatX(e0 * np.ones((1, n_topics))),
        beta=pm.floatX(f0 * np.ones((1, n_topics))),
        shape=(minibatch_size, n_topics)
    )
    
    gamma = Gamma(
        "gamma",
        alpha=gamma0, 
        beta=1 / 0.001,
        shape=(minibatch_size, n_topics)
    )

    theta = Gamma(
        "theta",
        alpha=gamma,
        beta=pm.floatX((pn / (1. - pn)) * np.ones((minibatch_size, n_topics))),
        shape=(minibatch_size, n_topics),
        total_size=n_samples,
    )
    
    w = Normal(
        'w',
        mu=0.,
        sigma=pm.floatX(10. * np.ones((minibatch_size, n_subtopics))),
        shape=(minibatch_size, n_subtopics)
    )
    
    h0 = Bernoulli(
        'h0',
        p=0.5,
        shape=(n_subtopics, n_topics),
    )
    
    h = Bernoulli(
        'h',
        logit_p=w @ h0,
        shape=(minibatch_size, n_topics)
    )
    
    # Note, that we defined likelihood with scaling, so here we need no additional `total_size` kwarg
    doc = pm.DensityDist("doc", logp_prob(beta, theta, h), observed=doc_t)

    step = pm.Metropolis()
    trace = pm.sample(1000, step)
    

  self.shared = theano.shared(data[in_memory_slc])
  rval = inputs[0].__getitem__(inputs[1:])
  rval = inputs[0].__getitem__(inputs[1:])
  rval = inputs[0].__getitem__(inputs[1:])
Multiprocess sampling (4 chains in 4 jobs)
CompoundStep
>Metropolis: [h]
>Metropolis: [h0]
>Metropolis: [w]
>Metropolis: [theta]
>Metropolis: [gamma]
>Metropolis: [gamma0]
>Metropolis: [beta]
INFO (theano.gof.compilelock): Waiting for existing lock by process '3751047' (I am process '3760460')
INFO (theano.gof.compilelock): To manually release the lock, delete /home/liutianc/.theano/compiledir_Linux-5.4--generic-x86_64-with-debian-bullseye-sid-x86_64-3.7.7-64/lock_dir
INFO (theano.gof.compilelock): Waiting for existing lock by process '3751047' (I am process '3760460')
INFO (theano.gof.compilelock): To manually release the lock, delete /home/liutianc/.theano/compiledir_Linux-5.4--generic-x86_64-with-debian-bullseye-sid-x86_64-3.7.7-64/lock_dir
INFO (theano.gof.compilelock): Waiting for existing lock by proces

# Evaluate Top words in each topic basd on the posterior mean.

In [18]:
def print_top_words(beta, feature_names, n_top_words=10):
    for i in range(len(beta)):
        print(
            ("Topic #%d: " % i)
            + " , ".join([feature_names[j] for j in beta[i].argsort()[: -n_top_words - 1 : -1]])
        )

In [19]:
with model:
    ppc = pm.sample_posterior_predictive(
        trace, var_names=["theta", "beta"], samples=1000, random_seed=1
    )

  "samples parameter is smaller than nchains times ndraws, some draws "
100%|██████████| 1000/1000 [00:00<00:00, 1045.20it/s]


In [20]:
topic_dist = ppc['beta']
topic_dist_mean = topic_dist.mean(0)
topic_dist_mean.shape

(20, 3000)

In [21]:
print_top_words(topic_dist_mean, feature_names)

Topic #0: abandoned , abilities , ability , able , absolute , absolutely , abuse , abysmal , accent , acceptable
Topic #1: abandoned , abilities , ability , able , absolute , absolutely , abuse , abysmal , accent , acceptable
Topic #2: abandoned , abilities , ability , able , absolute , absolutely , abuse , abysmal , accent , acceptable
Topic #3: abandoned , abilities , ability , able , absolute , absolutely , abuse , abysmal , accent , acceptable
Topic #4: abandoned , abilities , ability , able , absolute , absolutely , abuse , abysmal , accent , acceptable
Topic #5: abandoned , abilities , ability , able , absolute , absolutely , abuse , abysmal , accent , acceptable
Topic #6: abandoned , abilities , ability , able , absolute , absolutely , abuse , abysmal , accent , acceptable
Topic #7: abandoned , abilities , ability , able , absolute , absolutely , abuse , abysmal , accent , acceptable
Topic #8: abandoned , abilities , ability , able , absolute , absolutely , abuse , abysmal , acc

In [22]:
with model:
    priorpc = pm.sample_prior_predictive(
        var_names=["n", "beta", "theta"], samples=50, random_seed=1
    )

KeyError: 'n'