In [93]:
import numpy as np
import time
import pandas as pd
import numpy.random as random
import matplotlib.pyplot as plt
import math
from gensim import corpora
from scipy import optimize
import scipy

In [94]:
#load data
data = pd.read_csv('poliblogs2008.csv')

### Ingest corpus to create documents and vocab

In [95]:
# load corpus
documents = corpora.MmCorpus('corpus.mm')
dictionary = corpora.Dictionary.load('dictionary')

In [96]:
vocab = dictionary.token2id

In [97]:
def init_stm(documents, settings): 
      
    K = settings['dim']['K']
    V = settings['dim']['V']
    A = settings['dim']['A']
    N = settings['dim']['N']
    
    #Random initialization
    mu = np.array([0]*(K-1))[:,None]
    sigma = np.zeros(((K-1),(K-1)))
    diag = np.diagonal(sigma, 0)
    diag.setflags(write=True)
    diag.fill(20)
    beta = random.gamma(.1,1, V*K).reshape(K,V)
    beta = (beta / beta.sum(axis=1)[:,None])
    lambd = np.zeros((N, (K-1)))
    
    #turn beta into a list and assign it for each aspect
    beta = [beta, beta] # FOR A=2
    kappa_initialized = init_kappa(documents, K, V, A, interactions=settings['kappa']['interactions'])
    
    #create model object
    model = {'mu':mu, 'sigma':sigma, 'beta': beta, 'lambda': lambd, 'kappa':kappa_initialized}
    
    return(model)

def init_kappa(documents, K, V, A, interactions): 
    # read in documents and vocab
    flat_documents = [item for sublist in documents for item in sublist]
    m = []

    total_sum = sum(n for _, n in flat_documents)

    for elem in flat_documents: 
        m.append(elem[1] / total_sum)

    m = np.log(m) - np.log(np.mean(m)) #logit of m


    #Defining parameters
    aspectmod = A > 1 # if there is more than one topical content variable
    if(aspectmod):
        interact = interactions # allow for the choice to interact
    else:
        interact = FALSE

    #Create the parameters object
    parLength = K + A * aspectmod + (K*A)*interact

    #create covariates. one element per item in parameter list.
    #generation by type because its conceptually simpler
    if not aspectmod & interact:
        covar = {'k': np.arange(K),
             'a': np.repeat(np.nan, parLength), #why parLength? 
             'type': np.repeat(1, K)}

    if(aspectmod & interact == False):
        covar = {'k': np.append(np.arange(K), np.repeat(np.nan, A)),
                 'a': np.append(np.repeat(np.nan, K), np.arange(A)), 
                 'type': np.append(np.repeat(1, K), np.repeat(2, A))}      
    if(interact):
        covar = {'k': np.append(np.arange(K), np.append(np.repeat(np.nan, A), np.repeat(np.arange(K), A))),
                 'a': np.append(np.repeat(np.nan, K), np.append(np.arange(A), np.repeat(np.arange(A), K))), 
                 'type': np.append(np.repeat(1, K), np.append(np.repeat(2, A),  np.repeat(3,K*A)))}

    kappa = {'out': {'m':m,
                     'params' : np.tile(np.repeat(0,V), (parLength, 1)),
                     'covar' : covar
                     #'kappasum':, why rolling sum?
                    }
            }

    return(kappa['out'])

### Make Topic Matrix

In [98]:
prevalence = 'blog'
content = 'rating'

In [99]:
def makeTopMatrix(x, data=None):
    return(data.loc[:,x])

In [100]:
xmat = makeTopMatrix(content, data)

In [101]:
yvar = makeTopMatrix(content, data)
yvar = yvar.astype('category')
yvarlevels = set(yvar)
betaindex = yvar.cat.codes

In [102]:
A = len(set(betaindex))

# Setting control variables

In [103]:

K = 10 #settings.dim
V = len(dictionary) #settings.dim
N = len(documents) #settings.dim

interactions = True #settings.kappa
verbose = True

init_type = "Random" #settings.init
ngroups = 1 #settings.ngroups
max_em_its = 15 #settings.convergence
emtol = 1e-5 #settings.convergence

#gamma_prior=("Pooled","L1") # settings.gamma.prior
#sigma_prior=0 #settings.sigma.prior
#kappa_prior=("L1","Jeffreys") # settings.kappa.prior

#Initialize parameters

settings = {
    'dim':{
        'K': K, #number of topics
        'V' :V, #number of words
        'A' : A, #dimension of topical content
        'N' : N,
        'wcounts':V
    },
    'kappa':{
        'interactions':True,
        'fixedintercept': True,
        'contrats': False,
        'mstep': {'tol':0.01, 'maxit':5}},
    'tau':{
        'mode': np.nan,
        'tol': 1e-5,
        'enet':1,
        'nlambda':250,
        'lambda.min.ratio':.001,
        'ic.k':2,
        'maxit':1e4},
    'init':{
        'mode':init_type, 
        'nits':20,
        'burnin':25,
        'alpha':50/K,
        'eta':.01,
        's':.05,
        'p':3000},
    'convergence':{
        'max.em.its':max_em_its,
        'em.converge.thresh':emtol,
        'allow.neg.change':True,},
    'covariates':{
        'X':xmat,
        'betaindex':betaindex,
        'yvarlevels':yvarlevels,
        'formula': prevalence,},
    'gamma':{
        'mode':np.nan,
        'prior':np.nan,
        'enet':1, 
        'ic.k':2,
        'maxits':1000,},
    'sigma':{
        #'prior':sigma_prior,
        'ngroups':ngroups,},
}

In [104]:
def stm_control(documents, vocab, settings, model=None):
    
    ##########
    #Step 1: Initialize Parameters
    ##########
    
    #ngroups = settings$ngroups
    
    if model == None:
        print('Call init_stm()')
        model = init_stm(documents, settings) #initialize
    else: 
        model = model
        
    # unpack initialized model
    
    mu = model['mu']
    sigma = model['sigma']
    lambd = model['lambda'] 
    beta = {'beta': model['beta'],
            'kappa': model['kappa']}
    
    convergence = None
    
    #discard the old object
    del model
    
    betaindex = settings['covariates']['betaindex']
    
    #Pull out some book keeping elements
    ntokens = settings['dim']['wcounts']
    betaindex = settings['covariates']['betaindex']
    stopits = True
    
    ############
    #Step 2: Run EM
    ############
    
    while stopits == True:
        #####
        # Non-Blocked Updates
        #####
        t1 = time.process_time()

        #run the model
        suffstats = e_step(documents, mu, sigma, lambd, beta)
        
        print("Completed E-Step ({} seconds). \n".format(math.floor((time.process_time()-t1))))


        #unpack variables 
        
        #t1 = process_time()
        #sigma_ss = suffstats['sigma']
        #lambd <- suffstats['lambd']
        #beta_ss <- suffstats['beta']
        #bound_ss <- suffstats['bound']
        #do the m-step
        #mu = opt_mu(lambd=lambd,
        #            mode=settings['gamma']['mode'],
        #            covar=settings['covariates']['X'],
        #            enet=settings['gamma']['enet'],ic.k=settings$gamma$ic.k,
        #            maxits=settings['gamma']['maxits'])
        #sigma = opt_sigma(nu=sigma_ss, lambd=lambd,
        #                     mu=mu['mu'], sigprior=settings['sigma']['prior'])
        #beta = opt_beta(beta_ss, beta['kappa'], settings)

In [105]:
stm_control(documents, vocab, settings, model=None)

Call init_stm()


NameError: name 'e_step' is not defined

From **A Model of Text for Experimentation in the Social Sciences (Roberts et al.)**:
- $\beta_k$ is a V-dimensional probability mass function that controls the frequency according to which terms are generated from that topic.

From **The Structural Topic Model and Applied Social Science (Roberts et al.)**:
- 'For the nonconjugate logistic normal variables in the E-step we use a Laplace approximation.'

In [106]:
def e_step(documents, mu, sigma, lambd, beta):
    #quickly define useful constants
    V = beta['beta'][0].shape[1] # ncol
    K = beta['beta'][0].shape[0] # nrow
    N = len(documents)
    A = len(beta['beta'])
    
    # 1) Initialize Sufficient Statistics 
    sigma_ss = np.zeros(((K-1),(K-1)))
    if A == 2: 
        beta_ss = [np.zeros((K,V)), np.zeros((K,V))]
    else:
        print('Error: Only two metadata columns allowed.')
    bound = np.repeat(0,N)
    #lambd = np.repeat(0,N)
    
    # 2) Precalculate common components
    sigobj = np.linalg.cholesky(sigma)
    sigmaentropy = np.sum(np.log(np.diag(sigobj)))
    siginv = np.linalg.inv(sigobj).T*np.linalg.inv(sigobj)
    
    # 3) Document Scheduling
    # For right now we are just doing everything in serial.
    # the challenge with multicore is efficient scheduling while
    # maintaining a small dimension for the sufficient statistics.
    for i in range(N): 
        # update components
        doc = documents[i]
        words = [x for x,y in doc]
        doc_ct = [y for x,y in doc]
        aspect = betaindex[i]
        init = lambd[i]
        #if update_mu: mu_i = mu[i]
        beta_i = beta['beta'][aspect][:,[words]]
        #doc_ct = np.array([y for x,y in doc]) #count of words in document
        Ndoc = np.sum(doc_ct)

        # initial values
        eta = init
        mu = mu.flatten()
        eta_long = np.insert(eta,-1,0)
        beta = beta_i.reshape(10,beta_i.shape[2])
        neta = len(eta)
        theta = softmax(eta_long)
        phi = softmax_weights(eta_long, beta)
        
        #infer the document with the logistic normal distribution
        doc_results = logisticnormalcpp(
            eta=eta,
            mu=mu,
            siginv=siginv,
            beta_i=beta, 
            doc_ct=doc_ct,
            Ndoc=Ndoc)

        # update sufficient statistics

# Let's start by assuming its one beta and we may have arbitrarily subset the number of docs.
```python
> def estep(documents=documents, beta_index=betaindex,
          update_mu=True,
          beta['beta'], lambd, mu, sigma,
          verbose):
```

In [119]:

model = init_stm(documents, settings) #initialize


# unpack initialized model

mu = model['mu']
sigma = model['sigma']
lambd = model['lambda'] 
beta = {'beta': model['beta'],
        'kappa': model['kappa']}

convergence = None

#discard the old object
del model

betaindex = settings['covariates']['betaindex']

#Pull out some book keeping elements
ntokens = settings['dim']['wcounts']
betaindex = settings['covariates']['betaindex']
stopits = True
    
#set global parameter for sigma
sigobj = np.linalg.cholesky(sigma)
sigmaentropy = np.sum(np.log(np.diag(sigobj)))
siginv = np.linalg.inv(sigobj).T*np.linalg.inv(sigobj)

#set parameters for one document (i)
i = 0
doc = documents[i]
words = [x for x,y in doc]
aspect = betaindex[i]
init = lambd[i]
beta_i = beta['beta'][aspect][:,[words]]
eta=init
mu_i=mu
siginv = siginv
doc=doc

#set document specs
doc_ct = np.array([y for x,y in doc]) #count of words in document
Ndoc = np.sum(doc_ct)

# initial values
eta_long = np.insert(eta,-1,0)
beta = beta_i.reshape(10,beta_i.shape[2])
neta = len(eta)
theta = softmax(eta_long)
phi = softmax_weights(eta_long, beta)
mu = mu_i.flatten()

In [70]:
def logisticnormalcpp(
    eta,
    mu_i,
    siginv,
    beta_i,
    doc_ct,
    Ndoc):
    
    # initial values
    eta_long = np.insert(eta,-1,0)
    beta = beta_i.reshape(10,beta_i.shape[2])
    neta = len(eta)
    theta = softmax(eta_long)
    phi = softmax_weights(eta_long, beta)
    
    optim_out = optimize.fmin_bfgs(lhood,
                                   x0=eta,
                                   args=(beta_i, doc_ct, mu, siginv, Ndoc, eta_long),
                                   fprime=grad)

In [87]:
logisticnormalcpp(eta=eta,
                  mu_i=mu,
                  beta_i=beta_i,
                  doc_ct=doc_ct, siginv=siginv,
                  Ndoc=Ndoc)

ValueError: matmul: Input operand 1 does not have enough dimensions (has 0, gufunc core with signature (n?,k),(k,m?)->(n?,m?) requires 1)

# Optimize

In [108]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

In [109]:
def softmax_weights(x, weight):
    """Compute softmax values for each sets of scores in x."""
    e_x = weight*np.exp(x - np.max(x))[:,None]
    return e_x / e_x.sum(axis=0)

## E-Step

### Likelihood Function 

$f(\hat{\eta_{d}}) \propto  - \frac{1}{2} (\eta_d-\mu_d)^T \sum^{-1}(\eta_d-\mu_d)+\big(\sum_v c_{d,v} log\sum_k \beta_{k,v} e^{\eta_{d,k}}- W_d log \sum_k e^{\eta_{d,k}}\big)$

In [120]:
def lhood(eta, beta, doc_ct, mu, siginv, Ndoc):
    
    #formula
    part1 = .5*(eta-mu)@siginv@(eta-mu)
    part2 = np.sum(doc_ct * (eta_long.max() + np.log(np.exp(eta_long - eta_long.max())@beta)))-np.sum(doc_ct)*scipy.special.logsumexp(eta)
    
    # Move to trash eventually
    #part1 = np.log(expeta@beta@doc_ct) - Ndoc * np.log(np.sum(expeta)) #alternative based on c++ code
    #part1 = np.sum(doc_ct)*np.log(np.sum(expeta@beta)) - Ndoc * np.log(np.sum(expeta)) #alternative (based on paper)
    #part2 = - .5*(eta-mu)@siginv@(eta-mu)
    #out = .5*(eta-mu)@siginv@(eta-mu) + (np.sum(doc_ct)*np.log(np.sum(expeta@beta)) - Ndoc * np.log(np.sum(expeta)))
    
    out = part1 - part2
    
    return -out

In [121]:
lhood(eta, beta, doc_ct, mu, siginv, Ndoc)

-1859.5082847176204

### Gradient of the Likelihood

$\nabla f(\eta_d)_k = (\sum c_{d,v} \langle \phi_{d,v,k} \rangle) - W_d\theta_{d,k}-\big(\sum^{-1}(\eta_d-\mu_d)\big)_k$

$\theta_{d,k} = \frac{exp(\eta)}{\sum exp(\eta)}$

$\langle\phi_{d,k,v}\rangle = \frac{exp(\eta_{d,k}) \beta_{d,v,k}}{\sum_k exp(\eta_{d,k}) \beta_{d,v,k}} $

In [122]:
def grad(eta, beta, doc_ct, mu, siginv, Ndoc):
    
    #formula
    part1 = np.delete(np.sum(phi * doc_ct,axis=1) - np.sum(doc_ct)*theta, neta)
    part2 = siginv@(eta-mu)

    return part2-part1

In [123]:
grad(eta,beta, doc_ct, mu, siginv, Ndoc)

array([ 4.03875613,  3.42609557, -4.22852552,  6.04017365,  3.03373851,
       -6.22652965,  3.05837804, -2.50537515,  2.87384212])

# We minimize the negative log-likelihood which is equivalent to maximizing the likelihood

In [126]:
from scipy.optimize import check_grad

optimize.check_grad(lhood, grad, eta, beta, doc_ct, mu, siginv, Ndoc)

59.723380536159084

---

# Appendix

### Optimize multiple arguments at the same time using scipy requires de-vectorization of the arguments

In [None]:
def toVector(eta,beta_i, doc_ct, mu, siginv, Ndoc):
    assert eta.shape == (9,)
    assert beta_i.shape == (10, 1, 127)
    assert doc_ct.shape == (127,)
    assert mu.shape == (9,1)
    assert siginv.shape == (9,9)
    assert Ndoc.shape == ()
    return np.hstack([eta.flatten(),beta_i.flatten(), doc_ct.flatten(), mu.flatten(), siginv.flatten(),Ndoc.flatten()])

vec = toVector(eta,beta_i, doc_ct, mu, siginv, Ndoc)

def toObject(vec):
    eta = vec[0:9].reshape(9,)
    beta_i = vec[9:9+1270].reshape(10,1,127)
    doc_ct = vec[1279:1279+127].reshape(127,)
    mu = vec[1279+127:1279+127+9].reshape(9,1)
    siginv = vec[1279+136:1279+136+9*9].reshape(9,9)
    Ndoc = vec[1279+136+9*9:].reshape(())
    return eta,beta_i, doc_ct, mu, siginv, Ndoc

eta,beta_i, doc_ct, mu, siginv, Ndoc = toObject(vec)

def doOptimization(lhood, eta,beta_i, doc_ct, mu, siginv, Ndoc):
    def objective(vec): 
        eta,beta_i, doc_ct, mu, siginv, Ndoc = toObject(vec)
        return lhood(eta,beta_i, doc_ct, mu, siginv, Ndoc)
    def gradient(vec):
        eta,beta_i, doc_ct, mu, siginv, Ndoc = toObject(vec)
        return grad(eta,beta_i, doc_ct, mu, siginv, Ndoc)

    result = optimize.minimize(objective, toVector(eta,beta_i, doc_ct, mu, siginv, Ndoc))
    result.x = toObject(result.x) 
    return result

result = doOptimization(lhood, eta,beta_i, doc_ct, mu, siginv, Ndoc)