In [20]:
import numpy as np
import time
import pandas as pd
import numpy.random as random
import matplotlib.pyplot as plt
import math
from gensim import corpora

In [21]:
#load data
data = pd.read_csv('poliblogs2008.csv')

### Ingest corpus to create documents and vocab

In [22]:
# load corpus
documents = corpora.MmCorpus('corpus.mm')
dictionary = corpora.Dictionary.load('dictionary')

In [24]:
vocab = dictionary.token2id

In [25]:
def init_stm(documents, settings): 
      
    K = settings['dim']['K']
    V = settings['dim']['V']
    A = settings['dim']['A']
    N = settings['dim']['N']
    
    #Random initialization
    mu = np.array([0]*(K-1))[:,None]
    sigma = np.zeros(((K-1),(K-1)))
    diag = np.diagonal(sigma, 0)
    diag.setflags(write=True)
    diag.fill(20)
    beta = random.gamma(.1,1, V*K).reshape(K,V)
    beta = (beta / beta.sum(axis=1)[:,None])
    lambd = np.zeros((N, (K-1)))
    
    #turn beta into a list and assign it for each aspect
    beta = [beta, beta] # FOR A=2
    kappa_initialized = init_kappa(documents, K, V, A, interactions=settings['kappa']['interactions'])
    
    #create model object
    model = {'mu':mu, 'sigma':sigma, 'beta': beta, 'lambda': lambd, 'kappa':kappa_initialized}
    
    return(model)

def init_kappa(documents, K, V, A, interactions): 
    # read in documents and vocab
    flat_documents = [item for sublist in documents for item in sublist]
    m = []

    total_sum = sum(n for _, n in flat_documents)

    for elem in flat_documents: 
        m.append(elem[1] / total_sum)

    m = np.log(m) - np.log(np.mean(m)) #logit of m


    #Defining parameters
    aspectmod = A > 1 # if there is more than one topical content variable
    if(aspectmod):
        interact = interactions # allow for the choice to interact
    else:
        interact = FALSE

    #Create the parameters object
    parLength = K + A * aspectmod + (K*A)*interact

    #create covariates. one element per item in parameter list.
    #generation by type because its conceptually simpler
    if not aspectmod & interact:
        covar = {'k': np.arange(K),
             'a': np.repeat(np.nan, parLength), #why parLength? 
             'type': np.repeat(1, K)}

    if(aspectmod & interact == False):
        covar = {'k': np.append(np.arange(K), np.repeat(np.nan, A)),
                 'a': np.append(np.repeat(np.nan, K), np.arange(A)), 
                 'type': np.append(np.repeat(1, K), np.repeat(2, A))}      
    if(interact):
        covar = {'k': np.append(np.arange(K), np.append(np.repeat(np.nan, A), np.repeat(np.arange(K), A))),
                 'a': np.append(np.repeat(np.nan, K), np.append(np.arange(A), np.repeat(np.arange(A), K))), 
                 'type': np.append(np.repeat(1, K), np.append(np.repeat(2, A),  np.repeat(3,K*A)))}

    kappa = {'out': {'m':m,
                     'params' : np.tile(np.repeat(0,V), (parLength, 1)),
                     'covar' : covar
                     #'kappasum':, why rolling sum?
                    }
            }

    return(kappa['out'])

### Make Topic Matrix

In [26]:
prevalence = 'blog'
content = 'rating'

In [27]:
def makeTopMatrix(x, data=None):
    return(data.loc[:,x])

In [28]:
xmat = makeTopMatrix(content, data)

In [29]:
yvar = makeTopMatrix(content, data)
yvar = yvar.astype('category')
yvarlevels = set(yvar)
betaindex = yvar.cat.codes

In [30]:
A = len(set(betaindex))

# Setting control variables

In [31]:

K = 10 #settings.dim
V = len(dictionary) #settings.dim
N = len(documents) #settings.dim

interactions = True #settings.kappa
verbose = True

init_type = "Random" #settings.init
ngroups = 1 #settings.ngroups
max_em_its = 15 #settings.convergence
emtol = 1e-5 #settings.convergence

#gamma_prior=("Pooled","L1") # settings.gamma.prior
#sigma_prior=0 #settings.sigma.prior
#kappa_prior=("L1","Jeffreys") # settings.kappa.prior

#Initialize parameters

settings = {
    'dim':{
        'K': K, #number of topics
        'V' :V, #number of words
        'A' : A, #dimension of topical content
        'N' : N,
        'wcounts':V
    },
    'kappa':{
        'interactions':True,
        'fixedintercept': True,
        'contrats': False,
        'mstep': {'tol':0.01, 'maxit':5}},
    'tau':{
        'mode': np.nan,
        'tol': 1e-5,
        'enet':1,
        'nlambda':250,
        'lambda.min.ratio':.001,
        'ic.k':2,
        'maxit':1e4},
    'init':{
        'mode':init_type, 
        'nits':20,
        'burnin':25,
        'alpha':50/K,
        'eta':.01,
        's':.05,
        'p':3000},
    'convergence':{
        'max.em.its':max_em_its,
        'em.converge.thresh':emtol,
        'allow.neg.change':True,},
    'covariates':{
        'X':xmat,
        'betaindex':betaindex,
        'yvarlevels':yvarlevels,
        'formula': prevalence,},
    'gamma':{
        'mode':np.nan,
        'prior':np.nan,
        'enet':1, 
        'ic.k':2,
        'maxits':1000,},
    'sigma':{
        #'prior':sigma_prior,
        'ngroups':ngroups,},
}

In [32]:
def stm_control(documents, vocab, settings, model=None):
    
    ##########
    #Step 1: Initialize Parameters
    ##########
    
    #ngroups = settings$ngroups
    
    if model == None:
        print('Call init_stm()')
        model = init_stm(documents, settings) #initialize
    else: 
        model = model
        
    # unpack initialized model
    
    mu = model['mu']
    sigma = model['sigma']
    lambd = model['lambda'] 
    beta = {'beta': model['beta'],
            'kappa': model['kappa']}
    
    convergence = None
    
    #discard the old object
    del model
    
    betaindex = settings['covariates']['betaindex']
    
    #Pull out some book keeping elements
    ntokens = settings['dim']['wcounts']
    betaindex = settings['covariates']['betaindex']
    stopits = True
    
    ############
    #Step 2: Run EM
    ############
    
    while stopits == True:
        #####
        # Non-Blocked Updates
        #####
        t1 = time.process_time()

        #run the model
        #suffstats = estep(documents=documents, beta_index=betaindex,
        #                 update_mu=(!is.null(mu$gamma)),
        #                 beta$beta, lambda, mu$mu, sigma,
        #                 verbose)
        
        print("Completed E-Step ({} seconds). \n".format(math.floor((time.process_time()-t1))))


        #unpack variables 
        
        #t1 = process_time()
        #sigma_ss = suffstats['sigma']
        #lambd <- suffstats['lambd']
        #beta_ss <- suffstats['beta']
        #bound_ss <- suffstats['bound']
        #do the m-step
        #mu = opt_mu(lambd=lambd,
        #            mode=settings['gamma']['mode'],
        #            covar=settings['covariates']['X'],
        #            enet=settings['gamma']['enet'],ic.k=settings$gamma$ic.k,
        #            maxits=settings['gamma']['maxits'])
        #sigma = opt_sigma(nu=sigma_ss, lambd=lambd,
        #                     mu=mu['mu'], sigprior=settings['sigma']['prior'])
        #beta = opt_beta(beta_ss, beta['kappa'], settings)

In [33]:
stm_control(documents, vocab, settings)

Call init_stm()
Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seco


Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complet

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complete


Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complet

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complete


Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complet

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complete


Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complet

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complete


Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complet

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complete


Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complet

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complete


Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complet

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complete


Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complet

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Completed E-Step (0 seconds). 

Complete

KeyboardInterrupt: 

From **A Model of Text for Experimentation in the Social Sciences (Roberts et al.)**:
- $\beta_k$ is a V-dimensional probability mass function that controls the frequency according to which terms are generated from that topic.

From **The Structural Topic Model and Applied Social Science (Roberts et al.)**:
- 'For the nonconjugate logistic normal variables in the E-step we use a Laplace approximation.'

In [34]:
    
    model = init_stm(documents, settings) #initialize
    
        
    # unpack initialized model
    
    mu = model['mu']
    sigma = model['sigma']
    lambd = model['lambda'] 
    beta = {'beta': model['beta'],
            'kappa': model['kappa']}
    
    convergence = None
    
    #discard the old object
    del model
    
    betaindex = settings['covariates']['betaindex']
    
    #Pull out some book keeping elements
    ntokens = settings['dim']['wcounts']
    betaindex = settings['covariates']['betaindex']
    stopits = True

# Let's start by assuming its one beta and we may have arbitrarily subset the number of docs.
```python
> def estep(documents=documents, beta_index=betaindex,
          update_mu=True,
          beta['beta'], lambd, mu, sigma,
          verbose):
```

In [None]:
def e_step():
    #quickly define useful constants
    V = beta['beta'][0].shape[1] # ncol
    K = beta['beta'][0].shape[0] # nrow
    N = len(documents)
    A = len(beta['beta'])
    
    # 1) Initialize Sufficient Statistics 
    sigma_ss = np.zeros(((K-1),(K-1)))
    if A == 2: 
        beta_ss = [np.zeros((K,V)), np.zeros((K,V))]
    else:
        print('Error: Only two metadata columns allowed.')
    bound = np.repeat(0,N)
    lambd = np.repeat(0,N)
    
    # 2) Precalculate common components
    sigobj = np.linalg.cholesky(sigma)
    sigmaentropy = np.sum(np.log(np.diag(sigobj)))
    siginv = np.linalg.inv(sigobj).T*np.linalg.inv(sigobj)
    
    # 3) Document Scheduling
    # For right now we are just doing everything in serial.
    # the challenge with multicore is efficient scheduling while
    # maintaining a small dimension for the sufficient statistics.
    for i in range(N): 
        # update components
        doc = documents[i]
        words = [x for x,y in doc]
        aspect = betaindex[i]
        init = lambd[i]
        if update_mu: mu_i = mu[i]
        beta_i = beta['beta'][aspect][:,[words]]
        
        #infer the document with the logistic normal distribution
        doc_results = 
        
        # update sufficient statistics

In [151]:
def logisticnormalcpp(eta=init,
                      mu=mu_i,
                      siginv,
                      beta=beta_i,
                      doc,
                      sigmaentropy,
                      method="BFGS",
                      control=list(maxit=500),
                      hpbcpp=True):
    
    doc_ct = [y for x,y in doc] #count of words in document
    Ndoc = np.sum(doc_ct)
    
    
    
    

SyntaxError: non-default argument follows default argument (3686581265.py, line 3)

## Optimize

In [None]:
for(i in 1:N) {
    #update components
    doc <- documents[[i]]
    words <- doc[1,]
    aspect <- beta.index[i]
    init <- lambda.old[i,]
    if(update.mu) mu.i <- mu[,i]
    beta.i <- beta[[aspect]][,words,drop=FALSE]
    
    #infer the document
    doc.results <- logisticnormalcpp(eta=init, mu=mu.i, siginv=siginv, beta=beta.i, 
                                  doc=doc, sigmaentropy=sigmaentropy)
    
    # update sufficient statistics 
    sigma.ss <- sigma.ss + doc.results$eta$nu
    beta.ss[[aspect]][,words] <- doc.results$phis + beta.ss[[aspect]][,words]
    bound[i] <- doc.results$bound
    lambda[[i]] <- c(doc.results$eta$lambda)
    if(verbose && i%%ctevery==0) cat(".")
  }