In [1]:
import numpy as np
import time
import pandas as pd
import numpy.random as random
import matplotlib.pyplot as plt
import math
from gensim import corpora
from scipy import optimize
import scipy

In [2]:
#load data
data = pd.read_csv('poliblogs2008.csv')

### Ingest corpus to create documents and vocab

In [3]:
# load corpus
documents = corpora.MmCorpus('corpus.mm')
dictionary = corpora.Dictionary.load('dictionary')

In [4]:
vocab = dictionary.token2id

In [5]:
def init_stm(documents, settings): 
      
    K = settings['dim']['K']
    V = settings['dim']['V']
    A = settings['dim']['A']
    N = settings['dim']['N']
    
    #Random initialization
    mu = np.array([0]*(K-1))[:,None]
    sigma = np.zeros(((K-1),(K-1)))
    diag = np.diagonal(sigma, 0)
    diag.setflags(write=True)
    diag.fill(20)
    beta = random.gamma(.1,1, V*K).reshape(K,V)
    beta = (beta / beta.sum(axis=1)[:,None])
    lambd = np.zeros((N, (K-1)))
    
    #turn beta into a list and assign it for each aspect
    beta = [beta, beta] # FOR A=2
    kappa_initialized = init_kappa(documents, K, V, A, interactions=settings['kappa']['interactions'])
    
    #create model object
    model = {'mu':mu, 'sigma':sigma, 'beta': beta, 'lambda': lambd, 'kappa':kappa_initialized}
    
    return(model)

def init_kappa(documents, K, V, A, interactions): 
    # read in documents and vocab
    flat_documents = [item for sublist in documents for item in sublist]
    m = []

    total_sum = sum(n for _, n in flat_documents)

    for elem in flat_documents: 
        m.append(elem[1] / total_sum)

    m = np.log(m) - np.log(np.mean(m)) #logit of m


    #Defining parameters
    aspectmod = A > 1 # if there is more than one topical content variable
    if(aspectmod):
        interact = interactions # allow for the choice to interact
    else:
        interact = FALSE

    #Create the parameters object
    parLength = K + A * aspectmod + (K*A)*interact

    #create covariates. one element per item in parameter list.
    #generation by type because its conceptually simpler
    if not aspectmod & interact:
        covar = {'k': np.arange(K),
             'a': np.repeat(np.nan, parLength), #why parLength? 
             'type': np.repeat(1, K)}

    if(aspectmod & interact == False):
        covar = {'k': np.append(np.arange(K), np.repeat(np.nan, A)),
                 'a': np.append(np.repeat(np.nan, K), np.arange(A)), 
                 'type': np.append(np.repeat(1, K), np.repeat(2, A))}      
    if(interact):
        covar = {'k': np.append(np.arange(K), np.append(np.repeat(np.nan, A), np.repeat(np.arange(K), A))),
                 'a': np.append(np.repeat(np.nan, K), np.append(np.arange(A), np.repeat(np.arange(A), K))), 
                 'type': np.append(np.repeat(1, K), np.append(np.repeat(2, A),  np.repeat(3,K*A)))}

    kappa = {'out': {'m':m,
                     'params' : np.tile(np.repeat(0,V), (parLength, 1)),
                     'covar' : covar
                     #'kappasum':, why rolling sum?
                    }
            }

    return(kappa['out'])

In [251]:
def softmax(x):
    """Compute softmax values for each sets of scores in x."""
    e_x = np.exp(x - np.max(x))
    return e_x / e_x.sum(axis=0)

def softmax_weights(x, weight):
    """Compute softmax values for each sets of scores in x."""
    e_x = weight*np.exp(x - np.max(x))[:,None]
    return e_x / e_x.sum(axis=0)

def lhood(eta, mu, siginv, Ndoc):
    
    #formula
    part1 = np.sum(doc_ct * (eta_long.max() + np.log(np.exp(eta_long - eta_long.max())@beta_tuple)))-np.sum(doc_ct)*scipy.special.logsumexp(eta)
    part2 = .5*(eta-mu)@siginv@(eta-mu)
    
    out = part2 - part1
    
    return -out

def grad(eta, mu, siginv, Ndoc):

    #formula
    part1 = np.delete(np.sum(phi * doc_ct,axis=1) - np.sum(doc_ct)*theta, neta)
    part2 = siginv@(eta-mu)

    return part2 - part1

def logisticnormalcpp(
    eta,
    #eta_long,
    mu,
    siginv,
    beta_i,
    phi,
    doc_ct,
    Ndoc):
    
    # initial values
    #eta_long = np.insert(eta,-1,0)
    #beta = beta_i.reshape(10,beta_i.shape[2])
    #neta = len(eta)
    #theta = softmax(eta_long)
    #phi = softmax_weights(eta_long, beta)
    
    optim_out = optimize.fmin_bfgs(lhood,
                                   x0=eta,
                                   args=(beta_tuple, mu, siginv, Ndoc),
                                   fprime=grad)
    return optim_out

In [252]:
def e_step(documents, mu, sigma, lambd, beta):
    #quickly define useful constants
    V = beta['beta'][0].shape[1] # ncol
    K = beta['beta'][0].shape[0] # nrow
    N = len(documents)
    A = len(beta['beta'])
    
    # 1) Initialize Sufficient Statistics 
    sigma_ss = np.zeros(((K-1),(K-1)))
    if A == 2: 
        beta_ss = [np.zeros((K,V)), np.zeros((K,V))]
    else:
        print('Error: Only two metadata columns allowed.')
    bound = np.repeat(0,N)
    #lambd = np.repeat(0,N)
    
    # 2) Precalculate common components
    sigobj = np.linalg.cholesky(sigma)
    sigmaentropy = np.sum(np.log(np.diag(sigobj)))
    siginv = np.linalg.inv(sigobj).T*np.linalg.inv(sigobj)
    
    # 3) Document Scheduling
    # For right now we are just doing everything in serial.
    # the challenge with multicore is efficient scheduling while
    # maintaining a small dimension for the sufficient statistics.
    
    mu = mu.flatten()


        #set parameters for one document (i)
    for i in range(20):

        eta=lambd[i]
        neta = len(eta)
        eta_long = np.insert(eta,-1,0)

        doc = documents[i]
        words = [x for x,y in doc]
        aspect = betaindex[i]
        beta_i = beta['beta'][aspect][:,[words]]


        #set document specs
        doc_ct = np.array([y for x,y in doc]) #count of words in document
        Ndoc = np.sum(doc_ct)

        # initial values
        beta_tuple = beta_i.reshape(10,beta_i.shape[2])
        theta = softmax(eta_long)
        phi = softmax_weights(eta_long, beta_tuple)

        optimize.fmin_bfgs(lhood,x0=eta,
                           args=(mu, siginv, Ndoc),
                           fprime=grad)
        

        # update sufficient statistics

In [253]:
prevalence = 'blog'
content = 'rating'
num_topics = 10

# Setting control variables

In [254]:
def makeTopMatrix(x, data=None):
    return(data.loc[:,x])

xmat = makeTopMatrix(content, data)
yvar = xmat.astype('category')
yvarlevels = set(yvar)
betaindex = yvar.cat.codes
A = len(set(betaindex))

interactions = True #settings.kappa
verbose = True

init_type = "Random" #settings.init
ngroups = 1 #settings.ngroups
max_em_its = 15 #settings.convergence
emtol = 1e-5 #settings.convergence

#gamma_prior=("Pooled","L1") # settings.gamma.prior
#sigma_prior=0 #settings.sigma.prior
#kappa_prior=("L1","Jeffreys") # settings.kappa.prior

#Initialize parameters

settings = {
    'dim':{
        'K': num_topics, #number of topics
        'V' : len(dictionary), #number of words
        'A' : A, #dimension of topical content
        'N' : len(documents),
    },
    'kappa':{
        'interactions':True,
        'fixedintercept': True,
        'contrats': False,
        'mstep': {'tol':0.01, 'maxit':5}},
    'tau':{
        'mode': np.nan,
        'tol': 1e-5,
        'enet':1,
        'nlambda':250,
        'lambda.min.ratio':.001,
        'ic.k':2,
        'maxit':1e4},
    'init':{
        'mode':init_type, 
        'nits':20,
        'burnin':25,
        'alpha':50/num_topics,
        'eta':.01,
        's':.05,
        'p':3000},
    'convergence':{
        'max.em.its':max_em_its,
        'em.converge.thresh':emtol,
        'allow.neg.change':True,},
    'covariates':{
        'X':xmat,
        'betaindex':betaindex,
        'yvarlevels':yvarlevels,
        'formula': prevalence,},
    'gamma':{
        'mode':np.nan,
        'prior':np.nan,
        'enet':1, 
        'ic.k':2,
        'maxits':1000,},
    'sigma':{
        #'prior':sigma_prior,
        'ngroups':ngroups,},
}

In [255]:
def stm_control(documents, vocab, settings, model=None):
    
    ##########
    #Step 1: Initialize Parameters
    ##########
    
    #ngroups = settings$ngroups
    
    if model == None:
        print('Call init_stm()')
        model = init_stm(documents, settings) #initialize
    else: 
        model = model
        
    # unpack initialized model
    
    mu = model['mu']
    sigma = model['sigma']
    lambd = model['lambda'] 
    beta = {'beta': model['beta'],
            'kappa': model['kappa']}
    
    convergence = None
    
    #discard the old object
    del model
    
    betaindex = settings['covariates']['betaindex']
    
    #Pull out some book keeping elements
    betaindex = settings['covariates']['betaindex']
    
    ############
    #Step 2: Run EM
    ############
    
    t1 = time.process_time()

    #run the model
    e_step(documents, mu, sigma, lambd, beta)
        
    print("Completed E-Step ({} seconds). \n".format(math.floor((time.process_time()-t1))))


In [256]:
stm_control(documents, vocab, settings)

Call init_stm()
Optimization terminated successfully.
         Current function value: -21626.089291
         Iterations: 2
         Function evaluations: 5
         Gradient evaluations: 5
Optimization terminated successfully.
         Current function value: -21626.089291
         Iterations: 2
         Function evaluations: 5
         Gradient evaluations: 5
Optimization terminated successfully.
         Current function value: -21626.089291
         Iterations: 2
         Function evaluations: 5
         Gradient evaluations: 5
Optimization terminated successfully.
         Current function value: -21626.089291
         Iterations: 2
         Function evaluations: 5
         Gradient evaluations: 5
Optimization terminated successfully.
         Current function value: -21626.089291
         Iterations: 2
         Function evaluations: 5
         Gradient evaluations: 5
Optimization terminated successfully.
         Current function value: -21626.089291
         Iterations: 2
      

From **A Model of Text for Experimentation in the Social Sciences (Roberts et al.)**:
- $\beta_k$ is a V-dimensional probability mass function that controls the frequency according to which terms are generated from that topic.

From **The Structural Topic Model and Applied Social Science (Roberts et al.)**:
- 'For the nonconjugate logistic normal variables in the E-step we use a Laplace approximation.'

# Let's start by assuming its one beta and we may have arbitrarily subset the number of docs

In [53]:
model = init_stm(documents, settings) #initialize

# unpack initialized model

mu = model['mu']
sigma = model['sigma']
lambd = model['lambda'] 
beta = {'beta': model['beta'],
        'kappa': model['kappa']}

convergence = None

#discard the old object
del model

#Pull out some book keeping elements
betaindex = settings['covariates']['betaindex']

    
#set global parameter for sigma
sigobj = np.linalg.cholesky(sigma)
sigmaentropy = np.sum(np.log(np.diag(sigobj)))
siginv = np.linalg.inv(sigobj).T*np.linalg.inv(sigobj)


In [221]:


#set parameters for one document (i)
for i in range(20):
    
    eta=lambd[i]
    neta = len(eta)
    eta_long = np.insert(eta,-1,0)
    mu = mu_i.flatten()
    doc = documents[i]
    words = [x for x,y in doc]
    aspect = betaindex[i]
    beta_i = beta['beta'][aspect][:,[words]]


    #set document specs
    doc_ct = np.array([y for x,y in doc]) #count of words in document
    Ndoc = np.sum(doc_ct)

    # initial values
    beta_tuple = beta_i.reshape(10,beta_i.shape[2])
    theta = softmax(eta_long)
    phi = softmax_weights(eta_long, beta_tuple)

    optimize.fmin_bfgs(lhood,x0=eta,
                       args=(mu, siginv, Ndoc),
                       fprime=grad)

Optimization terminated successfully.
         Current function value: -25796.797538
         Iterations: 2
         Function evaluations: 5
         Gradient evaluations: 5
         Current function value: -2854.016890
         Iterations: 0
         Function evaluations: 99
         Gradient evaluations: 87
Optimization terminated successfully.
         Current function value: -49075.520507
         Iterations: 2
         Function evaluations: 6
         Gradient evaluations: 6
Optimization terminated successfully.
         Current function value: -19738.178390
         Iterations: 2
         Function evaluations: 5
         Gradient evaluations: 5
Optimization terminated successfully.
         Current function value: -43729.044260
         Iterations: 2
         Function evaluations: 6
         Gradient evaluations: 6
Optimization terminated successfully.
         Current function value: -88292.654804
         Iterations: 2
         Function evaluations: 6
         Gradient evaluati

## E-Step

### Likelihood Function 

$f(\hat{\eta_{d}}) \propto  - \frac{1}{2} (\eta_d-\mu_d)^T \sum^{-1}(\eta_d-\mu_d)+\big(\sum_v c_{d,v} log\sum_k \beta_{k,v} e^{\eta_{d,k}}- W_d log \sum_k e^{\eta_{d,k}}\big)$

### Gradient of the Likelihood

$\nabla f(\eta_d)_k = (\sum c_{d,v} \langle \phi_{d,v,k} \rangle) - W_d\theta_{d,k}-\big(\sum^{-1}(\eta_d-\mu_d)\big)_k$

with $\theta_{d,k} = \frac{exp(\eta)}{\sum exp(\eta)}$


and $\langle\phi_{d,k,v}\rangle = \frac{exp(\eta_{d,k}) \beta_{d,v,k}}{\sum_k exp(\eta_{d,k}) \beta_{d,v,k}} $