In [88]:
import numpy as np
import time
import pandas as pd
import numpy.random as random
import matplotlib.pyplot as plt
import math
from gensim import corpora

In [7]:
# Read the poliblogs2008 CSV (path relative to the notebook's working directory) into a DataFrame.
data = pd.read_csv(filepath_or_buffer='poliblogs2008.csv')

In [8]:
# Load the serialized Matrix Market corpus and the saved gensim dictionary
# (both paths are relative to the notebook's working directory).
documents = corpora.MmCorpus('corpus.mm')
dictionary = corpora.Dictionary.load('dictionary')
# Vocabulary: the dictionary's id -> token mapping.
# NOTE(review): the bare lookup below is not displayed (it is not the cell's last
# expression); presumably it is here to make gensim populate the lazily built
# `id2token` mapping before it is read on the next line -- confirm against the
# gensim Dictionary docs, and do not remove or reorder these two statements.
dictionary[0]
vocab = dictionary.id2token

### Ingest corpus to create documents and vocab

In [9]:
def init_stm(documents, settings):
    """Randomly initialise the STM model parameters.

    Parameters
    ----------
    documents : iterable
        Corpus; only forwarded to ``init_kappa``.
    settings : dict
        Must provide ``settings['dim']`` with the integer entries ``'K'``
        (topics), ``'V'`` (vocabulary size), ``'A'`` (topical-content levels)
        and ``'N'`` (documents), plus ``settings['kappa']['interactions']``.

    Returns
    -------
    dict
        ``'mu'``     : zeros of shape (K-1,)
        ``'sigma'``  : (K-1, K-1) matrix with 20 on the diagonal, 0 elsewhere
        ``'beta'``   : list of ``A`` independent (K, V) arrays whose rows sum to 1
        ``'lambda'`` : zeros of shape (N, K-1)
        ``'kappa'``  : whatever ``init_kappa`` returns
    """
    dims = settings['dim']
    K, V, A, N = dims['K'], dims['V'], dims['A'], dims['N']

    # Random initialization
    mu = np.zeros(K - 1)
    # Diagonal prior covariance, built directly instead of un-protecting the
    # read-only view returned by np.diagonal.
    sigma = 20.0 * np.eye(K - 1)

    # One gamma draw per (topic, word), normalised so every topic row sums to 1.
    beta = random.gamma(.1, 1, V * K).reshape(K, V)
    beta = beta / beta.sum(axis=1, keepdims=True)
    lambd = np.zeros((N, K - 1))

    # One copy of the topic-word matrix per aspect.  np.repeat(..., A) without
    # an axis would flatten the matrix and repeat single elements instead, so
    # the copies are built explicitly (and are independent of each other).
    beta = [beta.copy() for _ in range(int(A))]
    kappa_initialized = init_kappa(documents, K, V, A,
                                   interactions=settings['kappa']['interactions'])

    # create model object
    model = {'mu': mu, 'sigma': sigma, 'beta': beta,
             'lambda': lambd, 'kappa': kappa_initialized}

    return model

def init_kappa(documents, K, V, A, interactions):
    """Initialise the kappa (topical content) parameter container.

    Parameters
    ----------
    documents : iterable of iterables of (word_id, count) pairs
        Bag-of-words corpus; ``word_id`` must lie in ``range(V)``.
    K : int
        Number of topics.
    V : int
        Vocabulary size.
    A : int
        Number of levels of the topical-content covariate.
    interactions : bool
        Whether topic x aspect interaction parameters are requested; ignored
        (treated as False) unless ``A > 1``.

    Returns
    -------
    dict
        ``'m'``      : length-V array, log of each word's corpus frequency minus
                       the log of the mean frequency (``-inf`` for a word that
                       never occurs; numpy emits a RuntimeWarning in that case)
        ``'params'`` : float zeros of shape (parLength, V)
        ``'covar'``  : dict of arrays ``'k'``, ``'a'``, ``'type'`` with one
                       entry per parameter row (NaN = not applicable;
                       type 1 = topic, 2 = aspect, 3 = topic x aspect)

    Raises
    ------
    ValueError
        If the corpus contains no tokens.
    IndexError
        If a word id is outside ``range(V)``.
    """
    # Baseline word frequency: sum the counts of every vocabulary entry over
    # the whole corpus so that `m` has exactly one value per word.
    word_counts = np.zeros(V)
    for doc in documents:
        for word_id, count in doc:
            word_counts[int(word_id)] += count

    total = word_counts.sum()
    if total <= 0:
        raise ValueError("documents contain no tokens; cannot initialise kappa")

    m = word_counts / total
    m = np.log(m) - np.log(np.mean(m))  # log frequency centred on the log mean

    # Defining parameters
    aspectmod = bool(A > 1)  # more than one topical content level
    # Interactions are only meaningful when there are several aspects.
    interact = bool(interactions) if aspectmod else False

    # Number of parameter rows: K topics (+ A aspects) (+ K*A interactions).
    parLength = K + A * int(aspectmod) + (K * A) * int(interact)

    # Create covariates: one element per row of the parameter list.
    topic_ids = np.arange(K)
    aspect_ids = np.arange(A)
    if not aspectmod:
        covar = {'k': topic_ids,
                 'a': np.repeat(np.nan, parLength),
                 'type': np.repeat(1, K)}
    elif not interact:
        covar = {'k': np.concatenate([topic_ids, np.repeat(np.nan, A)]),
                 'a': np.concatenate([np.repeat(np.nan, K), aspect_ids]),
                 'type': np.concatenate([np.repeat(1, K), np.repeat(2, A)])}
    else:
        # Interaction rows must enumerate every (topic, aspect) pair once:
        # topics cycle fastest (tile) while each aspect is held for K rows.
        covar = {'k': np.concatenate([topic_ids, np.repeat(np.nan, A),
                                      np.tile(topic_ids, A)]),
                 'a': np.concatenate([np.repeat(np.nan, K), aspect_ids,
                                      np.repeat(aspect_ids, K)]),
                 'type': np.concatenate([np.repeat(1, K), np.repeat(2, A),
                                         np.repeat(3, K * A)])}

    return {'m': m,
            # Float zeros so later real-valued updates are not truncated.
            'params': np.zeros((parLength, V)),
            'covar': covar
            # 'kappasum' (running sum of kappa starting from m) is not built yet
            }

### Make Topic Matrix

In [17]:
# Covariate names: `content` is used below as a column label of `data`;
# `prevalence` is stored in the settings as the prevalence 'formula'.
prevalence, content = 'blog', 'rating'

In [18]:
def makeTopMatrix(x, data=None):
    """Return the column(s) labelled ``x`` from ``data``.

    Parameters
    ----------
    x : label or list of labels
        Column selector handed to ``data.loc[:, x]``.
    data : pandas.DataFrame
        Table to select from.  Required despite the ``None`` default, which
        is kept only so existing call signatures stay valid.

    Returns
    -------
    The result of ``data.loc[:, x]`` (all rows of the selected column(s)).

    Raises
    ------
    ValueError
        If ``data`` is None.
    """
    if data is None:
        # Fail with a clear message instead of "'NoneType' has no attribute 'loc'".
        raise ValueError("makeTopMatrix requires a `data` table, got None")
    return data.loc[:, x]

In [19]:
# Covariate column stored later as settings['covariates']['X'].
# NOTE(review): this selects the `content` column, the same one used for `yvar`;
# `prevalence` is defined above but never used for a selection -- confirm intent.
xmat = makeTopMatrix(x=content, data=data)

In [22]:
# Topical-content covariate as a pandas categorical.
yvar = makeTopMatrix(content, data).astype('category')
# Integer code of each document's level (indexes the per-aspect betas).
betaindex = yvar.cat.codes
# Distinct values observed in the covariate (unordered set).
yvarlevels = set(yvar)

In [23]:
# Number of distinct content-covariate codes, i.e. the dimension of topical content.
A = len(betaindex.unique())

# Setting control variables

In [53]:

# --- Control variables (each one feeds the settings section named on its line) ---
K = 10  # settings.dim: number of topics
V = len(vocab)  # settings.dim: vocabulary size
N = len(documents)  # settings.dim: number of documents

interactions = True  # settings.kappa: allow topic x aspect interactions
verbose = True

init_type = "Random"  # settings.init
ngroups = 1  # settings.ngroups
max_em_its = 15  # settings.convergence
emtol = 1e-5  # settings.convergence

#gamma_prior=("Pooled","L1") # settings.gamma.prior
#sigma_prior=0 #settings.sigma.prior
#kappa_prior=("L1","Jeffreys") # settings.kappa.prior

# Initialize parameters

settings = {
    'dim': {
        'K': K,  # number of topics
        'V': V,  # number of words
        'A': A,  # dimension of topical content
        'N': N,
        # NOTE(review): stm_control reads this entry as `ntokens`, yet it holds
        # the vocabulary size V -- confirm what is meant to be stored here.
        'wcounts': V
    },
    'kappa': {
        # Wired to the control variable above (was a hard-coded True, so
        # changing `interactions` had no effect).
        'interactions': interactions,
        'fixedintercept': True,
        # NOTE(review): key looks like a typo of 'contrasts'; left unchanged
        # because code outside this cell may read it under this spelling.
        'contrats': False,
        'mstep': {'tol': 0.01, 'maxit': 5}},
    'tau': {
        'mode': np.nan,
        'tol': 1e-5,
        'enet': 1,
        'nlambda': 250,
        'lambda.min.ratio': .001,
        'ic.k': 2,
        'maxit': 1e4},
    'init': {
        'mode': init_type,
        'nits': 20,
        'burnin': 25,
        'alpha': 50/K,
        'eta': .01,
        's': .05,
        'p': 3000},
    'convergence': {
        'max.em.its': max_em_its,
        'em.converge.thresh': emtol,
        'allow.neg.change': True,},
    'covariates': {
        'X': xmat,
        'betaindex': betaindex,
        'yvarlevels': yvarlevels,
        'formula': prevalence,},
    'gamma': {
        'mode': np.nan,
        'prior': np.nan,
        'enet': 1,
        'ic.k': 2,
        'maxits': 1000,},
    'sigma': {
        #'prior':sigma_prior,
        'ngroups': ngroups,},
}

In [99]:
def stm_control(documents, vocab, settings, model=None):
    """Drive the EM loop of the STM fit (scaffold: E- and M-steps are still TODO).

    Parameters
    ----------
    documents : iterable
        Corpus; passed to ``init_stm`` when no model is supplied.
    vocab : mapping
        Vocabulary; currently unused by this function.
    settings : dict
        Reads ``settings['dim']['wcounts']``,
        ``settings['covariates']['betaindex']`` and
        ``settings['convergence']['max.em.its']``; forwarded to ``init_stm``.
    model : mapping, optional
        Existing model with keys ``'mu'``, ``'sigma'``, ``'lambda'``,
        ``'beta'`` and ``'kappa'``.  A fresh one is initialised when None.

    Returns
    -------
    None
        Nothing is returned yet; one timing line is printed per EM iteration.
    """
    ##########
    # Step 1: Initialize Parameters
    ##########

    # Identity check: `== None` would be evaluated element-wise (and then fail
    # in the `if`) for array-like model containers.
    if model is None:
        model = init_stm(documents, settings)  # initialize

    # unpack initialized model
    mu = model['mu']
    sigma = model['sigma']
    lambd = model['lambda']
    beta = {'beta': model['beta'],
            'kappa': model['kappa']}

    convergence = None

    # drop the local reference to the old object
    del model

    # Pull out some book keeping elements
    ntokens = settings['dim']['wcounts']
    betaindex = settings['covariates']['betaindex']
    max_em_its = settings['convergence']['max.em.its']

    ############
    # Step 2: Run EM
    ############

    # The flag used to start as True, so the loop below never executed.  Until
    # a real convergence check sets `stopits`, the iteration cap is what
    # guarantees termination.
    stopits = False
    its = 0
    while not stopits and its < max_em_its:
        #####
        # Non-Blocked Updates
        #####
        t1 = time.process_time()

        # TODO: run the E-step
        #suffstats = estep(documents=documents, beta_index=betaindex,
        #                 update_mu=(!is.null(mu$gamma)),
        #                 beta$beta, lambda, mu$mu, sigma,
        #                 verbose)

        print("Completed E-Step ({} seconds). \n".format(math.floor((time.process_time()-t1))))

        # TODO: unpack the sufficient statistics and run the M-step
        #t1 = process_time()
        #sigma_ss = suffstats['sigma']
        #lambd <- suffstats['lambd']
        #beta_ss <- suffstats['beta']
        #bound_ss <- suffstats['bound']
        #mu = opt_mu(lambd=lambd,
        #            mode=settings['gamma']['mode'],
        #            covar=settings['covariates']['X'],
        #            enet=settings['gamma']['enet'],ic.k=settings$gamma$ic.k,
        #            maxits=settings['gamma']['maxits'])
        #sigma = opt_sigma(nu=sigma_ss, lambd=lambd,
        #                     mu=mu['mu'], sigprior=settings['sigma']['prior'])
        #beta = opt_beta(beta_ss, beta['kappa'], settings)

        its += 1

In [100]:
# Run the control loop on the loaded corpus; no model is passed, so one is initialised.
stm_control(documents=documents, vocab=vocab, settings=settings)