In [24]:
import numpy as np
import pandas as pd
from gensim import corpora

In [50]:
# Model dimensions and options.
# NOTE(review): V and N are hard-coded here; confirm they match the corpus
# and dictionary that are loaded further down.
K, V = 10, 100  # number of topics, number of words
A, N = 2, 20    # dimension of topical content, number of documents
interactions = True

### Ingest corpus to create documents and vocab

In [43]:
# Load the bag-of-words corpus stored in 'corpus.mm' and the saved gensim dictionary.
documents = corpora.MmCorpus('corpus.mm')
dictionary = corpora.Dictionary.load('dictionary')
# Vocabulary (id -> token mapping).
# The bare lookup below is deliberate, not dead code: gensim appears to build
# `id2token` lazily, so one item access is needed before the mapping is
# populated (NOTE(review): confirm against the gensim Dictionary docs).
dictionary[0]
vocab = dictionary.id2token

The function `init_kappa` below computes the baseline (log-)probabilities $m$, creates the covariates for the parameter list, and returns the initialised kappa.

## Construction of the topical content variable kappa

- kappa_t has dimension KxV 
- kappa_c has dimension AxV
- kappa_i has dimension AxKxV

The initialisation of kappa therefore depends on the number of topics (K), the number of content covariates (A)
and the vocabulary length (V).

To compute the baseline probabilities $m$, the per-document word counts need to be unlisted (flattened), summed per word, and then normalized. Hence $\textit{documents}$ needs to be transformed into a vector of counts indexed by word id, with length $V$.

In [163]:
def init_kappa(documents, K, V, A, interactions):
    """Initialise the topical-content parameter object kappa.

    Parameters
    ----------
    documents : iterable of iterables of (word_id, count) pairs
        Bag-of-words corpus. It is iterated exactly once.
    K : int
        Number of topics.
    V : int
        Vocabulary length.
    A : int
        Number of levels of the topical content covariate.
    interactions : bool
        Whether to add topic-by-content interaction terms. Ignored (treated
        as False) when ``A <= 1``, because there is no content model then.

    Returns
    -------
    dict
        ``'m'``: 1-D float array with one entry per word id, holding
        ``log(p_v) - log(mean(p))`` where ``p_v`` is the corpus-wide relative
        frequency of word ``v``. Its length is ``V`` (longer only if a word
        id >= V occurs). Words that never occur get ``-inf``.
        ``'params'``: float array of zeros with shape ``(parLength, V)``.
        ``'covar'``: dict with arrays ``'k'`` (topic index or NaN), ``'a'``
        (content index or NaN) and ``'type'`` (1 = topic, 2 = content,
        3 = interaction), one element per row of ``'params'``.

    Raises
    ------
    ValueError
        If the corpus contains no word counts.
    """
    # Flatten the corpus in a single pass (documents may be a one-shot iterable).
    word_ids = []
    word_counts = []
    for doc in documents:
        for word_id, count in doc:
            word_ids.append(int(word_id))
            word_counts.append(count)

    total_count = float(sum(word_counts))
    if not word_ids or total_count <= 0:
        raise ValueError("documents must contain at least one positive word count")

    # Sum the counts per word id so that m is indexed by vocabulary position
    # rather than holding one entry per (document, word) pair.
    counts_by_word = np.bincount(np.asarray(word_ids, dtype=np.intp),
                                 weights=np.asarray(word_counts, dtype=float),
                                 minlength=V)
    m = counts_by_word / total_count
    # Log-probability relative to the log of the mean probability.
    m = np.log(m) - np.log(np.mean(m))

    # Defining parameters
    aspectmod = A > 1  # a content model only exists with more than one level
    interact = bool(interactions) and aspectmod

    # One parameter row per topic, per content level (if any) and per
    # topic-content pair (if interactions are enabled).
    parLength = K + (A if aspectmod else 0) + (K * A if interact else 0)

    # Create covariates: one element per item in the parameter list.
    # The three cases are mutually exclusive, hence if/elif/else.
    topics = np.arange(K)
    aspects = np.arange(A)
    if interact:
        covar = {
            # tile (not repeat) the topics so that, together with the repeated
            # content levels below, every (topic, content) pair occurs once.
            'k': np.concatenate([topics, np.repeat(np.nan, A), np.tile(topics, A)]),
            'a': np.concatenate([np.repeat(np.nan, K), aspects, np.repeat(aspects, K)]),
            'type': np.concatenate([np.repeat(1, K), np.repeat(2, A), np.repeat(3, K * A)]),
        }
    elif aspectmod:
        covar = {
            'k': np.concatenate([topics, np.repeat(np.nan, A)]),
            'a': np.concatenate([np.repeat(np.nan, K), aspects]),
            'type': np.concatenate([np.repeat(1, K), np.repeat(2, A)]),
        }
    else:
        # Here parLength == K, so all three arrays have length K.
        covar = {
            'k': topics,
            'a': np.repeat(np.nan, K),
            'type': np.repeat(1, K),
        }

    # NOTE: 'kappasum' is not initialised here (open question in the original code).
    return {
        'm': m,
        # float zeros so later real-valued updates are not truncated to int
        'params': np.zeros((parLength, V)),
        'covar': covar,
    }

In [164]:
# Pass the dimensions by keyword: the signature is
# init_kappa(documents, K, V, A, interactions), so the positional order
# (K, A, V) would silently swap the vocabulary length and the content dimension.
kappa_initialized = init_kappa(documents, K=K, V=V, A=A, interactions=interactions)