In [1]:
import numpy as np
import scipy
from scipy.stats import dirichlet, multinomial
import pandas as pd

In [2]:
tau = [0.2, 0.2, 0.6]
alpha = [0.4, 5, 15] # topics
alpha_d = [1,1,3] #documents

In [3]:
dirichlet.pdf(tau, alpha)

0.2843831684937255

In [4]:
phi = dirichlet.rvs(alpha, random_state=47)
phi[0]


array([1.86009159e-04, 1.31354480e-01, 8.68459511e-01])

In [5]:
theta = dirichlet.rvs(alpha_d, random_state=47)
theta

array([[0.02274301, 0.69259478, 0.28466221]])

In [6]:
#Beta distribution using theta -> document distribution
scipy.stats.beta.fit(data=theta)

(0.3601319176763589,
 0.46372346908583995,
 0.010288130317208238,
 0.6823066529408928)

In [7]:
#documents = ["Cars are fast", "Trains are faster"]
words = ['cars','trains','bike','fast','slow','rapid','sluggish','trash','garbage','refuse']
V = 5
K = 3 #Vehicles, Speed, Junk

In [8]:
doc_len = [9,11,16]
taus_topic = [[.9,.05,.05],
              [.1,.8,.1],
              [.1,.1,.8]]

taus_word = [[0.2,0.2,0.2,0.05,0.05,0.05,0.05,0.05,0.05,0.05],
             [0.05,0.05,0.05,0.05,0.05,0.05,0.05,0.2,0.2,0.2],
             [0.05,0.05,0.05,0.2,0.2,0.15,0.15,0.05,0.05,0.05]
             ]
alpha_0 = 5
alpha_1 = 5
documents = []

for k in range(len(doc_len)):
    words_doc = []
    for i in range(doc_len[k]):
        topic_prob = dirichlet.rvs([alpha_0 * t for t in taus_topic[k]])
        topic = list(multinomial.rvs(1,topic_prob[0]))
        topic = topic.index(1)
        #print(topic_prob)
        word_prob = dirichlet.rvs([alpha_1*t for t in taus_word[topic]])
        word_ind = list(multinomial.rvs(1,word_prob[0]))
        word_ind = word_ind.index(1)
        words_doc.append(words[word_ind])
    documents.append(words_doc)

#documents

In [9]:
new_documents = [' '.join(d) for d in documents]
new_documents

['bike rapid trains trash trash trains trains trains fast',
 'trash cars refuse trash sluggish bike refuse cars fast slow trash',
 'sluggish sluggish rapid sluggish garbage trains slow trash slow rapid trash fast fast rapid fast trash']

## Running Gibbs sampler on small data set

In [10]:
raw_docs = [
    "eat turkey on turkey day holiday",
    "i like to eat cake on holiday",
    "turkey trot race on thanksgiving holiday",
    "snail race the turtle",
    "time travel space race",
    "movie on thanksgiving",
    "movie at air and space museum is cool movie",
    "aspiring movie star"
]

docs = [d.split() for d in raw_docs]

vocab = list(set(' '.join(raw_docs).split()))
vocab,len(vocab)

(['day',
  'turkey',
  'i',
  'the',
  'turtle',
  'star',
  'travel',
  'space',
  'on',
  'to',
  'holiday',
  'museum',
  'is',
  'cool',
  'thanksgiving',
  'cake',
  'aspiring',
  'movie',
  'eat',
  'trot',
  'and',
  'like',
  'race',
  'time',
  'at',
  'air',
  'snail'],
 27)

In [11]:
#create word ids
mapped_docs = []
for doc in docs:
    new_doc = []
    vectorized_doc = doc 
    for i in range(len(doc)):
        vectorized_doc[i] = vocab.index(doc[i])
    mapped_docs.append(vectorized_doc)

        
mapped_docs

[[18, 1, 8, 1, 0, 10],
 [2, 21, 9, 18, 15, 8, 10],
 [1, 19, 22, 8, 14, 10],
 [26, 22, 3, 4],
 [23, 6, 7, 22],
 [17, 8, 14],
 [17, 24, 25, 20, 7, 11, 12, 13, 17],
 [16, 17, 5]]

In [27]:
K=2

#topic-word matrix
tw_matrix = np.zeros((K,len(vocab)))

#topic assignment list
ta_list = [np.zeros((1,len(d)))[0] for d in docs]

#document-topic matrix
dt_matrix = np.zeros((len(docs),K))



In [28]:
#Randomly intitialize
np.random.seed(47)

for d in range(len(docs)):
    for w in range(len(mapped_docs[d])):
        #print(f'{d},{w}')
        ta_list[d][w] = np.random.randint(0,K)
        
        ti = int(ta_list[d][w])
        wi = int(mapped_docs[d][w])
        #print(f'{ti},{wi}')
        tw_matrix[ti, wi] = tw_matrix[ti][wi] + 1
    
    for t in range(K):
        #Number of words in document d with topic assignment t
        dt_matrix[d, t] = np.where(ta_list[d] == t)[0].shape[0]

In [29]:
pd.DataFrame(tw_matrix)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,17,18,19,20,21,22,23,24,25,26
0,1.0,2.0,1.0,0.0,1.0,0.0,0.0,2.0,1.0,0.0,...,2.0,1.0,0.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0
1,0.0,1.0,0.0,1.0,0.0,1.0,1.0,0.0,3.0,1.0,...,2.0,1.0,1.0,0.0,1.0,2.0,0.0,0.0,0.0,1.0


In [30]:
pd.DataFrame(dt_matrix)

Unnamed: 0,0,1
0,3.0,3.0
1,2.0,5.0
2,3.0,3.0
3,1.0,3.0
4,2.0,2.0
5,1.0,2.0
6,6.0,3.0
7,1.0,2.0


In [31]:
#calculating P(z_i|*)
alpha = 1
eta = 1

init_topic = int(ta_list[0][0])
word_id = docs[0][0]

#z_-i term, 
dt_matrix[0, init_topic] = dt_matrix[1, init_topic] - 1
tw_matrix[init_topic, word_id] = tw_matrix[init_topic, word_id] - 1

#word topic  mean
wt_means = (tw_matrix[:, word_id] + eta) / (tw_matrix.sum(axis=1) + len(vocab)*eta)
dt_means = (dt_matrix[0,:]+alpha) / (dt_matrix[0,:].sum() + K*alpha )

probs = wt_means*dt_means
wt_means,dt_means, probs,sum(probs)

(array([0.04347826, 0.02040816]),
 array([0.44444444, 0.55555556]),
 array([0.01932367, 0.01133787]),
 0.030661539978310164)

In [32]:
probs = probs/probs.sum()
probs

array([0.63022508, 0.36977492])

In [33]:
new_topic = np.random.choice(range(K),p=probs)
new_topic

0

In [34]:
init_topic

1

In [20]:
ta_list[0][0]

1.0