# LDA Training
<figure>
<img src="https://s2.loli.net/2022/02/28/X7vzOlDHJtP6UnM.png" alt="The LDA training algorithm" style="width: 600px">
<figcaption>The LDA training algorithm from <a href="http://www.arbylon.net/publications/text-est.pdf">Parameter estimation for text analysis</a></figcaption>
</figure>

In [3]:
import random
import numpy as np
from collections import defaultdict, OrderedDict
from types import SimpleNamespace
from tqdm.notebook import tqdm

In [33]:
# === corpus loading ===
class NeurIPSCorpus:
    """Integer-encoded documents read from a text file, one document per line.

    Only the first `max_num_docs` lines are read.  The vocabulary is limited to
    the `max_num_words` most frequent whitespace-separated tokens in those
    lines, and each encoded document holds at most `max_doc_length` word ids.

    Attributes set by the constructor:
        docs: list of documents, each a list of word ids.
        word2id / id2word: token <-> id maps; ids are handed out in the order
            tokens are first stored in a document.
        num_docs, num_words: sizes of `docs` and `word2id`.
        num_topics, max_doc_length: stored unchanged from the arguments.
    """

    def __init__(self, data_path, num_topics, max_num_docs=100, max_num_words=10000, max_doc_length=1000):
        self.docs = []
        self.word2id = OrderedDict()
        self.max_doc_length = max_doc_length

        # Pass 1: token frequencies over the lines that will be used.
        frequency = defaultdict(int)
        for tokens in self._read_token_lists(data_path, max_num_docs):
            for token in tokens:
                frequency[token] += 1

        # Rank by descending count.  The sort is stable, so tokens with equal
        # counts stay in first-seen order, which decides who survives the cut.
        ranked = sorted(frequency.items(), key=lambda pair: -pair[1])
        if max_num_words < len(ranked):
            del ranked[max_num_words:]
        vocabulary = {token for token, _ in ranked}

        # Pass 2: encode each line, dropping out-of-vocabulary tokens.
        for tokens in self._read_token_lists(data_path, max_num_docs):
            encoded = []
            for token in tokens:
                # The length cap counts stored ids, not raw tokens.
                if len(encoded) >= self.max_doc_length:
                    break
                if token in vocabulary:
                    # A token gets the next free id the first time it is stored.
                    encoded.append(self.word2id.setdefault(token, len(self.word2id)))
            self.docs.append(encoded)

        self.num_docs = len(self.docs)
        self.num_topics = num_topics
        self.num_words = len(self.word2id)
        self.id2word = {index: token for token, index in self.word2id.items()}
        print(
            "num_docs:", self.num_docs,
            "num_topics:", self.num_topics,
            "num_words:", self.num_words
        )

    @staticmethod
    def _read_token_lists(data_path, max_num_docs):
        """Yield the whitespace-split tokens of each of the first `max_num_docs` lines."""
        with open(data_path) as fin:
            for line_no, line in enumerate(fin):
                if line_no >= max_num_docs:
                    break
                yield line.strip().split()

# Load a capped slice of the corpus: at most 1000 documents, a vocabulary of
# at most 10000 words, and at most 200 word ids per document.
corpus = NeurIPSCorpus(
    data_path="data/papers.txt",
    num_topics=10,
    max_num_docs=1000,
    max_num_words=10000,
    max_doc_length=200,
)

# Symmetric Dirichlet hyperparameters: one alpha entry per topic and one beta
# entry per word, every entry equal to 1 / num_topics.
# NOTE(review): beta is scaled by num_topics rather than num_words -- confirm
# this is the intended prior strength.
hparams = SimpleNamespace(
    alpha=np.full(corpus.num_topics, 1.0 / corpus.num_topics),
    beta=np.full(corpus.num_words, 1.0 / corpus.num_topics),
    gibbs_sampling_max_iters=1000,
)

num_docs: 1000 num_topics: 10 num_words: 7882


In [34]:
# === initialization ===
print("Initializing...", flush=True)

# Seed both generators this cell draws from (`random` for the initial topics,
# `np.random` for the sampler) so that re-running the cell reproduces the same
# chain and the same printed topics.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Count tables, following the notation of the referenced algorithm:
#   n_doc_topic[m, k]  -- tokens of document m currently assigned to topic k
#   n_topic_word[k, t] -- occurrences of word t currently assigned to topic k
# z_doc_word[m, j] holds the topic of the j-th token of document m; documents
# are capped at corpus.max_doc_length ids, so shorter rows keep trailing zeros
# that are never read.
n_doc_topic = np.zeros([corpus.num_docs, corpus.num_topics], dtype=float) # n_m^(k)
n_topic_word = np.zeros([corpus.num_topics, corpus.num_words], dtype=float) # n_k^(t)
z_doc_word = np.zeros([corpus.num_docs, corpus.max_doc_length], dtype=int)

# Give every token a uniformly random topic and record it in the counts.
for doc_i in range(corpus.num_docs):
    for j, word_j in enumerate(corpus.docs[doc_i]):
        topic_ij = random.randint(0, corpus.num_topics - 1)  # inclusive bounds
        n_doc_topic[doc_i, topic_ij] += 1
        n_topic_word[topic_ij, word_j] += 1
        z_doc_word[doc_i, j] = topic_ij

# === Gibbs sampling ===
print("Gibbs sampling...", flush=True)

# Per-topic normaliser of the word term, sum_t (n_k^(t) + beta_t), split into a
# running token total per topic plus a constant.  Keeping the total up to date
# costs O(1) per token instead of re-summing the whole K x V table each time.
beta_sum = np.sum(hparams.beta)
n_topic = np.sum(n_topic_word, axis=1)  # n_k: tokens currently assigned to each topic
topic_ids = np.arange(corpus.num_topics)
last_iteration = hparams.gibbs_sampling_max_iters - 1

for iteration in tqdm(range(hparams.gibbs_sampling_max_iters)):
    for doc_i in range(corpus.num_docs):
        for j, word_j in enumerate(corpus.docs[doc_i]):
            # remove the old assignment
            topic_ij = z_doc_word[doc_i, j]
            n_doc_topic[doc_i, topic_ij] -= 1
            n_topic_word[topic_ij, word_j] -= 1
            n_topic[topic_ij] -= 1
            # compute the new assignment: p(k) is proportional to
            #   (n_m^(k) + alpha_k) * (n_k^(t) + beta_t) / (n_k + sum(beta)).
            # The document-side denominator is the same for every topic, so it
            # cancels in the normalisation below and is not computed.
            p_topic = (n_doc_topic[doc_i] + hparams.alpha) \
                    * (n_topic_word[:, word_j] + hparams.beta[word_j]) \
                    / (n_topic + beta_sum)
            p_topic /= np.sum(p_topic)
            # record the new assignment
            new_topic_ij = np.random.choice(topic_ids, p=p_topic)
            n_doc_topic[doc_i, new_topic_ij] += 1
            n_topic_word[new_topic_ij, word_j] += 1
            n_topic[new_topic_ij] += 1
            z_doc_word[doc_i, j] = new_topic_ij

    # Report every 10th sweep, and also after the final sweep so that `theta`
    # and `phi` left behind by this cell describe the finished chain.
    if iteration % 10 == 0 or iteration == last_iteration:
        print(f"Iter [{iteration}]===")
        # === Check convergence and read out parameters ===
        # theta[m, k]: smoothed topic proportions of document m.
        # phi[k, t]:   smoothed word distribution of topic k.
        theta = (n_doc_topic + hparams.alpha) / np.sum(n_doc_topic + hparams.alpha, axis=1, keepdims=True)
        phi = (n_topic_word + hparams.beta) / np.sum(n_topic_word + hparams.beta, axis=1, keepdims=True)

        for topic in range(corpus.num_topics):
            # Ten most probable word ids of this topic, most probable first.
            top_words = np.argsort(phi[topic])[::-1][:10]
            top_probs = phi[topic, top_words]
            top_words = [corpus.id2word[word] for word in top_words]
            print(f"Topic {topic}:", top_words)

Initializing...
Gibbs sampling...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=1000.0), HTML(value='')))

Iter [0]===
Topic 0: ['model', 'learning', 'method', 'problem', 'network', 'algorithm', 'abstract', 'datum', 'neural', 'system']
Topic 1: ['model', 'learning', 'algorithm', 'network', 'neural', 'system', 'function', 'information', 'approach', 'base']
Topic 2: ['network', 'model', 'problem', 'algorithm', 'datum', 'learning', 'neural', 'abstract', 'function', 'university']
Topic 3: ['model', 'network', 'neural', 'system', 'algorithm', 'function', 'problem', 'learn', 'learning', 'datum']
Topic 4: ['network', 'model', 'algorithm', 'neural', 'problem', 'function', 'datum', 'learning', 'information', 'e']
Topic 5: ['model', 'network', 'algorithm', 'neural', 'learning', 'input', 'abstract', 'introduction', 'function', 'set']
Topic 6: ['model', 'network', 'problem', 'algorithm', 'system', 'neural', 'base', 'learn', 'approach', 'set']
Topic 7: ['model', 'network', 'method', 'learning', 'neural', 'system', 'function', 'datum', 'algorithm', 'problem']
Topic 8: ['model', 'problem', 'network', 'alg