# LDA Training
<figure>
<img src="https://s2.loli.net/2022/02/28/X7vzOlDHJtP6UnM.png" alt="Pseudocode of the LDA training algorithm" style="width: 600px">
<figcaption>The LDA training algorithm from <a href="http://www.arbylon.net/publications/text-est.pdf">Parameter estimation for text analysis</a>.</figcaption>
</figure>

In [36]:
%load_ext cython
import json
import random
from collections import defaultdict, OrderedDict
from types import SimpleNamespace

import numpy as np
from tqdm.notebook import tqdm

The cython extension is already loaded. To reload it, use:
  %reload_ext cython


In [52]:
# === corpus loading ===
class NeurIPSCorpus:
    """Tokenised corpus read from a text file with one document per line.

    Words are whitespace-separated. Only the `max_num_words` most frequent
    words are kept, and each document is truncated to at most
    `max_doc_length` kept words. Word ids are assigned in order of first
    appearance among the kept words.

    Attributes:
        docs: list of documents, each a list of integer word ids.
        word2id: OrderedDict mapping word -> id.
        id2word: dict mapping id -> word.
        num_docs: number of documents (lines) read.
        num_topics: number of topics, stored as given.
        num_words: number of distinct word ids actually assigned.
        max_doc_length: the per-document length cap, stored as given.
    """

    def __init__(self, data_path, num_topics, max_num_words=10000, max_doc_length=1000,
                 state_dict_path="data/papers.json"):
        """Build the corpus and dump it as JSON.

        Args:
            data_path: path of the input text file (one document per line).
            num_topics: number of topics; stored and written to the JSON dump.
            max_num_words: keep only this many most frequent words.
            max_doc_length: maximum number of kept words per document.
            state_dict_path: where the JSON dump (`docs`, `word2id`,
                `num_topics`) is written.
        """
        self.docs = []
        self.word2id = OrderedDict()

        # Pass 1: count how often every word occurs in the whole file.
        word2cnt = defaultdict(int)
        with open(data_path) as fin:
            for line in fin:
                for word in line.strip().split():
                    word2cnt[word] += 1

        # Keep the most frequent words; sorted() is stable, so ties keep
        # their first-seen order.
        ranked = sorted(word2cnt.items(), key=lambda x: x[1], reverse=True)
        vocab = {word for word, _ in ranked[:max_num_words]}

        # Pass 2: encode each line as a list of word ids.
        with open(data_path) as fin:
            for line in fin:
                doc = []
                for word in line.strip().split():
                    if len(doc) >= max_doc_length:
                        break
                    if word not in vocab:
                        continue
                    if word not in self.word2id:
                        self.word2id[word] = len(self.word2id)
                    doc.append(self.word2id[word])
                self.docs.append(doc)

        self.num_docs = len(self.docs)
        self.num_topics = num_topics
        # May be smaller than max_num_words: a frequent word that only ever
        # occurs past the truncation point of its documents never gets an id.
        self.num_words = len(self.word2id)
        self.max_doc_length = max_doc_length
        self.id2word = {v: k for k, v in self.word2id.items()}
        print(
            "num_docs:", self.num_docs,
            "num_topics:", self.num_topics,
            "num_words:", self.num_words
        )

        state_dict = {
            "docs": self.docs,
            "word2id": self.word2id,
            "num_topics": self.num_topics,
        }
        # Context manager guarantees the dump is flushed and the handle closed.
        with open(state_dict_path, "w") as fout:
            json.dump(state_dict, fout)

# Build the corpus from the raw text file: 10 topics, at most 5000 distinct
# words and at most 1000 kept words per document.
corpus = NeurIPSCorpus(
    "data/papers.txt",
    num_topics=10,
    max_num_words=5000,
    max_doc_length=1000,
)

# Sampler settings. `alpha` has one entry per topic and `beta` one entry per
# word; every entry of both vectors equals 1 / num_topics.
hparams = SimpleNamespace(
    alpha=np.ones(corpus.num_topics, dtype=float) / corpus.num_topics,
    beta=np.ones(corpus.num_words, dtype=float) / corpus.num_topics,
    gibbs_sampling_max_iters=10,
)

num_docs: 7241 num_topics: 10 num_words: 4989


In [60]:
%%cython
import numpy as np
cimport numpy as np
import json
import random
from collections import defaultdict, OrderedDict
from types import SimpleNamespace
from tqdm.notebook import tqdm


# === corpus loading ===
cdef class NeurIPSCorpus:
    """Corpus restored from a JSON state dict.

    The JSON file must provide the keys "docs" (list of lists of word ids),
    "word2id" (word -> id mapping) and "num_topics". The remaining counters
    are derived from those values.
    """
    cdef list docs
    cdef dict word2id
    cdef int num_docs
    cdef int num_topics
    cdef int num_words
    cdef int max_doc_length  # length of the longest loaded document (0 if none)

    def __init__(self, state_dict_path):
        """Load the corpus from the JSON file at `state_dict_path`."""
        cdef dict state_dict
        cdef int longest = 0
        # Context manager closes the file even if parsing fails.
        with open(state_dict_path) as fin:
            state_dict = json.load(fin)
        self.docs = state_dict["docs"]
        self.word2id = state_dict["word2id"]
        self.num_topics = state_dict["num_topics"]
        self.num_docs = len(self.docs)
        self.num_words = len(self.word2id)
        # Derived from the data so it is available regardless of what the
        # JSON file contains beyond the three required keys.
        for doc in self.docs:
            if len(doc) > longest:
                longest = len(doc)
        self.max_doc_length = longest
        print(
            "num_docs:", self.num_docs, 
            "num_topics:", self.num_topics, 
            "num_words:", self.num_words
        )

cdef class Hyperparameters:
    """Typed container for the sampler settings.

    Holds two numpy arrays (`alpha`, `beta`) and the integer number of Gibbs
    sampling iterations, exactly as passed to the constructor.
    """
    cdef np.ndarray alpha
    cdef np.ndarray beta
    cdef int gibbs_sampling_max_iters

    def __init__(self, alpha, beta, gibbs_sampling_max_iters):
        """Store the three settings without copying or validating them."""
        self.alpha, self.beta = alpha, beta
        self.gibbs_sampling_max_iters = gibbs_sampling_max_iters

# Restore the corpus from its JSON dump.
cdef NeurIPSCorpus corpus = NeurIPSCorpus("data/papers.json")

# `alpha` has one entry per topic, `beta` one entry per word; every entry of
# both vectors equals 1 / num_topics. The sampler runs for 10 iterations.
cdef Hyperparameters hparams = Hyperparameters(
    alpha=np.ones(corpus.num_topics, dtype=float) / corpus.num_topics,
    beta=np.ones(corpus.num_words, dtype=float) / corpus.num_topics,
    gibbs_sampling_max_iters=10,
)

In [None]:

# === initialization ===
# This cell carries no %%cython magic, so it is written as plain Python
# (which is also valid Cython); `cdef` declarations would be a SyntaxError here.
print("Initializing...", flush=True)

# Seed both RNGs used below so that a re-run reproduces the same samples.
SEED = 0
random.seed(SEED)
np.random.seed(SEED)

# Width of the assignment table: the longest document actually present.
max_doc_length = max([len(doc) for doc in corpus.docs], default=0)

n_doc_topic = np.zeros([corpus.num_docs, corpus.num_topics], dtype=float) # n_m^(k)
n_topic_word = np.zeros([corpus.num_topics, corpus.num_words], dtype=float) # n_k^(t)
# n_k: tokens per topic, i.e. the row sums of n_topic_word, kept up to date
# incrementally so the sampler never has to re-sum the full matrix.
n_topic = np.zeros([corpus.num_topics], dtype=float)
# Topic of token j in document i; entries past a document's end stay 0 and are never read.
z_doc_word = np.zeros([corpus.num_docs, max_doc_length], dtype=int)

for doc_i in tqdm(range(corpus.num_docs)):
    for j, word_j in enumerate(corpus.docs[doc_i]):
        topic_ij = random.randint(0, corpus.num_topics - 1)
        n_doc_topic[doc_i, topic_ij] += 1
        n_topic_word[topic_ij, word_j] += 1
        n_topic[topic_ij] += 1
        z_doc_word[doc_i, j] = topic_ij

# === Gibbs sampling ===
print("Gibbs sampling...", flush=True)
topic_ids = np.arange(corpus.num_topics)
# sum_t beta_t is constant, so sum_t (n_k^(t) + beta_t) == n_k + beta_sum.
beta_sum = np.sum(hparams.beta)
for iteration in range(hparams.gibbs_sampling_max_iters):
    for doc_i in tqdm(range(corpus.num_docs)):
        for j, word_j in enumerate(corpus.docs[doc_i]):
            # remove the old assignment
            topic_ij = z_doc_word[doc_i, j]
            n_doc_topic[doc_i, topic_ij] -= 1
            n_topic_word[topic_ij, word_j] -= 1
            n_topic[topic_ij] -= 1
            # compute the new assignment
            p_doc_topic = (n_doc_topic[doc_i, :] + hparams.alpha) \
                        / np.sum(n_doc_topic[doc_i] + hparams.alpha)
            p_topic_word = (n_topic_word[:, word_j] + hparams.beta[word_j]) \
                        / (n_topic + beta_sum)
            p_topic = p_doc_topic * p_topic_word
            p_topic /= np.sum(p_topic)
            # record the new assignment
            new_topic_ij = np.random.choice(topic_ids, p=p_topic)
            n_doc_topic[doc_i, new_topic_ij] += 1
            n_topic_word[new_topic_ij, word_j] += 1
            n_topic[new_topic_ij] += 1
            z_doc_word[doc_i, j] = new_topic_ij

    # === Check convergence and read out parameters ===
    print(f"Iter [{iteration}] ===", flush=True)
    # theta: per-document topic distribution; phi: per-topic word distribution.
    theta = (n_doc_topic + hparams.alpha) / np.sum(n_doc_topic + hparams.alpha, axis=1, keepdims=True)
    phi = (n_topic_word + hparams.beta) / np.sum(n_topic_word + hparams.beta, axis=1, keepdims=True)
    print("theta:", theta, "phi:", phi, flush=True)