# LDA Training
<figure>
<div>
<img src=https://s2.loli.net/2022/02/28/X7vzOlDHJtP6UnM.png width="600">
</div>
<figcaption>The LDA training algorithm from <a href=http://www.arbylon.net/publications/text-est.pdf>Parameter estimation for text analysis</a></figcaption>
</figure>

In [2]:
import random
import numpy as np
from collections import defaultdict, OrderedDict
from types import SimpleNamespace
from tqdm.notebook import tqdm
from visualize import visualize_topic_word

In [6]:
# === corpus loading ===
class NeurIPSCorpus:
    def __init__(self, data_path, num_topics, mode, start_doc_idx=0, max_num_docs=100, max_num_words=10000, max_doc_length=1000, train_corpus=None):
        self.docs = []
        self.word2id = OrderedDict()
        self.max_doc_length = max_doc_length
        self.mode = mode

        # only keep the most frequent words
        if self.mode == "train":
            word2cnt = defaultdict(int)
            with open(data_path) as fin:
                for i, line in enumerate(list(fin)[::-1]):  # use more recent papers
                    if i >= max_num_docs: break
                    for word in line.strip().split():
                        word2cnt[word] += 1
            
            word2cnt = sorted(list(word2cnt.items()), key=lambda x: x[1], reverse=True)
            if len(word2cnt) > max_num_words:
                word2cnt = word2cnt[:max_num_words]
            word2cnt = dict(word2cnt)

        # read in the doc and convert words to integers
        with open(data_path) as fin:
            for i, line in enumerate(list(fin)[::-1]):   # use more recent papers
                if i < start_doc_idx: continue
                if i - start_doc_idx >= max_num_docs: break
                doc = []
                for word in line.strip().split():
                    if len(doc) >= self.max_doc_length: break
                    if self.mode == "train":
                        if word not in word2cnt: continue
                        if word not in self.word2id: 
                            self.word2id[word] = len(self.word2id)
                        doc.append(self.word2id[word])
                    else:
                        if word not in train_corpus.word2id: continue
                        doc.append(train_corpus.word2id[word])
                self.docs.append(doc)

        self.num_docs = len(self.docs)
        self.num_topics = num_topics
        self.num_words = len(self.word2id)
        self.id2word = {v: k for k, v in self.word2id.items()}
        print(
            "num_docs:", self.num_docs, 
            "num_topics:", self.num_topics, 
            "num_words:", self.num_words
        )

corpus = NeurIPSCorpus(
    data_path="data/papers.txt", 
    mode="train",
    num_topics=10,
    start_doc_idx=0,
    max_num_docs=1000,
    max_num_words=10000,
    max_doc_length=200,
)
hparams = SimpleNamespace(
    alpha=np.ones([corpus.num_topics], dtype=float) / corpus.num_topics,
    beta = np.ones([corpus.num_words], dtype=float) / corpus.num_topics,
    gibbs_sampling_max_iters=500,
)

num_docs: 1000 num_topics: 10 num_words: 7794


In [4]:
# === initialization ===
print("Initializing...", flush=True)
n_doc_topic = np.zeros([corpus.num_docs, corpus.num_topics], dtype=float) # n_m^(k)
n_topic_word = np.zeros([corpus.num_topics, corpus.num_words], dtype=float) # n_k^(t)
z_doc_word = np.zeros([corpus.num_docs, corpus.max_doc_length], dtype=int)

for doc_i in range(corpus.num_docs):
    for j, word_j in enumerate(corpus.docs[doc_i]):
        topic_ij = random.randint(0, corpus.num_topics - 1)
        n_doc_topic[doc_i, topic_ij] += 1
        n_topic_word[topic_ij, word_j] += 1
        z_doc_word[doc_i, j] = topic_ij

# === Gibbs sampling ===
print("Gibbs sampling...", flush=True)
for iteration in tqdm(range(hparams.gibbs_sampling_max_iters)):
    for doc_i in range(corpus.num_docs):
        for j, word_j in enumerate(corpus.docs[doc_i]):
            # remove the old assignment
            topic_ij = z_doc_word[doc_i, j]
            n_doc_topic[doc_i, topic_ij] -= 1
            n_topic_word[topic_ij, word_j] -= 1
            # compute the new assignment
            p_doc_topic = (n_doc_topic[doc_i, :] + hparams.alpha) \
                        / np.sum(n_doc_topic[doc_i] + hparams.alpha)
            p_topic_word = (n_topic_word[:, word_j] + hparams.beta[word_j]) \
                        / np.sum(n_topic_word + hparams.beta, axis=1)
            p_topic = p_doc_topic * p_topic_word
            p_topic /= np.sum(p_topic)
            # record the new assignment
            new_topic_ij = np.random.choice(np.arange(corpus.num_topics), p=p_topic)
            n_doc_topic[doc_i, new_topic_ij] += 1
            n_topic_word[new_topic_ij, word_j] += 1
            z_doc_word[doc_i, j] = new_topic_ij

    if iteration % 50 == 0:
        print(f"Iter [{iteration}]===")
        # === Check convergence and read out parameters ===
        theta = (n_doc_topic + hparams.alpha) / np.sum(n_doc_topic + hparams.alpha, axis=1, keepdims=True)
        phi = (n_topic_word + hparams.beta) / np.sum(n_topic_word + hparams.beta, axis=1, keepdims=True)

        all_top_words = []
        all_top_probs = []
        for topic in range(corpus.num_topics):
            top_words = np.argsort(phi[topic])[::-1][:10]
            top_probs = phi[topic, top_words]
            top_words = [corpus.id2word[word] for word in top_words]
            all_top_words.append(top_words)
            all_top_probs.append(top_probs)
            print(f"Topic {topic}:", top_words)
        visualize_topic_word(all_top_words, all_top_probs)

Initializing...
Gibbs sampling...


HBox(children=(HTML(value=''), FloatProgress(value=0.0, max=500.0), HTML(value='')))

Iter [0]===
Topic 0: ['model', 'network', 'learning', 'problem', 'learn', 'function', 'algorithm', 'university', 'datum', 'method']
Topic 1: ['network', 'learning', 'model', 'problem', 'learn', 'method', 'algorithm', 'datum', 'introduction', 'task']
Topic 2: ['model', 'network', 'learning', 'datum', 'problem', 'algorithm', 'neural', 'university', 'edu', 'function']
Topic 3: ['model', 'problem', 'learn', 'network', 'learning', 'university', 'algorithm', 'abstract', 'datum', 'edu']
Topic 4: ['model', 'network', 'learning', 'problem', 'university', 'algorithm', 'learn', 'abstract', 'function', 'method']
Topic 5: ['model', 'neural', 'network', 'function', 'algorithm', 'problem', 'learning', 'learn', 'university', 'edu']
Topic 6: ['model', 'datum', 'network', 'neural', 'learning', 'method', 'university', 'learn', 'algorithm', 'problem']
Topic 7: ['network', 'model', 'algorithm', 'problem', 'learning', 'datum', 'function', 'university', 'introduction', 'task']
Topic 8: ['network', 'model', '

Iter [50]===
Topic 0: ['model', 'task', 'learning', 'deep', 'learn', 'network', 'neural', 'image', 'train', 'method']
Topic 1: ['datum', 'algorithm', 'learning', 'learn', 'problem', 'sample', 'method', 'set', 'training', 'class']
Topic 2: ['information', 'university', 'brain', 'analysis', 'datum', 'memory', 'structure', 'department', 'neural', 'e']
Topic 3: ['model', 'distribution', 'datum', 'gaussian', 'kernel', 'gan', 'generative', 'process', 'university', 'variable']
Topic 4: ['network', 'neural', 'input', 'learn', 'unit', 'output', 'model', 'training', 'weight', 'state']
Topic 5: ['neuron', 'network', 'model', 'neural', 'system', 'time', 'cell', 't', 'pattern', 'signal']
Topic 6: ['x', 'n', 'function', 'problem', 'algorithm', 'k', 'method', 'matrix', 't', 'optimization']
Topic 7: ['algorithm', 'problem', 'learning', 'policy', 'reinforcement', 'function', 'state', 'agent', 'learn', 'action']
Topic 8: ['datum', 'problem', 'cluster', 'network', 'user', 'algorithm', 'graph', 'clusterin

Iter [100]===
Topic 0: ['model', 'deep', 'task', 'learning', 'network', 'learn', 'image', 'neural', 'train', 'representation']
Topic 1: ['datum', 'learning', 'algorithm', 'problem', 'learn', 'sample', 'method', 'training', 'function', 'label']
Topic 2: ['university', 'brain', 'neural', 'information', 'structure', 'department', 'method', 'analysis', 'datum', 'approach']
Topic 3: ['model', 'distribution', 'inference', 'datum', 'gaussian', 'variable', 'process', 'gan', 'sample', 'generative']
Topic 4: ['network', 'neural', 'input', 'output', 'learn', 'unit', 'weight', 'training', 'system', 'error']
Topic 5: ['neuron', 'network', 'neural', 'model', 'system', 'cell', 'time', 'dynamic', 'analog', 'activity']
Topic 6: ['x', 'problem', 'n', 'algorithm', 'function', 'k', 't', 'gradient', 'optimization', 'matrix']
Topic 7: ['learning', 'algorithm', 'problem', 'policy', 'reinforcement', 'learn', 'agent', 'function', 'action', 'state']
Topic 8: ['datum', 'graph', 'cluster', 'node', 'algorithm', 'u

Iter [150]===
Topic 0: ['network', 'model', 'task', 'deep', 'learning', 'learn', 'neural', 'image', 'datum', 'train']
Topic 1: ['datum', 'learning', 'algorithm', 'problem', 'learn', 'sample', 'method', 'set', 'label', 'example']
Topic 2: ['university', 'brain', 'neural', 'model', 'information', 'department', 'analysis', 'approach', 'datum', 'correlation']
Topic 3: ['model', 'distribution', 'inference', 'gaussian', 'process', 'datum', 'gan', 'sample', 'variable', 'generative']
Topic 4: ['network', 'neural', 'input', 'learn', 'output', 'unit', 'weight', 'training', 'function', 'algorithm']
Topic 5: ['neuron', 'network', 'neural', 'model', 'system', 'cell', 'time', 'dynamic', 'analog', 'simulation']
Topic 6: ['x', 'problem', 'n', 'algorithm', 'function', 't', 'optimization', 'k', 'matrix', 'method']
Topic 7: ['learning', 'algorithm', 'policy', 'reinforcement', 'learn', 'problem', 'state', 'agent', 'action', 'game']
Topic 8: ['datum', 'graph', 'node', 'cluster', 'algorithm', 'problem', 'cl

Iter [200]===
Topic 0: ['network', 'deep', 'task', 'model', 'learning', 'learn', 'neural', 'image', 'representation', 'method']
Topic 1: ['datum', 'learning', 'problem', 'learn', 'algorithm', 'set', 'training', 'method', 'sample', 'label']
Topic 2: ['university', 'model', 'brain', 'neural', 'analysis', 'department', 'datum', 'information', 'edu', 'structure']
Topic 3: ['model', 'distribution', 'inference', 'gaussian', 'datum', 'sample', 'gan', 'generative', 'process', 'variational']
Topic 4: ['network', 'neural', 'input', 'unit', 'weight', 'learn', 'output', 'function', 'training', 'algorithm']
Topic 5: ['neuron', 'network', 'model', 'neural', 'system', 'cell', 'analog', 'abstract', 'control', 'time']
Topic 6: ['x', 'n', 'problem', 'function', 'algorithm', 'optimization', 't', 'k', 'matrix', 'gradient']
Topic 7: ['learning', 'policy', 'algorithm', 'reinforcement', 'problem', 'learn', 'agent', 'action', 'reward', 'control']
Topic 8: ['algorithm', 'datum', 'graph', 'cluster', 'problem', 

Iter [250]===
Topic 0: ['network', 'deep', 'model', 'task', 'learn', 'learning', 'neural', 'image', 'method', 'representation']
Topic 1: ['datum', 'learning', 'problem', 'learn', 'algorithm', 'sample', 'label', 'method', 'example', 'training']
Topic 2: ['university', 'model', 'neural', 'brain', 'analysis', 'department', 'science', 'information', 'method', 'time']
Topic 3: ['model', 'distribution', 'inference', 'datum', 'process', 'gaussian', 'sample', 'generative', 'gan', 'variable']
Topic 4: ['network', 'neural', 'input', 'unit', 'weight', 'learn', 'output', 'function', 'training', 'state']
Topic 5: ['neuron', 'network', 'model', 'neural', 'system', 'cell', 'analog', 'study', 'pattern', 'activity']
Topic 6: ['x', 'problem', 'n', 'function', 'algorithm', 't', 'k', 'optimization', 'matrix', 'method']
Topic 7: ['learning', 'algorithm', 'policy', 'reinforcement', 'problem', 'learn', 'agent', 'game', 'state', 'action']
Topic 8: ['algorithm', 'datum', 'graph', 'problem', 'cluster', 'node', 

Iter [300]===
Topic 0: ['model', 'network', 'deep', 'learn', 'task', 'learning', 'neural', 'image', 'datum', 'train']
Topic 1: ['datum', 'learning', 'algorithm', 'learn', 'problem', 'sample', 'function', 'method', 'example', 'label']
Topic 2: ['model', 'university', 'brain', 'department', 'neural', 'analysis', 'datum', 'time', 'information', 'edu']
Topic 3: ['model', 'distribution', 'inference', 'datum', 'generative', 'gaussian', 'gan', 'process', 'variable', 'variational']
Topic 4: ['network', 'neural', 'input', 'learn', 'output', 'weight', 'unit', 'algorithm', 'training', 'function']
Topic 5: ['neuron', 'neural', 'network', 'system', 'model', 'cell', 'analog', 't', 'activity', 'pattern']
Topic 6: ['x', 'problem', 'n', 'function', 'algorithm', 't', 'optimization', 'matrix', 'method', 'k']
Topic 7: ['learning', 'policy', 'algorithm', 'reinforcement', 'problem', 'learn', 'state', 'agent', 'control', 'game']
Topic 8: ['algorithm', 'datum', 'graph', 'cluster', 'node', 'model', 'problem', 

Iter [350]===
Topic 0: ['network', 'deep', 'model', 'task', 'learn', 'learning', 'neural', 'image', 'train', 'representation']
Topic 1: ['datum', 'learning', 'algorithm', 'problem', 'learn', 'sample', 'function', 'training', 'label', 'set']
Topic 2: ['university', 'model', 'analysis', 'department', 'brain', 'time', 'datum', 'edu', 'neural', 'prediction']
Topic 3: ['model', 'distribution', 'inference', 'datum', 'sample', 'generative', 'gaussian', 'gan', 'process', 'variational']
Topic 4: ['network', 'neural', 'input', 'weight', 'unit', 'learn', 'output', 'training', 'function', 'error']
Topic 5: ['neuron', 'network', 'neural', 'system', 'model', 'cell', 'analog', 'pattern', 'study', 'information']
Topic 6: ['x', 'problem', 'n', 'function', 'algorithm', 't', 'method', 'k', 'matrix', 'optimization']
Topic 7: ['learning', 'algorithm', 'policy', 'reinforcement', 'problem', 'learn', 'state', 'agent', 'control', 'action']
Topic 8: ['algorithm', 'graph', 'cluster', 'problem', 'datum', 'node', 

Iter [400]===
Topic 0: ['deep', 'network', 'model', 'learning', 'task', 'learn', 'image', 'neural', 'method', 'datum']
Topic 1: ['datum', 'learning', 'algorithm', 'problem', 'learn', 'sample', 'label', 'example', 'class', 'training']
Topic 2: ['model', 'university', 'time', 'department', 'analysis', 'neural', 'brain', 'edu', 'datum', 'structure']
Topic 3: ['model', 'distribution', 'inference', 'datum', 'generative', 'gaussian', 'sample', 'gan', 'process', 'university']
Topic 4: ['network', 'neural', 'input', 'output', 'weight', 'unit', 'learn', 'training', 'function', 'state']
Topic 5: ['neuron', 'neural', 'network', 'model', 'cell', 'system', 'analog', 'information', 'abstract', 'pattern']
Topic 6: ['x', 'problem', 'n', 'function', 'algorithm', 'optimization', 't', 'method', 'matrix', 'k']
Topic 7: ['learning', 'policy', 'algorithm', 'reinforcement', 'problem', 'learn', 'agent', 'state', 'action', 'game']
Topic 8: ['algorithm', 'datum', 'graph', 'cluster', 'problem', 'node', 'model', 

Iter [450]===
Topic 0: ['task', 'network', 'deep', 'model', 'learn', 'learning', 'image', 'neural', 'training', 'train']
Topic 1: ['datum', 'learning', 'algorithm', 'learn', 'problem', 'model', 'function', 'label', 'sample', 'method']
Topic 2: ['model', 'university', 'datum', 'department', 'time', 'analysis', 'brain', 'neural', 'edu', 'system']
Topic 3: ['model', 'distribution', 'inference', 'datum', 'sample', 'generative', 'gan', 'gaussian', 'variable', 'variational']
Topic 4: ['network', 'neural', 'input', 'unit', 'weight', 'learn', 'output', 'training', 'state', 'function']
Topic 5: ['neuron', 'model', 'network', 'cell', 'system', 'neural', 'analog', 'spike', 'activity', 'information']
Topic 6: ['x', 'problem', 'n', 'function', 'algorithm', 'optimization', 'k', 'matrix', 't', 'edu']
Topic 7: ['learning', 'algorithm', 'policy', 'reinforcement', 'problem', 'learn', 'agent', 'state', 'action', 'control']
Topic 8: ['algorithm', 'datum', 'graph', 'problem', 'cluster', 'node', 'edu', 'dis




# Inference on unseen documents

In [9]:
# === inference on unseen documents ===
test_corpus = NeurIPSCorpus(
    data_path="data/papers.txt", 
    mode="test",
    num_topics=10,
    start_doc_idx=1000,
    max_num_docs=5,
    max_num_words=10000,
    max_doc_length=200,
    train_corpus=corpus,
)
# === inference via Gibbs sampling ===
for i, doc in enumerate(test_corpus.docs):
    print(f"\nTest Doc [{i}] ===")
    doc_i = 0   # only infer 1 test doc at a time
    test_n_doc_topic = np.zeros([1, corpus.num_topics], dtype=float)
    test_n_topic_word = np.zeros([corpus.num_topics, corpus.num_words], dtype=float)
    test_z_doc_word = np.zeros([1, corpus.max_doc_length], dtype=int)

    print(" ".join([corpus.id2word[x] for x in doc]))

    for j, word_j in enumerate(doc):
        topic_ij = random.randint(0, corpus.num_topics - 1)
        test_n_doc_topic[doc_i, topic_ij] += 1
        test_n_topic_word[topic_ij, word_j] += 1
        test_z_doc_word[doc_i, j] = topic_ij

    for iteration in tqdm(range(100)):
        for j, word_j in enumerate(doc):
            # remove the old assignment
            topic_ij = test_z_doc_word[doc_i, j]
            test_n_doc_topic[doc_i, topic_ij] -= 1
            test_n_topic_word[topic_ij, word_j] -= 1
            # compute the new assignment (new sampling formula!)
            p_doc_topic = (test_n_doc_topic[doc_i, :] + hparams.alpha) \
                        / np.sum(test_n_doc_topic[doc_i] + hparams.alpha)
            p_topic_word = (test_n_topic_word[:, word_j] + n_topic_word[:, word_j] + hparams.beta[word_j]) \
                        / np.sum(test_n_topic_word + n_topic_word + hparams.beta, axis=1)
            p_topic = p_doc_topic * p_topic_word
            p_topic /= np.sum(p_topic)
            # record the new assignment
            new_topic_ij = np.random.choice(np.arange(corpus.num_topics), p=p_topic)
            test_n_doc_topic[doc_i, new_topic_ij] += 1
            test_n_topic_word[new_topic_ij, word_j] += 1
            test_z_doc_word[doc_i, j] = new_topic_ij

    # === Check convergence and read out parameters ===
    test_theta = (test_n_doc_topic + hparams.alpha) / np.sum(test_n_doc_topic + hparams.alpha, axis=1, keepdims=True)
    test_phi = (test_n_topic_word + hparams.beta) / np.sum(test_n_topic_word + hparams.beta, axis=1, keepdims=True)
    print("Topic distribution:", [float(f"{x:.4f}") for x in test_theta[0]])
    print("Top 3 topics:", np.argsort(test_theta[0])[::-1][:3])

num_docs: 5 num_topics: 10 num_words: 0

Test Doc [0] ===
inference graphical models semidefinite programming a microsoft research cs toronto edu mit microsoft research mit edu andrea montanari stanford university montanari stanford edu abstract maximum posteriori probability map inference graphical model amount solve graph structure combinatorial optimization problem popular inference algorithm belief propagation bp generalize belief propagation intimately related linear programming lp relaxation adams hierarchy despite popularity algorithm understand sum square hierarchy base semidefinite programming provide superior guarantee unfortunately relaxation graph n vertex require solve n d variable d degree hierarchy practice d approach scale ten variable paper propose binary relaxation map inference hierarchy innovation focus computational efficiency firstly analogy bp variant introduce decision variable correspond region graphical model secondly solve result non convex style method devel

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


Topic distribution: [0.0403, 0.0303, 0.1448, 0.3537, 0.0005, 0.0005, 0.2692, 0.0005, 0.1597, 0.0005]
Top 3 topics: [3 6 8]

Test Doc [1] ===
pose person image generation ma xu jia sun luc van gool ku psi trace europe ku psi max planck institute informatics informatics campus eth zurich ma xu jia luc mpi inf mpg de vision ee ethz ch abstract paper propose novel pose person generation network pg allow synthesize person image arbitrary pose base image person novel pose generation framework pg utilize pose information explicitly consist key stage pose integration image refinement stage condition image target pose feed u net like network generate initial coarse image person target pose stage refine initial result train u net like generator adversarial way extensive experimental result identification image fashion photo model generate high quality person image detail introduction generate realistic look image great value application face movie making image retrieval base image consequently 

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


Topic distribution: [0.5279, 0.0055, 0.0353, 0.1647, 0.0254, 0.0005, 0.0005, 0.0005, 0.0005, 0.2393]
Top 3 topics: [0 9 3]

Test Doc [2] ===
model shrinkage effect gamma process edge partition models japan univ tokyo japan univ japan jp com k u tokyo ac jp ist ac jp abstract edge partition model fundamental bayesian nonparametric model extract overlap structure binary matrix adopt gamma process p prior automatically number active atom empirically find model shrinkage typically work appropriately lead solution analysis expectation s intensity function suggest gamma prior hyperparameter model shrinkage effect internal p order ensure model shrinkage effect work appropriate manner propose novel generative construction incorporate constrain gamma prior incorporate dirichlet prior instead gamma prior furthermore s model parameter include infinite atom p prior marginalize possible derive truly infinite efficiently infer collapse gibbs sampler experimentally confirm model shrinkage propose mo

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


Topic distribution: [0.0005, 0.1746, 0.1299, 0.4532, 0.0005, 0.0104, 0.09, 0.0005, 0.1398, 0.0005]
Top 3 topics: [3 1 8]

Test Doc [3] ===
universal style transfer feature li uc edu wang adobe research adobe com chen fang adobe research adobe com xin lu adobe research adobe com yang adobe research adobe com ming yang uc nvidia research edu abstract universal style transfer aim transfer arbitrary visual style content image exist feed forward base method enjoy inference efficiency mainly limit generalize unseen style compromise visual quality paper present simple effective method tackle limitation training pre defined style key ingredient method pair feature transform coloring embed image reconstruction network coloring transform reflect direct matching feature covariance content image give style image share similar spirit optimization gram matrix base cost neural style transfer demonstrate effectiveness algorithm generate high quality image comparison number recent method analyze metho

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


Topic distribution: [0.7667, 0.0005, 0.0154, 0.0005, 0.0005, 0.0005, 0.1249, 0.0005, 0.0005, 0.09]
Top 3 topics: [0 6 9]

Test Doc [4] ===
phase transition data problem jonathan cevher laboratory information inference systems cole polytechnique f d de epfl jonathan cevher epfl ch abstract paper study pool data problem identify label associate large collection item base sequence pool test reveal count label pool noiseless setting identify exact asymptotic threshold required number test optimal decoding prove phase transition complete success complete failure addition present novel noisy variation problem provide information theoretic framework characterize required number test general random noise model result reveal noise problem considerably difficult strict increase scaling law low noise level finally demonstrate similar behavior approximate recovery setting give number error allow label introduction consider follow setting exist large population item associated label label initiall

HBox(children=(HTML(value=''), FloatProgress(value=0.0), HTML(value='')))


Topic distribution: [0.0055, 0.503, 0.0204, 0.0005, 0.0005, 0.0005, 0.2343, 0.1199, 0.1149, 0.0005]
Top 3 topics: [1 6 7]


Compare with the learned topics: 

<img src=https://raw.githubusercontent.com/mistylight/picbed/main/Hexo/Screen%20Shot%202022-02-28%20at%205.10.12%20PM.png style="width: 1200px">