In [None]:
# based on: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python

In [1]:
import gensim.corpora as corpora
from gensim.models import ldamodel
from gensim.models.coherencemodel import CoherenceModel
import pickle
from tqdm import tqdm
from data import load_doc_tokens, total, model_path

In [None]:
# Materialise every document's tokens up front: the collection is iterated
# more than once below (dictionary build, bag-of-words conversion, coherence).
tokens = [doc_tokens for doc_tokens in load_doc_tokens()]

In [None]:
# Build the gensim Dictionary from all tokenised documents; tqdm only wraps
# the iteration to show progress.
id2word = corpora.Dictionary(documents=tqdm(tokens))

In [9]:
# Convert each tokenised document to its bag-of-words representation using
# the dictionary built above (progress shown via tqdm).
corpus = list(map(id2word.doc2bow, tqdm(tokens)))

100%|██████████| 291415/291415 [02:34<00:00, 1886.86it/s]


In [None]:
def compute_coherence_values(dictionary, corpus, texts, limit, start=2, step=3, random_state=None):
    """
    Compute c_v coherence for various number of topics

    One LDA model is trained for every topic count in
    ``range(start, limit, step)`` and scored with a c_v CoherenceModel.

    Parameters:
    ----------
    dictionary : Gensim dictionary
    corpus : Gensim corpus
    texts : List of input texts
    limit : Max num of topics (exclusive, as in ``range``)
    start : First number of topics to try
    step : Increment between successive numbers of topics
    random_state : Optional seed forwarded to every LDA model; the default
        ``None`` leaves seeding to gensim, pass a fixed value for
        reproducible scores

    Returns:
    -------
    coherence_values : Coherence values corresponding to the LDA model with respective number of topics
    """
    coherence_values = []
    for num_topics in range(start, limit, step):
        # `ldamodel` is the imported gensim *module*; the trainable class is
        # `ldamodel.LdaModel`. Use the `dictionary` argument rather than a
        # notebook-level global so the function does not rely on hidden state.
        model = ldamodel.LdaModel(
            corpus=corpus,
            num_topics=num_topics,
            id2word=dictionary,
            random_state=random_state,
        )
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_values.append(coherencemodel.get_coherence())

    return coherence_values

In [None]:
# Sweep settings, defined once so the x-axis and the computed scores cannot
# drift apart: topic counts are range(start, limit, step).
limit, start, step = 100, 10, 10
x = range(start, limit, step)
# One c_v coherence score per topic count in `x`, in the same order.
y = compute_coherence_values(dictionary=id2word, corpus=corpus, texts=tokens, start=start, limit=limit, step=step)


In [None]:
import plotly.express as px

In [None]:
# Plot coherence against the number of topics. x and y are passed explicitly:
# handing plotly a bare zip() makes it treat the pairs as wide-form columns
# plotted against the row index instead of against the topic count.
px.line(
    x=list(x),
    y=y,
    labels={"x": "Number of topics", "y": "c_v coherence"},
    title="LDA coherence by number of topics",
)