In [None]:
# based on: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python

In [1]:
import gensim.corpora as corpora
from gensim.models import ldamodel
from gensim.models.coherencemodel import CoherenceModel
import pickle
from tqdm import tqdm
from data import load_doc_tokens, total, model_path, save_file, load_file
from sklearn.utils import resample

In [2]:
# Keep only tokens made of at most two whitespace-separated words (ignore trigrams).
tokens = [
    [t for t in doc if len(t.split()) <= 2]
    for doc in tqdm(load_doc_tokens(), total=total)
]
# Draw a fixed-size sample of documents. The seed makes the sample (and
# everything derived from it further down) repeatable across kernel restarts.
# NOTE(review): resample defaults to replace=True, so documents are drawn with
# replacement and may repeat; pass replace=False if a plain subsample is intended.
tokens = resample(tokens, n_samples=100000, random_state=42)

100%|██████████| 291415/291415 [05:41<00:00, 852.93it/s]


In [3]:
# Build the token <-> integer-id mapping from the sampled documents,
# with a progress bar while they are scanned.
id2word = corpora.Dictionary(documents=tqdm(tokens))

100%|██████████| 100000/100000 [03:23<00:00, 491.94it/s]


In [3]:
# The two save_file calls below are left disabled; the cell reloads the
# previously stored dictionary and token lists instead of rebuilding them.
# save_file(id2word, 'id2word.pkl')
# save_file(tokens, 'tokens.pkl')
id2word, tokens = (load_file(name) for name in ('id2word.pkl', 'tokens.pkl'))

In [4]:
# Convert every tokenized document into its bag-of-words form via the dictionary.
corpus = list(map(id2word.doc2bow, tqdm(tokens)))

100%|██████████| 100000/100000 [01:18<00:00, 1271.56it/s]


In [5]:
def compute_coherence_values(dictionary, corpus, texts, num_topics, random_state=None):
    """
    Compute c_v coherence for LDA models trained with various numbers of topics.

    Parameters:
    ----------
    dictionary : Gensim dictionary, used both to train each LDA model and to score it
    corpus : Gensim corpus
    texts : List of input texts
    num_topics : Iterable of topic counts; one LDA model is trained per entry
    random_state : Optional seed forwarded to LdaModel so a run can be repeated
                   (the default None leaves the model unseeded)

    Returns:
    -------
    coherence_values : List of coherence values, one per entry of num_topics and in the same order
    """
    coherence_values = []
    for n in tqdm(num_topics):
        # Train with the dictionary that was passed in, not a notebook-level
        # global, so the model and the coherence scorer share one vocabulary.
        model = ldamodel.LdaModel(
            corpus=corpus,
            num_topics=n,
            id2word=dictionary,
            random_state=random_state,
        )
        coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence='c_v')
        coherence_value = coherencemodel.get_coherence()
        # Report each score as soon as it is available; a full sweep can take hours.
        print(n, coherence_value)
        coherence_values.append(coherence_value)

    return coherence_values

In [6]:
# Topic counts to evaluate: 10..90 in steps of 10, then 100..300 in steps of 50.
x = list(range(10, 100, 10)) + list(range(100, 301, 50))
# One coherence score per entry of x.
y = compute_coherence_values(id2word, corpus, tokens, x)


  7%|▋         | 1/14 [13:24<2:54:12, 804.07s/it]

10 0.6291192318287642


 14%|█▍        | 2/14 [32:21<3:20:02, 1000.22s/it]

20 0.6286451794613528


 21%|██▏       | 3/14 [57:34<3:46:17, 1234.32s/it]

30 0.5695820401162344


 29%|██▊       | 4/14 [1:31:36<4:18:50, 1553.01s/it]

40 0.6226362818051472


 36%|███▌      | 5/14 [2:09:15<4:31:08, 1807.61s/it]

50 0.5850611554904446


 43%|████▎     | 6/14 [2:52:20<4:36:15, 2071.91s/it]

60 0.5735516564408863


 50%|█████     | 7/14 [3:44:51<4:42:52, 2424.70s/it]

70 0.5757096715313936


 57%|█████▋    | 8/14 [4:50:21<4:50:24, 2904.05s/it]

80 0.582384029028718


 64%|██████▍   | 9/14 [6:03:47<4:41:07, 3373.49s/it]

90 0.5800484277940015


 71%|███████▏  | 10/14 [7:11:02<3:58:31, 3577.77s/it]

100 0.5781484312117704


 79%|███████▊  | 11/14 [8:46:28<3:31:45, 4235.16s/it]

150 0.5396395074758393


 86%|████████▌ | 12/14 [10:53:08<2:55:17, 5258.88s/it]

200 0.5194403529806774


 93%|█████████▎| 13/14 [13:29:51<1:48:34, 6514.25s/it]

250 0.5043904605431779


100%|██████████| 14/14 [16:50:18<00:00, 4329.91s/it]

300 0.49672939462426796





In [None]:
import plotly.express as px

In [8]:
# Echo the evaluated topic counts followed by their coherence scores.
print(x, y)

[10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 150, 200, 250, 300] [0.6291192318287642, 0.6286451794613528, 0.5695820401162344, 0.6226362818051472, 0.5850611554904446, 0.5735516564408863, 0.5757096715313936, 0.582384029028718, 0.5800484277940015, 0.5781484312117704, 0.5396395074758393, 0.5194403529806774, 0.5043904605431779, 0.49672939462426796]


In [7]:
# Persist the coherence scores; the context manager guarantees the file is
# flushed and closed even if pickling raises.
with open('lday.pkl', 'wb') as f:
    pickle.dump(y, f)

In [11]:
# Write a plain-text report with one "<topic count>: <coherence>" line per model.
with open('lda.txt', 'w') as f:
    f.writelines(f"{x_}: {y_}\n" for x_, y_ in zip(x, y))

In [None]:
# Plot coherence against the number of topics. The two series are passed
# explicitly as x and y; handing px.line a bare zip makes it treat the pairs as
# an unnamed table and plot both columns against the row index.
px.line(
    x=x,
    y=y,
    labels={'x': 'Number of topics', 'y': 'c_v coherence'},
    title='LDA coherence by number of topics',
)