In [None]:
# based on: https://www.machinelearningplus.com/nlp/topic-modeling-gensim-python

In [1]:
import gensim.corpora as corpora
from gensim.models import ldamodel, ldamulticore
from gensim.models.coherencemodel import CoherenceModel
import pickle
from tqdm import tqdm
from data import load_doc_tokens, total, model_path, save_file, load_file
from sklearn.utils import resample

In [2]:
# Keep, per document, only tokens made of at most two whitespace-separated
# words (i.e. drop trigrams and longer n-grams).
tokens = [
    [t for t in doc_tokens if len(t.split()) <= 2]
    for doc_tokens in tqdm(load_doc_tokens(), total=total)
]
# Subsample 100k documents with a fixed seed so every run draws the same sample.
# NOTE(review): sklearn's resample draws WITH replacement by default, so the
# sample can contain repeated documents — confirm that is intended.
tokens = resample(tokens, n_samples=100000, random_state=42)

100%|██████████| 291415/291415 [05:41<00:00, 852.93it/s]


In [3]:
# Build the token <-> integer id mapping from the sampled documents,
# with a progress bar over the documents.
id2word = corpora.Dictionary(documents=tqdm(tokens))

100%|██████████| 100000/100000 [03:23<00:00, 491.94it/s]


In [2]:
# Reload the previously saved dictionary and token lists.
# To refresh the saved copies, run:
#   save_file(id2word, 'id2word.pkl')
#   save_file(tokens, 'tokens.pkl')
id2word, tokens = load_file('id2word.pkl'), load_file('tokens.pkl')

In [3]:
# Convert every tokenized document into its bag-of-words representation.
corpus = list(map(id2word.doc2bow, tqdm(tokens)))

100%|██████████| 100000/100000 [01:07<00:00, 1475.84it/s]


In [4]:
def compute_coherence_values(dictionary, corpus, texts, num_topics):
    """
    Train one LDA model per topic count and score it with several coherence measures.

    Parameters:
    ----------
    dictionary : Gensim dictionary, used both to train each model and to score it
    corpus : Gensim corpus
    texts : List of input texts
    num_topics : Iterable of topic counts; one model is trained per entry

    Returns:
    -------
    c_vals : dict mapping each coherence measure name ('u_mass', 'c_uci',
        'c_npmi', 'c_v') to a list of coherence values, one per entry of
        num_topics, in iteration order
    """
    measures = ('u_mass', 'c_uci', 'c_npmi', 'c_v')
    c_vals = {measure: [] for measure in measures}
    for n in tqdm(num_topics):
        # Train with the dictionary that was passed in rather than a
        # notebook-level global, so the function is self-contained.
        model = ldamulticore.LdaMulticore(corpus=corpus, num_topics=n, id2word=dictionary, workers=3)
        for c in measures:
            coherencemodel = CoherenceModel(model=model, texts=texts, dictionary=dictionary, coherence=c)
            coherence_value = coherencemodel.get_coherence()
            # Print each score as soon as it is computed so partial results
            # remain visible if the run is interrupted.
            print(n, c, coherence_value)
            c_vals[c].append(coherence_value)

    return c_vals

In [5]:
# Topic counts to evaluate in this sweep: 250 and 300.
x = range(250, 301, 50)
y_vals = compute_coherence_values(id2word, corpus, tokens, x)

  0%|          | 0/2 [2:31:59<?, ?it/s]Process ForkPoolWorker-4:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multiprocessing/process.py", line 108, in run
    self._target(*self._args, **self._kwargs)
  File "/usr/lib/python3.8/multiprocessing/pool.py", line 109, in worker
    initializer(*initargs)
  File "/home/marcin/.local/lib/python3.8/site-packages/gensim/models/ldamulticore.py", line 334, in worker_e_step
    chunk_no, chunk, worker_lda = input_queue.get()
  File "/usr/lib/python3.8/multiprocessing/queues.py", line 96, in get
    with self._rlock:
  File "/usr/lib/python3.8/multiprocessing/synchronize.py", line 95, in __enter__
    return self._semlock.__enter__()
KeyboardInterrupt

Process ForkPoolWorker-1:
Traceback (most recent call last):
  File "/usr/lib/python3.8/multiprocessing/process.py", line 315, in _bootstrap
    self.run()
  File "/usr/lib/python3.8/multi

KeyboardInterrupt: 

In [5]:
# Topic counts to evaluate in this sweep: 90, then 100 to 300 in steps of 50.
x = [90, *range(100, 301, 50)]
y_vals = compute_coherence_values(id2word, corpus, tokens, x)


  0%|          | 0/6 [00:00<?, ?it/s]90 u_mass -1.13247680717946
90 c_uci -0.04050871586814566
90 c_npmi 0.04971644530890303
 17%|█▋        | 1/6 [48:03<4:00:17, 2883.50s/it]90 c_v 0.6163967282668963
100 u_mass -1.3661977451735094
100 c_uci -0.34791408345849795
100 c_npmi 0.036760574789024794
 33%|███▎      | 2/6 [1:29:30<2:56:40, 2650.22s/it]100 c_v 0.5942532001817823


In [None]:
# Save the (num_topics, score) pairs for each coherence measure to its own file.
# The pairs are materialised as a list so the pickle holds plain data instead
# of a one-shot zip iterator, and each file is closed as soon as it is written.
for metric, filename in (
    ('u_mass', 'lday-umass.pkl'),
    ('c_uci', 'lday-uci.pkl'),
    ('c_npmi', 'lday-npmi.pkl'),
):
    pairs = list(zip(x, y_vals[metric]))
    with open(filename, 'wb') as out_file:
        pickle.dump(pairs, out_file)

In [11]:
# Collect (num_topics, score) pairs per coherence measure from a saved log.
scores = {
    'u_mass': [],
    'c_uci': [],
    'c_npmi': []
}

# Each non-blank line must hold three whitespace-separated fields:
# <num_topics> <measure> <score>.
# NOTE(review): presumably the captured print output of an earlier coherence
# run — confirm how 'lda_10-80.txt' was produced.
with open('lda_10-80.txt', 'r') as f:
    for line in f:
        fields = line.split()
        if not fields:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            continue
        n, c, s = fields
        # Store numbers rather than strings so the pairs can be sorted and plotted.
        scores[c].append((int(n), float(s)))

scores