In [1]:
import os
import logging
import gensim
from gensim.models import LdaMulticore
from gensim import corpora, utils
from gensim.corpora import Dictionary
from gensim.test.utils import datapath
from multiprocessing import Process, freeze_support

In [2]:
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
logging.root.setLevel(level=logging.DEBUG)

In [3]:
# loading dictionary
dct = corpora.Dictionary.load('dictionary.dict')

# loading corpus
corpus = corpora.MmCorpus('mycorpus.mm')

2020-11-19 16:23:17,025 : INFO : loading Dictionary object from dictionary.dict
2020-11-19 16:23:17,027 : DEBUG : {'uri': 'dictionary.dict', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2020-11-19 16:23:17,037 : INFO : loaded dictionary.dict
2020-11-19 16:23:17,038 : DEBUG : {'uri': 'mycorpus.mm.index', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2020-11-19 16:23:17,043 : INFO : loaded corpus index from mycorpus.mm.index
2020-11-19 16:23:17,043 : INFO : initializing cython corpus reader from mycorpus.mm
2020-11-19 16:23:17,044 : DEBUG : {'uri': 'mycorpus.mm', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2020-11-19 16:23:17,044 : INFO : acce

In [4]:
workers = 5
passes = 15
chunksize = 50000
num_topics = 500
iterations = 200
eval_every = None

# train and save model
if __name__ == '__main__':
    freeze_support()
    lda_model = LdaMulticore(corpus=corpus, 
                             id2word=dct, 
                             num_topics=num_topics, 
                             workers=workers,
                             chunksize=chunksize, 
                             passes=passes, 
                             iterations=iterations, 
                             eval_every=eval_every)
    
    temp_file = datapath('lda_model')
    lda_model.save(temp_file)

2020-11-19 17:18:26,664 : INFO : using symmetric alpha at 0.002
2020-11-19 17:18:26,665 : INFO : using symmetric eta at 0.002
2020-11-19 17:18:26,669 : INFO : using serial LDA version on this node
2020-11-19 17:18:27,823 : INFO : running online LDA training, 500 topics, 15 passes over the supplied corpus of 49582 documents, updating every 250000 documents, evaluating every ~0 documents, iterating 200x with a convergence threshold of 0.001000
2020-11-19 17:18:27,825 : INFO : training LDA model using 5 processes
2020-11-19 17:18:27,838 : DEBUG : {'uri': 'mycorpus.mm', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2020-11-19 17:18:31,926 : INFO : PROGRESS: pass 0, dispatched chunk #0 = documents up to #49582/49582, outstanding queue size 1
2020-11-19 17:24:18,901 : DEBUG : updating topics
2020-11-19 17:24:20,634 : INFO : topic #295 (0.002): 0.019*"film" + 0.008*"one" + 0.008

2020-11-19 17:34:41,860 : DEBUG : updating topics
2020-11-19 17:34:43,640 : INFO : topic #448 (0.002): 0.016*"film" + 0.011*"charact" + 0.007*"one" + 0.005*"stori" + 0.005*"like" + 0.004*"great" + 0.004*"act" + 0.004*"realli" + 0.004*"make" + 0.004*"show"
2020-11-19 17:34:43,641 : INFO : topic #92 (0.002): 0.023*"film" + 0.010*"one" + 0.006*"get" + 0.005*"play" + 0.005*"make" + 0.005*"charact" + 0.005*"see" + 0.005*"way" + 0.005*"like" + 0.005*"watch"
2020-11-19 17:34:43,643 : INFO : topic #68 (0.002): 0.017*"film" + 0.009*"like" + 0.008*"one" + 0.008*"option" + 0.006*"stori" + 0.006*"life" + 0.006*"watch" + 0.006*"charact" + 0.005*"love" + 0.005*"play"
2020-11-19 17:34:43,643 : INFO : topic #417 (0.002): 0.022*"film" + 0.015*"one" + 0.013*"mike" + 0.012*"elvi" + 0.006*"time" + 0.006*"good" + 0.006*"get" + 0.005*"made" + 0.005*"stori" + 0.005*"much"
2020-11-19 17:34:43,644 : INFO : topic #148 (0.002): 0.011*"like" + 0.010*"show" + 0.009*"get" + 0.009*"one" + 0.008*"make" + 0.008*"watch

2020-11-19 17:44:30,101 : INFO : topic #303 (0.002): 0.022*"doc" + 0.015*"diamond" + 0.012*"one" + 0.011*"skate" + 0.009*"film" + 0.009*"realli" + 0.008*"good" + 0.007*"watch" + 0.007*"see" + 0.007*"get"
2020-11-19 17:44:30,138 : INFO : topic diff=2.575583, rho=0.288776
2020-11-19 17:44:30,149 : DEBUG : {'uri': 'mycorpus.mm', 'mode': 'rb', 'buffering': -1, 'encoding': None, 'errors': None, 'newline': None, 'closefd': True, 'opener': None, 'ignore_ext': False, 'transport_params': None}
2020-11-19 17:44:34,346 : INFO : PROGRESS: pass 11, dispatched chunk #0 = documents up to #49582/49582, outstanding queue size 1
2020-11-19 17:46:21,734 : DEBUG : updating topics
2020-11-19 17:46:23,540 : INFO : topic #233 (0.002): 0.038*"space" + 0.016*"critter" + 0.016*"film" + 0.008*"realli" + 0.008*"one" + 0.008*"ship" + 0.008*"like" + 0.007*"make" + 0.006*"get" + 0.006*"time"
2020-11-19 17:46:23,541 : INFO : topic #344 (0.002): 0.023*"film" + 0.011*"time" + 0.009*"kathi" + 0.009*"bate" + 0.008*"one" 