Skip to content

Commit

Permalink
Slightly re-arranged code in Gensim LDA trainer
Browse files Browse the repository at this point in the history
  • Loading branch information
markgw committed Mar 26, 2020
1 parent 9b40874 commit 036a286
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 9 deletions.
11 changes: 2 additions & 9 deletions src/python/pimlico/modules/gensim/lda/execute.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
from gensim.models import LdaModel, TfidfModel

from pimlico.core.modules.base import BaseModuleExecutor
from pimlico.modules.gensim.utils import GensimCorpus
from pimlico.modules.gensim.utils import GensimCorpus, init_gensim_train_logging
from pimlico.utils.progress import get_progress_bar


Expand All @@ -26,14 +26,7 @@ def execute(self):
).encode("utf-8"))
ignore_ids = [vocab.token2id[term] for term in ignore_terms]

# Set up logging, so that we see Gensim's progress as it trains
lda_logger = getLogger('gensim.models.ldamodel')
hnd = logging.StreamHandler()
hnd.setLevel(logging.INFO)
fmt = logging.Formatter('%(asctime)s - Gensim - %(levelname)s - %(message)s')
hnd.setFormatter(fmt)
lda_logger.addHandler(hnd)
lda_logger.setLevel(logging.INFO)
init_gensim_train_logging()

# Wrap the corpus to present it as bags of words to Gensim
gensim_corpus = GensimCorpus(corpus, ignore_ids=ignore_ids)
Expand Down
14 changes: 14 additions & 0 deletions src/python/pimlico/modules/gensim/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,11 @@
"""
from __future__ import division

import logging
from builtins import object
from collections import Counter
from logging import getLogger

from pimlico.datatypes.corpora import is_invalid_doc

Expand Down Expand Up @@ -60,3 +63,14 @@ def word_relevance_for_topic(topic_word_probs, word_probs, l=0.6):
"""
import numpy as np
return l * np.log(topic_word_probs) + (1-l) * np.log(topic_word_probs / word_probs)


def init_gensim_train_logging():
    """
    Set up logging, so that we see Gensim's progress as it trains.

    Attaches a stream handler to the ``gensim.models.ldamodel`` logger that
    outputs messages at INFO level and above, and sets the logger's own level
    to INFO.

    The function may safely be called more than once in the same process: the
    handler is only attached the first time, so log lines are not duplicated
    by repeated calls.

    """
    lda_logger = getLogger('gensim.models.ldamodel')
    lda_logger.setLevel(logging.INFO)

    # Handlers accumulate on a logger, so adding a fresh one on every call
    # would cause each message to be output once per call. We tag the handler
    # we create and skip adding another if it is already there.
    for existing in lda_logger.handlers:
        if getattr(existing, "_pimlico_gensim_train_handler", False):
            return

    hnd = logging.StreamHandler()
    hnd.setLevel(logging.INFO)
    fmt = logging.Formatter('%(asctime)s - Gensim - %(levelname)s - %(message)s')
    hnd.setFormatter(fmt)
    # Marker used by the check above on subsequent calls
    hnd._pimlico_gensim_train_handler = True
    lda_logger.addHandler(hnd)

0 comments on commit 036a286

Please sign in to comment.