Skip to content

Commit

Permalink
Fixed bug in LDA doc-topic mapping
Browse files Browse the repository at this point in the history
Use correct attribute of document object.

Skip invalid docs
  • Loading branch information
markgw committed Feb 10, 2020
1 parent 6920956 commit 49af96b
Showing 1 changed file with 3 additions and 1 deletion.
4 changes: 3 additions & 1 deletion src/python/pimlico/modules/gensim/lda_doc_topics/execute.py
Original file line number Diff line number Diff line change
@@ -1,16 +1,18 @@
from builtins import range
from collections import Counter

from pimlico.core.modules.map import skip_invalid
from pimlico.core.modules.map.multiproc import multiprocessing_executor_factory


def worker_set_up(worker):
    """Load the model from the module's "model" input and store it on the worker.

    Fetches the input named ``"model"`` through ``worker.info.get_input``,
    calls its ``load_model()`` method and keeps the result as
    ``worker.model`` so document processing can look it up on the worker.

    :param worker: worker object exposing ``info.get_input``; it gains a
        ``model`` attribute as a side effect
    :return: None
    """
    worker.model = worker.info.get_input("model").load_model()


@skip_invalid
def process_document(worker, archive_name, doc_name, doc):
# Get a bag of words for the document
bow = list(Counter(word for sentence in doc for word in sentence).items())
bow = list(Counter(word for sentence in doc.lists for word in sentence).items())
# Use the LDA model to infer a topic vector for the document
topic_weights = dict(worker.model[bow])
# The weights are a sparse vector: fill in the relevant values and leave the rest as 0
Expand Down

0 comments on commit 49af96b

Please sign in to comment.