From 0ffd13c43145256378bba9dbca4bff2c4dc8a212 Mon Sep 17 00:00:00 2001 From: gyanesh-m Date: Sat, 17 Feb 2018 00:38:21 +0530 Subject: [PATCH 01/16] Fix docstrings for lda_worker --- gensim/models/lda_worker.py | 69 +++++++++++++++++++++++++++++++++---- 1 file changed, 63 insertions(+), 6 deletions(-) diff --git a/gensim/models/lda_worker.py b/gensim/models/lda_worker.py index 16110486d6..49c49c28df 100755 --- a/gensim/models/lda_worker.py +++ b/gensim/models/lda_worker.py @@ -4,13 +4,13 @@ # Copyright (C) 2011 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -Worker ("slave") process used in computing distributed LDA. Run this script \ -on every node in your cluster. If you wish, you may even run it multiple times \ -on a single machine, to make better use of multiple cores (just beware that \ -memory footprint increases accordingly). +"""Worker ("slave") process used in computing distributed LDA. +Run this script on every node in your cluster. If you wish, you may even +run it multiple times on a single machine, to make better use of multiple +cores (just beware that memory footprint increases accordingly). Example: python -m gensim.models.lda_worker + """ @@ -40,11 +40,35 @@ class Worker(object): + """Used as a Pyro class with exposed methods. + + Exposes every non-private method and property of the class automatically + to be available for remote access. + + Attributes + ---------- + model : :obj: of :class:`~gensim.models.ldamodel.LdaModel` + + """ + def __init__(self): + """Partly initializes the model.""" self.model = None @Pyro4.expose def initialize(self, myid, dispatcher, **model_params): + """Fully initializes the worker. + + Parameters + ---------- + myid : int + An ID number used to identify this worker in the dispatcher object. + dispatcher : :class:`~gensim.models.lda_dispatcher.Dispatcher` + The dispatcher responsible for scheduling this worker. 
+ **model_params + Keyword parameters to initialize the inner LDA model, see :class:`~gensim.models.ldamodel.LdaModel`. + + """ self.lock_update = threading.Lock() self.jobsdone = 0 # how many jobs has this worker completed? # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? @@ -59,6 +83,12 @@ def initialize(self, myid, dispatcher, **model_params): def requestjob(self): """ Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called. + + Raises + ------ + RuntimeError + Worker has to be initialised before receiving jobs. + """ if self.model is None: raise RuntimeError("worker must be initialized before receiving jobs") @@ -79,6 +109,14 @@ def requestjob(self): @utils.synchronous('lock_update') def processjob(self, job): + """Incrementally processes the job and potentially logs progress. + + Parameters + ---------- + job : {iterable of list of (int, float), scipy.sparse.csc} + Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`). + + """ logger.debug("starting to process job #%i", self.jobsdone) self.model.do_estep(job) self.jobsdone += 1 @@ -89,11 +127,20 @@ def processjob(self, job): @Pyro4.expose def ping(self): + """Test the connectivity with Worker.""" return True @Pyro4.expose @utils.synchronous('lock_update') def getstate(self): + """Log and get the LDA model's current state. + + Returns + ------- + result : :obj: of `~gensim.models.ldamodel.LdaState` + The current state. + + """ logger.info("worker #%i returning its state after %s jobs", self.myid, self.jobsdone) result = self.model.state assert isinstance(result, ldamodel.LdaState) @@ -104,6 +151,14 @@ def getstate(self): @Pyro4.expose @utils.synchronous('lock_update') def reset(self, state): + """Reset the worker by setting sufficient stats to 0. 
+ + Parameters + ---------- + state : :obj: of :class:`~gensim.models.ldamodel.LdaState` + Encapsulates information for distributed computation of LdaModel objects. + + """ assert state is not None logger.info("resetting worker #%i", self.myid) self.model.state = state @@ -113,11 +168,13 @@ def reset(self, state): @Pyro4.oneway def exit(self): + """Terminate the worker.""" logger.info("terminating worker #%i", self.myid) os._exit(0) def main(): + """Set up argument parser,logger and launches pyro daemon.""" parser = argparse.ArgumentParser(description=__doc__) parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None) parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int) @@ -146,4 +203,4 @@ def main(): if __name__ == '__main__': - main() + main() \ No newline at end of file From daab82d254fba5b70ae9ae48cf9bdcb234f6802e Mon Sep 17 00:00:00 2001 From: gyanesh-m Date: Tue, 20 Feb 2018 07:40:16 +0530 Subject: [PATCH 02/16] Fix pep8 errors --- gensim/models/lda_worker.py | 133 ++++++++++++++++++++++++++---------- 1 file changed, 98 insertions(+), 35 deletions(-) mode change 100755 => 100644 gensim/models/lda_worker.py diff --git a/gensim/models/lda_worker.py b/gensim/models/lda_worker.py old mode 100755 new mode 100644 index 49c49c28df..8ba3084ea4 --- a/gensim/models/lda_worker.py +++ b/gensim/models/lda_worker.py @@ -4,16 +4,66 @@ # Copyright (C) 2011 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -"""Worker ("slave") process used in computing distributed LDA. +""":class:`~gensim.models.lda_worker.Worker` ("slave") process used in +computing distributed :class:`~gensim.models.ldamodel.LdaModel`. -Run this script on every node in your cluster. If you wish, you may even -run it multiple times on a single machine, to make better use of multiple +Run this script on every node in your cluster. 
If you wish, you may even +run it multiple times on a single machine,to make better use of multiple cores (just beware that memory footprint increases accordingly). -Example: python -m gensim.models.lda_worker -""" +Warnings +-------- +Requires installed `Pyro4 `_. +Distributed version works only in local network. + + +How to use distributed :class:`~gensim.models.ldamodel.LdaModel` +---------------------------------------------------------------- + + +#. Install needed dependencies (Pyro4) :: + + pip install gensim[distributed] + +#. Setup serialization (on each machine) :: + + export PYRO_SERIALIZERS_ACCEPTED=pickle + export PYRO_SERIALIZER=pickle + +#. Run nameserver :: + + python -m Pyro4.naming -n 0.0.0.0 & + +#. Run workers (on each machine) :: + + python -m gensim.models.lda_worker & + +#. Run dispatcher :: + + python -m gensim.models.lda_dispatcher & + +#. Run :class:`~gensim.models.lsimodel.LsiModel` in distributed mode :: + >>> from gensim.test.utils import common_corpus,common_dictionary + >>> from gensim.models import LsiModel + >>> + >>> model = LdaModel(common_corpus, id2word=common_dictionary, + distributed=True) +#. You can then infer topic distributions on new, unseen documents, with + + >>> doc_lda = model[doc_bow] + The model can be updated (trained) with new documents via + >>> lda.update(other_corpus) + + +Command line arguments +---------------------- + +.. 
program-output:: python -m gensim.models.lda_worker --help + :ellipsis: 0, -3 + +""" from __future__ import with_statement import os import sys @@ -33,7 +83,8 @@ logger = logging.getLogger('gensim.models.lda_worker') -# periodically save intermediate models after every SAVE_DEBUG updates (0 for never) +# periodically save intermediate models after every SAVE_DEBUG updates +# (0 for never) SAVE_DEBUG = 0 LDA_WORKER_PREFIX = 'gensim.lda_worker' @@ -47,10 +98,10 @@ class Worker(object): Attributes ---------- - model : :obj: of :class:`~gensim.models.ldamodel.LdaModel` + model : :class:`~gensim.models.ldamodel.LdaModel` """ - + def __init__(self): """Partly initializes the model.""" self.model = None @@ -58,7 +109,7 @@ def __init__(self): @Pyro4.expose def initialize(self, myid, dispatcher, **model_params): """Fully initializes the worker. - + Parameters ---------- myid : int @@ -66,12 +117,14 @@ def initialize(self, myid, dispatcher, **model_params): dispatcher : :class:`~gensim.models.lda_dispatcher.Dispatcher` The dispatcher responsible for scheduling this worker. **model_params - Keyword parameters to initialize the inner LDA model, see :class:`~gensim.models.ldamodel.LdaModel`. + Keyword parameters to initialize the inner LDA model, + see :class:`~gensim.models.ldamodel.LdaModel`. """ self.lock_update = threading.Lock() self.jobsdone = 0 # how many jobs has this worker completed? - # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? + # id of this worker in the dispatcher; + # just a convenience var for easy access/logging TODO remove? self.myid = myid self.dispatcher = dispatcher self.finished = False @@ -81,9 +134,9 @@ def initialize(self, myid, dispatcher, **model_params): @Pyro4.expose @Pyro4.oneway def requestjob(self): - """ - Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called. - + """Request jobs from the dispatcher, in a perpetual loop + until `getstate()` is called. 
+ Raises ------ RuntimeError @@ -91,7 +144,8 @@ def requestjob(self): """ if self.model is None: - raise RuntimeError("worker must be initialized before receiving jobs") + raise RuntimeError("worker must be initialized before \ + receiving jobs") job = None while job is None and not self.finished: @@ -101,7 +155,8 @@ def requestjob(self): # no new job: try again, unless we're finished with all work continue if job is not None: - logger.info("worker #%s received job #%i", self.myid, self.jobsdone) + logger.info("worker #%s received job #%i", + self.myid, self.jobsdone) self.processjob(job) self.dispatcher.jobdone(self.myid) else: @@ -110,11 +165,12 @@ def requestjob(self): @utils.synchronous('lock_update') def processjob(self, job): """Incrementally processes the job and potentially logs progress. - + Parameters ---------- job : {iterable of list of (int, float), scipy.sparse.csc} - Stream of document vectors or sparse matrix of shape (`num_terms`, `num_documents`). + Stream of document vectors or sparse matrix of + shape (`num_terms`, `num_documents`). """ logger.debug("starting to process job #%i", self.jobsdone) @@ -134,14 +190,15 @@ def ping(self): @utils.synchronous('lock_update') def getstate(self): """Log and get the LDA model's current state. - + Returns ------- - result : :obj: of `~gensim.models.ldamodel.LdaState` + result : `~gensim.models.ldamodel.LdaState` The current state. """ - logger.info("worker #%i returning its state after %s jobs", self.myid, self.jobsdone) + logger.info("worker #%i returning its state after %s jobs", + self.myid, self.jobsdone) result = self.model.state assert isinstance(result, ldamodel.LdaState) self.model.clear() # free up mem in-between two EM cycles @@ -152,11 +209,12 @@ def getstate(self): @utils.synchronous('lock_update') def reset(self, state): """Reset the worker by setting sufficient stats to 0. 
- + Parameters ---------- - state : :obj: of :class:`~gensim.models.ldamodel.LdaState` - Encapsulates information for distributed computation of LdaModel objects. + state : :class:`~gensim.models.ldamodel.LdaState` + Encapsulates information for distributed computation + of LdaModel objects. """ assert state is not None @@ -176,20 +234,24 @@ def exit(self): def main(): """Set up argument parser,logger and launches pyro daemon.""" parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None) - parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int) + parser.add_argument("--host", help="Nameserver hostname \ + (default:%(default)s)", default=None) + parser.add_argument("--port", help="Nameserver port \ + (default: %(default)s)", default=None, type=int) parser.add_argument( - "--no-broadcast", help="Disable broadcast (default: %(default)s)", action='store_const', - default=True, const=False - ) - parser.add_argument("--hmac", help="Nameserver hmac key (default: %(default)s)", default=None) + "--no-broadcast", help="Disable broadcast \ + (default: %(default)s)", action='store_const', + default=True, const=False) + parser.add_argument("--hmac", help="Nameserver hmac key \ + (default: %(default)s)", default=None) parser.add_argument( - '-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", - const=logging.INFO, default=logging.WARNING + '-v', '--verbose', help='Verbose flag', action='store_const', + dest="loglevel", const=logging.INFO, default=logging.WARNING ) args = parser.parse_args() - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=args.loglevel) + logging.basicConfig(format='%(asctime)s : %(levelname)s\ + : %(message)s', level=args.loglevel) logger.info("running %s", " ".join(sys.argv)) ns_conf = { @@ -198,9 +260,10 @@ def main(): "port": args.port, "hmac_key": args.hmac } - 
utils.pyro_daemon(LDA_WORKER_PREFIX, Worker(), random_suffix=True, ns_conf=ns_conf) + utils.pyro_daemon(LDA_WORKER_PREFIX, Worker(), + random_suffix=True, ns_conf=ns_conf) logger.info("finished running %s", " ".join(sys.argv)) if __name__ == '__main__': - main() \ No newline at end of file + main() From 592d32a82a2b6d108445963897fb42cbad0cff51 Mon Sep 17 00:00:00 2001 From: gyanesh-m Date: Tue, 20 Feb 2018 07:41:39 +0530 Subject: [PATCH 03/16] Fix docstrings for lda_dispatcher --- gensim/models/lda_dispatcher.py | 282 +++++++++++++++++++++++++------- 1 file changed, 223 insertions(+), 59 deletions(-) mode change 100755 => 100644 gensim/models/lda_dispatcher.py diff --git a/gensim/models/lda_dispatcher.py b/gensim/models/lda_dispatcher.py old mode 100755 new mode 100644 index db7a33468b..90242f31d4 --- a/gensim/models/lda_dispatcher.py +++ b/gensim/models/lda_dispatcher.py @@ -4,13 +4,68 @@ # Copyright (C) 2010 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""" -USAGE: %(program)s SIZE_OF_JOBS_QUEUE +""":class:`~gensim.models.lda_dispatcher.Dispatcher` process which orchestrates +distributed :class:`~gensim.models.ldamodel.LdaModel` computations. +Run this script only once, on the master node in your cluster. + +Notes +----- +The dispatches expects to find worker scripts already running. Make sure +you run as many workers as you like on your machines **before** launching +the dispatcher. + +Warnings +-------- +Requires installed `Pyro4 `_. +Distributed version works only in local network. + + +How to use distributed :class:`~gensim.models.ldamodel.LdaModel` +---------------------------------------------------------------- + + +#. Install needed dependencies (Pyro4) :: + + pip install gensim[distributed] + +#. Setup serialization (on each machine) :: + + export PYRO_SERIALIZERS_ACCEPTED=pickle + export PYRO_SERIALIZER=pickle + +#. Run nameserver :: + + python -m Pyro4.naming -n 0.0.0.0 & + +#. 
Run workers (on each machine) :: + + python -m gensim.models.lda_worker & - Dispatcher process which orchestrates distributed LDA computations. Run this \ -script only once, on any node in your cluster. +#. Run dispatcher :: + + python -m gensim.models.lda_dispatcher & + +#. Run :class:`~gensim.models.ldamodel.LdaModel` in distributed mode :: + + >>> from gensim.test.utils import common_corpus, common_dictionary + >>> from gensim.models import LdaModel + >>> + >>> model = LdaModel(common_corpus, id2word=common_dictionary, + distributed=True) + +#. You can then infer topic distributions on new, unseen documents, with + + >>> doc_lda = model[doc_bow] + The model can be updated (trained) with new documents via + >>> lda.update(other_corpus) + + +Command line arguments +---------------------- + +.. program-output:: python -m gensim.models.lda_dispatcher --help + :ellipsis: 0, -5 -Example: python -m gensim.models.lda_dispatcher """ @@ -36,9 +91,10 @@ # How many jobs (=chunks of N documents) to keep "pre-fetched" in a queue? -# A small number is usually enough, unless iteration over the corpus is very very -# slow (slower than the actual computation of LDA), in which case you can override -# this value from command line. ie. run "python ./lda_dispatcher.py 100" +# A small number is usually enough, unless iteration over the corpus is +# very very slow (slower than the actual computation of LDA), in which case +# you can override this value from command line. ie. +# run "python ./lda_dispatcher.py 100" MAX_JOBS_QUEUE = 10 # timeout for the Queue object put/get blocking methods. @@ -50,78 +106,160 @@ class Dispatcher(object): - """ - Dispatcher object that communicates and coordinates individual workers. - + """Dispatcher object that communicates and coordinates individual workers. + + Attributes + ---------- + callback : :class: `~Pyro4.core.Proxy` + A proxy for some remote object.Intercepts method calls and + dispatches them to the remote object. 
+ jobs : :class: `~Queue.Queue` + Constructs a FIFO queue. + lock_update : :class: `~threading.Lock` + This class implements primitive lock objects. Once a thread has + acquired a lock, subsequent attempts to acquire it block, until it is + released; any thread may release it. + + Warnings + -------- There should never be more than one dispatcher running at any one time. + """ def __init__(self, maxsize=MAX_JOBS_QUEUE, ns_conf=None): - """ - Note that the constructor does not fully initialize the dispatcher; - use the `initialize()` function to populate it with workers etc. + """Partly initializes the dispatcher. + + A full initialization (including initialization of the workers) + requires a call to + :meth:`~gensim.models.lda_dispatcher.Dispatcher.initialize` + + Parameters + ---------- + maxsize : int, optional + Maximum number of jobs to be kept pre-fetched in the queue. + ns_conf : dict of {str:(str,optional),str:(int,optional), + str:(bool:optional),str:(str,optional)},optional + Sets up the name server configuration for the pyro daemon server + of dispatcher.This also helps to keep track of your objects in + your netword by using logical object names instead of exact + object name(or id) and its location. + workers : dict of { int : :class: `~Pyro4.core.Proxy` } + Locates all available workers and store their proxies, for + subsequent RMI calls. + } + """ self.maxsize = maxsize - self.callback = None # a pyro proxy to this object (unknown at init time, but will be set later) + self.callback = None self.ns_conf = ns_conf if ns_conf is not None else {} @Pyro4.expose def initialize(self, **model_params): - """ - `model_params` are parameters used to initialize individual workers (gets - handed all the way down to `worker.initialize()`). + """Fully initializes the dispatcher and all its workers. + + Parameters + ---------- + **model_params + Keyword parameters used to initialize individual workers, + see:class:`~gensim.models.ldamodel.LdaModel`. 
+ + Raises + ------ + RuntimeError + Description + """ self.jobs = Queue(maxsize=self.maxsize) self.lock_update = threading.Lock() self._jobsdone = 0 self._jobsreceived = 0 - # locate all available workers and store their proxies, for subsequent RMI calls self.workers = {} with utils.getNS(**self.ns_conf) as ns: - self.callback = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX]) + self.callback = Pyro4.Proxy(ns.list( + prefix=LDA_DISPATCHER_PREFIX) + [LDA_DISPATCHER_PREFIX]) for name, uri in iteritems(ns.list(prefix=LDA_WORKER_PREFIX)): try: worker = Pyro4.Proxy(uri) workerid = len(self.workers) # make time consuming methods work asynchronously - logger.info("registering worker #%i at %s", workerid, uri) - worker.initialize(workerid, dispatcher=self.callback, **model_params) + logger.info("registering worker #%i at %s", workerid, + uri) + worker.initialize(workerid, dispatcher=self.callback, + **model_params) self.workers[workerid] = worker except Pyro4.errors.PyroError: - logger.warning("unresponsive worker at %s, deleting it from the name server", uri) + logger.warning("unresponsive worker at %s,deleting it" + " from the name server", uri) ns.remove(name) if not self.workers: - raise RuntimeError('no workers found; run some lda_worker scripts on your machines first!') + raise RuntimeError('no workers found; run some lda_worker ' + 'scripts on your machines first!') @Pyro4.expose def getworkers(self): - """ - Return pyro URIs of all registered workers. + """Return pyro URIs of all registered workers. + + Returns + ------- + list of URIs + The pyro URIs for each worker. + """ return [worker._pyroUri for worker in itervalues(self.workers)] @Pyro4.expose def getjob(self, worker_id): + """Atomically pops a job from the queue. + + Parameters + ---------- + worker_id : int + The worker that requested the job. + + Returns + ------- + iterable of iterable of (int, float) + The corpus in BoW format. 
+ + """ logger.info("worker #%i requesting a new job", worker_id) job = self.jobs.get(block=True, timeout=1) - logger.info("worker #%i got a new job (%i left)", worker_id, self.jobs.qsize()) + logger.info("worker #%i got a new job (%i left)", worker_id, + self.jobs.qsize()) return job @Pyro4.expose def putjob(self, job): + """Atomically add a job to the queue. + + Parameters + ---------- + job : iterable of iterable of (int, float) + The corpus in BoW format. + + """ self._jobsreceived += 1 self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT) - logger.info("added a new job (len(queue)=%i items)", self.jobs.qsize()) + logger.info("added a new job (len(queue)=%i items)", + self.jobs.qsize()) @Pyro4.expose def getstate(self): """ Merge states from across all workers and return the result. + + Returns + ------- + :class:`~gensim.models.ldamodel.LdaState` + Merged resultant state + """ logger.info("end of input, assigning all remaining jobs") - logger.debug("jobs done: %s, jobs received: %s", self._jobsdone, self._jobsreceived) + logger.debug("jobs done: %s, jobs received: %s", + self._jobsdone, self._jobsreceived) i = 0 count = 10 while self._jobsdone < self._jobsreceived: @@ -144,8 +282,14 @@ def getstate(self): @Pyro4.expose def reset(self, state): - """ - Initialize all workers for a new EM iterations. + """Reinitializes all workers for a new EM iteration. + + Parameters + ---------- + state : :class:`~gensim.models.ldamodel.LdaState` + Encapsulates information for distributed computation + of LdaModel objects. + """ for workerid, worker in iteritems(self.workers): logger.info("resetting worker %s", workerid) @@ -158,54 +302,72 @@ def reset(self, state): @Pyro4.oneway @utils.synchronous('lock_update') def jobdone(self, workerid): - """ - A worker has finished its job. Log this event and then asynchronously - transfer control back to the worker. + """Workers use callback to notify when their job is done. 
+ + The job done event is logged and then control is asynchronously + transfered back to the worker(who can then request another job). + In this way, control flow basically oscillates between + :meth:`gensim.models.lda_dispatcher.Dispatcher.jobdone` and + :meth:`gensim.models.lda_worker.Worker.requestjob`. + + Parameters + ---------- + workerid : int + The ID of the worker that finished the job (used for logging). - In this way, control flow basically oscillates between `dispatcher.jobdone()` - and `worker.requestjob()`. """ self._jobsdone += 1 logger.info("worker #%s finished job #%i", workerid, self._jobsdone) - self.workers[workerid].requestjob() # tell the worker to ask for another job, asynchronously (one-way) + self.workers[workerid].requestjob() # tell the worker to ask for + # another job, asynchronously (one-way) def jobsdone(self): - """Wrap self._jobsdone, needed for remote access through Pyro proxies""" + """Wrap :attr:`~gensim.models.lda_dispatcher.Dispatcher._jobsdone` + ,needed for remote access through proxies. + + Returns + ------- + int + Number of jobs already completed. + + """ return self._jobsdone @Pyro4.oneway def exit(self): - """ - Terminate all registered workers and then the dispatcher. 
- """ + """Terminate all registered workers and then the dispatcher.""" for workerid, worker in iteritems(self.workers): logger.info("terminating worker %s", workerid) worker.exit() logger.info("terminating dispatcher") - os._exit(0) # exit the whole process (not just this thread ala sys.exit()) + os._exit(0) # exit the whole process (not just this thread ala + # sys.exit()) # endclass Dispatcher def main(): + """Set up argument parser,logger and launches pyro daemon.""" parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument( - "--maxsize", - help="How many jobs (=chunks of N documents) to keep 'pre-fetched' in a queue (default: %(default)s)", - type=int, default=MAX_JOBS_QUEUE - ) - parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None) - parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int) - parser.add_argument("--no-broadcast", help="Disable broadcast (default: %(default)s)", - action='store_const', default=True, const=False) - parser.add_argument("--hmac", help="Nameserver hmac key (default: %(default)s)", default=None) - parser.add_argument( - '-v', '--verbose', - help='Verbose flag', - action='store_const', dest="loglevel", const=logging.INFO, default=logging.WARNING - ) + parser.add_argument("--maxsize", help="How many jobs (=chunks of N " + "documents) to keep 'pre-fetched' in a queue " + "(default: %(default)s)", type=int, + default=MAX_JOBS_QUEUE) + parser.add_argument("--host", help="Nameserver hostname (default: " + "%(default)s)", default=None) + parser.add_argument("--port", help="Nameserver port (default: " + "%(default)s)", default=None, type=int) + parser.add_argument("--no-broadcast", help="Disable broadcast (default" + ": %(default)s)", action='store_const', default=True, + const=False) + parser.add_argument("--hmac", help="Nameserver hmac key (default: " + "%(default)s)", default=None) + parser.add_argument('-v', '--verbose', 
help='Verbose flag', + action='store_const', dest="loglevel", + const=logging.INFO, default=logging.WARNING) args = parser.parse_args() - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=args.loglevel) + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', + level=args.loglevel) logger.info("running %s", " ".join(sys.argv)) ns_conf = { @@ -214,7 +376,9 @@ def main(): "port": args.port, "hmac_key": args.hmac } - utils.pyro_daemon(LDA_DISPATCHER_PREFIX, Dispatcher(maxsize=args.maxsize, ns_conf=ns_conf), ns_conf=ns_conf) + utils.pyro_daemon(LDA_DISPATCHER_PREFIX, Dispatcher( + maxsize=args.maxsize, ns_conf=ns_conf), + ns_conf=ns_conf) logger.info("finished running %s", " ".join(sys.argv)) From b3fd8c89e773c1366a516dbafc0423d11cc12541 Mon Sep 17 00:00:00 2001 From: gyanesh-m Date: Tue, 20 Feb 2018 19:40:08 +0530 Subject: [PATCH 04/16] Fix multi-line docstring description problem --- gensim/models/lda_dispatcher.py | 22 +++++++++++----------- 1 file changed, 11 insertions(+), 11 deletions(-) diff --git a/gensim/models/lda_dispatcher.py b/gensim/models/lda_dispatcher.py index 90242f31d4..0644a695ca 100644 --- a/gensim/models/lda_dispatcher.py +++ b/gensim/models/lda_dispatcher.py @@ -111,13 +111,13 @@ class Dispatcher(object): Attributes ---------- callback : :class: `~Pyro4.core.Proxy` - A proxy for some remote object.Intercepts method calls and + A proxy for some remote object.Intercepts method calls and \ dispatches them to the remote object. jobs : :class: `~Queue.Queue` Constructs a FIFO queue. lock_update : :class: `~threading.Lock` - This class implements primitive lock objects. Once a thread has - acquired a lock, subsequent attempts to acquire it block, until it is + This class implements primitive lock objects. Once a thread has \ + acquired a lock, subsequent attempts to acquire it block, until it is \ released; any thread may release it. 
Warnings @@ -137,14 +137,14 @@ def __init__(self, maxsize=MAX_JOBS_QUEUE, ns_conf=None): ---------- maxsize : int, optional Maximum number of jobs to be kept pre-fetched in the queue. - ns_conf : dict of {str:(str,optional),str:(int,optional), + ns_conf : dict of {str:(str,optional),str:(int,optional), \ str:(bool:optional),str:(str,optional)},optional - Sets up the name server configuration for the pyro daemon server - of dispatcher.This also helps to keep track of your objects in - your netword by using logical object names instead of exact + Sets up the name server configuration for the pyro daemon server \ + of dispatcher.This also helps to keep track of your objects in \ + your netword by using logical object names instead of exact \ object name(or id) and its location. workers : dict of { int : :class: `~Pyro4.core.Proxy` } - Locates all available workers and store their proxies, for + Locates all available workers and store their proxies, for \ subsequent RMI calls. } @@ -166,7 +166,7 @@ def initialize(self, **model_params): Raises ------ RuntimeError - Description + No workers found.Need to have atleast one worker running. """ self.jobs = Queue(maxsize=self.maxsize) @@ -322,8 +322,8 @@ def jobdone(self, workerid): # another job, asynchronously (one-way) def jobsdone(self): - """Wrap :attr:`~gensim.models.lda_dispatcher.Dispatcher._jobsdone` - ,needed for remote access through proxies. + """Wrap :attr:`~gensim.models.lda_dispatcher.Dispatcher._jobsdone`, + needed for remote access through proxies. 
Returns ------- From e833ede80e1d12db6a97b59a94d8675e98fe3bf3 Mon Sep 17 00:00:00 2001 From: gyanesh-m Date: Wed, 21 Feb 2018 15:11:15 +0530 Subject: [PATCH 05/16] Fix indentation and char length issue --- gensim/models/lda_dispatcher.py | 89 +++++++++++++-------------------- gensim/models/lda_worker.py | 52 +++++++------------ 2 files changed, 53 insertions(+), 88 deletions(-) diff --git a/gensim/models/lda_dispatcher.py b/gensim/models/lda_dispatcher.py index 0644a695ca..d10f9a0095 100644 --- a/gensim/models/lda_dispatcher.py +++ b/gensim/models/lda_dispatcher.py @@ -91,10 +91,9 @@ # How many jobs (=chunks of N documents) to keep "pre-fetched" in a queue? -# A small number is usually enough, unless iteration over the corpus is -# very very slow (slower than the actual computation of LDA), in which case -# you can override this value from command line. ie. -# run "python ./lda_dispatcher.py 100" +# A small number is usually enough, unless iteration over the corpus is very very +# slow (slower than the actual computation of LDA), in which case you can override +# this value from command line. ie. run "python ./lda_dispatcher.py 100" MAX_JOBS_QUEUE = 10 # timeout for the Queue object put/get blocking methods. @@ -129,8 +128,7 @@ class Dispatcher(object): def __init__(self, maxsize=MAX_JOBS_QUEUE, ns_conf=None): """Partly initializes the dispatcher. - A full initialization (including initialization of the workers) - requires a call to + A full initialization (including initialization of the workers) requires a call to :meth:`~gensim.models.lda_dispatcher.Dispatcher.initialize` Parameters @@ -144,9 +142,7 @@ def __init__(self, maxsize=MAX_JOBS_QUEUE, ns_conf=None): your netword by using logical object names instead of exact \ object name(or id) and its location. workers : dict of { int : :class: `~Pyro4.core.Proxy` } - Locates all available workers and store their proxies, for \ - subsequent RMI calls. 
- } + Locates all available workers and store their proxies, for subsequent RMI calls. """ self.maxsize = maxsize @@ -176,27 +172,21 @@ def initialize(self, **model_params): self.workers = {} with utils.getNS(**self.ns_conf) as ns: - self.callback = Pyro4.Proxy(ns.list( - prefix=LDA_DISPATCHER_PREFIX) - [LDA_DISPATCHER_PREFIX]) + self.callback = Pyro4.Proxy(ns.list(prefix=LDA_DISPATCHER_PREFIX)[LDA_DISPATCHER_PREFIX]) for name, uri in iteritems(ns.list(prefix=LDA_WORKER_PREFIX)): try: worker = Pyro4.Proxy(uri) workerid = len(self.workers) # make time consuming methods work asynchronously - logger.info("registering worker #%i at %s", workerid, - uri) - worker.initialize(workerid, dispatcher=self.callback, - **model_params) + logger.info("registering worker #%i at %s", workerid, uri) + worker.initialize(workerid, dispatcher=self.callback, **model_params) self.workers[workerid] = worker except Pyro4.errors.PyroError: - logger.warning("unresponsive worker at %s,deleting it" - " from the name server", uri) + logger.warning("unresponsive worker at %s,deleting it from the name server", uri) ns.remove(name) if not self.workers: - raise RuntimeError('no workers found; run some lda_worker ' - 'scripts on your machines first!') + raise RuntimeError('no workers found; run some lda_worker scripts on your machines first!') @Pyro4.expose def getworkers(self): @@ -227,8 +217,7 @@ def getjob(self, worker_id): """ logger.info("worker #%i requesting a new job", worker_id) job = self.jobs.get(block=True, timeout=1) - logger.info("worker #%i got a new job (%i left)", worker_id, - self.jobs.qsize()) + logger.info("worker #%i got a new job (%i left)", worker_id, self.jobs.qsize()) return job @Pyro4.expose @@ -243,13 +232,11 @@ def putjob(self, job): """ self._jobsreceived += 1 self.jobs.put(job, block=True, timeout=HUGE_TIMEOUT) - logger.info("added a new job (len(queue)=%i items)", - self.jobs.qsize()) + logger.info("added a new job (len(queue)=%i items)", self.jobs.qsize()) 
@Pyro4.expose def getstate(self): - """ - Merge states from across all workers and return the result. + """Merge states from across all workers and return the result. Returns ------- @@ -258,8 +245,7 @@ def getstate(self): """ logger.info("end of input, assigning all remaining jobs") - logger.debug("jobs done: %s, jobs received: %s", - self._jobsdone, self._jobsreceived) + logger.debug("jobs done: %s, jobs received: %s", self._jobsdone, self._jobsreceived) i = 0 count = 10 while self._jobsdone < self._jobsreceived: @@ -318,12 +304,10 @@ def jobdone(self, workerid): """ self._jobsdone += 1 logger.info("worker #%s finished job #%i", workerid, self._jobsdone) - self.workers[workerid].requestjob() # tell the worker to ask for - # another job, asynchronously (one-way) + self.workers[workerid].requestjob() # tell the worker to ask for another job, asynchronously (one-way) def jobsdone(self): - """Wrap :attr:`~gensim.models.lda_dispatcher.Dispatcher._jobsdone`, - needed for remote access through proxies. + """Wrap :attr:`~gensim.models.lda_dispatcher.Dispatcher._jobsdone` needed for remote access through proxies. 
Returns ------- @@ -340,34 +324,31 @@ def exit(self): logger.info("terminating worker %s", workerid) worker.exit() logger.info("terminating dispatcher") - os._exit(0) # exit the whole process (not just this thread ala - # sys.exit()) + os._exit(0) # exit the whole process (not just this thread ala sys.exit()) # endclass Dispatcher def main(): """Set up argument parser,logger and launches pyro daemon.""" parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--maxsize", help="How many jobs (=chunks of N " - "documents) to keep 'pre-fetched' in a queue " - "(default: %(default)s)", type=int, - default=MAX_JOBS_QUEUE) - parser.add_argument("--host", help="Nameserver hostname (default: " - "%(default)s)", default=None) - parser.add_argument("--port", help="Nameserver port (default: " - "%(default)s)", default=None, type=int) - parser.add_argument("--no-broadcast", help="Disable broadcast (default" - ": %(default)s)", action='store_const', default=True, - const=False) - parser.add_argument("--hmac", help="Nameserver hmac key (default: " - "%(default)s)", default=None) - parser.add_argument('-v', '--verbose', help='Verbose flag', - action='store_const', dest="loglevel", - const=logging.INFO, default=logging.WARNING) + parser.add_argument( + "--maxsize", + help="How many jobs (=chunks of N documents) to keep 'pre-fetched' in a queue (default: %(default)s)", + type=int, default=MAX_JOBS_QUEUE + ) + parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None) + parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int) + parser.add_argument("--no-broadcast", help="Disable broadcast (default: %(default)s)", + action='store_const', default=True, const=False) + parser.add_argument("--hmac", help="Nameserver hmac key (default: %(default)s)", default=None) + parser.add_argument( + '-v', '--verbose', + help='Verbose flag', + action='store_const', dest="loglevel", const=logging.INFO, 
default=logging.WARNING + ) args = parser.parse_args() - logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', - level=args.loglevel) + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=args.loglevel) logger.info("running %s", " ".join(sys.argv)) ns_conf = { @@ -376,9 +357,7 @@ def main(): "port": args.port, "hmac_key": args.hmac } - utils.pyro_daemon(LDA_DISPATCHER_PREFIX, Dispatcher( - maxsize=args.maxsize, ns_conf=ns_conf), - ns_conf=ns_conf) + utils.pyro_daemon(LDA_DISPATCHER_PREFIX, Dispatcher(maxsize=args.maxsize, ns_conf=ns_conf), ns_conf=ns_conf) logger.info("finished running %s", " ".join(sys.argv)) diff --git a/gensim/models/lda_worker.py b/gensim/models/lda_worker.py index 8ba3084ea4..09b40ccb07 100644 --- a/gensim/models/lda_worker.py +++ b/gensim/models/lda_worker.py @@ -83,8 +83,7 @@ logger = logging.getLogger('gensim.models.lda_worker') -# periodically save intermediate models after every SAVE_DEBUG updates -# (0 for never) +# periodically save intermediate models after every SAVE_DEBUG updates (0 for never) SAVE_DEBUG = 0 LDA_WORKER_PREFIX = 'gensim.lda_worker' @@ -93,8 +92,7 @@ class Worker(object): """Used as a Pyro class with exposed methods. - Exposes every non-private method and property of the class automatically - to be available for remote access. + Exposes every non-private method and property of the class automatically to be available for remote access. Attributes ---------- @@ -117,14 +115,12 @@ def initialize(self, myid, dispatcher, **model_params): dispatcher : :class:`~gensim.models.lda_dispatcher.Dispatcher` The dispatcher responsible for scheduling this worker. **model_params - Keyword parameters to initialize the inner LDA model, - see :class:`~gensim.models.ldamodel.LdaModel`. + Keyword parameters to initialize the inner LDA model,see :class:`~gensim.models.ldamodel.LdaModel`. """ self.lock_update = threading.Lock() self.jobsdone = 0 # how many jobs has this worker completed? 
- # id of this worker in the dispatcher; - # just a convenience var for easy access/logging TODO remove? + # id of this worker in the dispatcher; just a convenience var for easy access/logging TODO remove? self.myid = myid self.dispatcher = dispatcher self.finished = False @@ -134,8 +130,7 @@ def initialize(self, myid, dispatcher, **model_params): @Pyro4.expose @Pyro4.oneway def requestjob(self): - """Request jobs from the dispatcher, in a perpetual loop - until `getstate()` is called. + """Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called. Raises ------ @@ -144,8 +139,7 @@ def requestjob(self): """ if self.model is None: - raise RuntimeError("worker must be initialized before \ - receiving jobs") + raise RuntimeError("worker must be initialized before receiving jobs") job = None while job is None and not self.finished: @@ -155,8 +149,7 @@ def requestjob(self): # no new job: try again, unless we're finished with all work continue if job is not None: - logger.info("worker #%s received job #%i", - self.myid, self.jobsdone) + logger.info("worker #%s received job #%i", self.myid, self.jobsdone) self.processjob(job) self.dispatcher.jobdone(self.myid) else: @@ -197,8 +190,7 @@ def getstate(self): The current state. """ - logger.info("worker #%i returning its state after %s jobs", - self.myid, self.jobsdone) + logger.info("worker #%i returning its state after %s jobs", self.myid, self.jobsdone) result = self.model.state assert isinstance(result, ldamodel.LdaState) self.model.clear() # free up mem in-between two EM cycles @@ -213,8 +205,7 @@ def reset(self, state): Parameters ---------- state : :class:`~gensim.models.ldamodel.LdaState` - Encapsulates information for distributed computation - of LdaModel objects. + Encapsulates information for distributed computation of LdaModel objects. 
""" assert state is not None @@ -234,24 +225,20 @@ def exit(self): def main(): """Set up argument parser,logger and launches pyro daemon.""" parser = argparse.ArgumentParser(description=__doc__) - parser.add_argument("--host", help="Nameserver hostname \ - (default:%(default)s)", default=None) - parser.add_argument("--port", help="Nameserver port \ - (default: %(default)s)", default=None, type=int) + parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None) + parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int) parser.add_argument( - "--no-broadcast", help="Disable broadcast \ - (default: %(default)s)", action='store_const', - default=True, const=False) - parser.add_argument("--hmac", help="Nameserver hmac key \ - (default: %(default)s)", default=None) + "--no-broadcast", help="Disable broadcast (default: %(default)s)", action='store_const', + default=True, const=False + ) + parser.add_argument("--hmac", help="Nameserver hmac key (default: %(default)s)", default=None) parser.add_argument( - '-v', '--verbose', help='Verbose flag', action='store_const', - dest="loglevel", const=logging.INFO, default=logging.WARNING + '-v', '--verbose', help='Verbose flag', action='store_const', dest="loglevel", + const=logging.INFO, default=logging.WARNING ) args = parser.parse_args() - logging.basicConfig(format='%(asctime)s : %(levelname)s\ - : %(message)s', level=args.loglevel) + logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=args.loglevel) logger.info("running %s", " ".join(sys.argv)) ns_conf = { @@ -260,8 +247,7 @@ def main(): "port": args.port, "hmac_key": args.hmac } - utils.pyro_daemon(LDA_WORKER_PREFIX, Worker(), - random_suffix=True, ns_conf=ns_conf) + utils.pyro_daemon(LDA_WORKER_PREFIX, Worker(), random_suffix=True, ns_conf=ns_conf) logger.info("finished running %s", " ".join(sys.argv)) From 4f58ac8ff57b4c366300fc75667582734884b923 Mon Sep 17 00:00:00 2001 
From: gyanesh-m Date: Sat, 24 Feb 2018 03:27:59 +0530 Subject: [PATCH 06/16] Fix docstrings for hdpmodel.py --- gensim/models/hdpmodel.py | 562 +++++++++++++++++++++++++++++++++----- 1 file changed, 495 insertions(+), 67 deletions(-) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 09239fa605..b6674e5e14 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -10,8 +10,7 @@ # -""" -This module encapsulates functionality for the online Hierarchical Dirichlet Process algorithm. +"""This module encapsulates functionality for the online Hierarchical Dirichlet Process algorithm. It allows both model estimation from a training corpus and inference of topic distribution on new, unseen documents. @@ -29,6 +28,29 @@ * runs in **constant memory** w.r.t. the number of documents: size of the training corpus does not affect memory footprint +How to use :class:`~gensim.models.hdpmodel.HdpModel` +---------------------------------------------------------------- + + +#. Run :class:`~gensim.models.hdpmodel.HdpModel` :: + + >>> from gensim.test.utils import common_corpus,common_dictionary + >>> from gensim.models import hdpmodel + >>> + >>> hdp = HdpModel(common_corpus, common_dictionary) + +#. You can then infer topic distributions on new, unseen documents, with + + >>> doc_hdp = hdp[doc_bow] + +#. To print 20 topics with top 10 most probable words. + + >>> hdp.print_topics(num_topics=20, num_words=10) + +#. The model can be updated (trained) with new documents via + + >>> hdp.update(other_corpus) + """ from __future__ import with_statement @@ -52,8 +74,18 @@ def expect_log_sticks(sticks): - """ - For stick-breaking hdp, return the E[log(sticks)] + """For stick-breaking hdp, return the E[log(sticks)]. + + Parameters + ---------- + sticks : numpy.ndarray + Array of values for stick. + + Returns + ------- + numpy.ndarray + Computed Elogsticks value. 
+ """ dig_sum = psi(np.sum(sticks, 0)) ElogW = psi(sticks[0]) - dig_sum @@ -67,6 +99,27 @@ def expect_log_sticks(sticks): def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100): + """Performs EM-iteration on a single document for calculation of likelihood for a maximum iteration of `max_iter`. + + Parameters + ---------- + doc_word_ids : int + Id of corresponding words in a document. + doc_word_counts : int + Count of words in a single document. + alpha : numpy.ndarray + Lda equivalent value of alpha. + beta : numpy.ndarray + Lda equivalent value of beta. + max_iter : int, optional + Maximum number of times the expectation will be maximised. + + Returns + ------- + tuple of numpy.ndarrays + Returns a tuple of (likelihood,gamma). + + """ gamma = np.ones(len(alpha)) expElogtheta = np.exp(dirichlet_expectation(gamma)) betad = beta[:, doc_word_ids] @@ -92,34 +145,67 @@ def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100): class SuffStats(object): + """Stores suff stats for document(s).""" def __init__(self, T, Wt, Dt): + """Initialises the suff stats for document(s) in the corpus. + + Parameters + ---------- + T : int + Top level truncation level. + Wt : int + Length of words in the documents. + Dt : int + chunk size. + + """ self.m_chunksize = Dt self.m_var_sticks_ss = np.zeros(T) self.m_var_beta_ss = np.zeros((T, Wt)) def set_zero(self): + """Fill the sticks and beta array with 0 scalar value.""" self.m_var_sticks_ss.fill(0.0) self.m_var_beta_ss.fill(0.0) class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): - """ - The constructor estimates Hierachical Dirichlet Process model parameters based - on a training corpus: - - >>> hdp = HdpModel(corpus, id2word) - - You can infer topic distributions on new, unseen documents with - - >>> doc_hdp = hdp[doc_bow] - - Inference on new documents is based on the approximately LDA-equivalent topics. 
- - To print 20 topics with top 10 most probable words - - >>> hdp.print_topics(num_topics=20, num_words=10) - - Model persistency is achieved through its `load`/`save` methods. + """The constructor estimates Hierachical Dirichlet Process model parameters based on a training corpus. + + Attributes + ---------- + lda_alpha : numpy.ndarray + Lda equivalent value of alpha. + lda_beta : numpy.ndarray + Lda equivalent value of beta. + m_D : int + Number of documents in the corpus. + m_Elogbeta : numpy.ndarray: + Stores value of dirchlet excpectation, i.e., Computed + :math:`E[log \\theta]` for a vector :math:`\\theta \sim Dir(\\alpha)`. + m_lambda : numpy.ndarray or scalar + Drawn samples from the parameterized gamma distribution. + m_lambda_sum : numpy.ndarray or scalar + An array with the same shape as m_lambda, with the specified axis (1) removed. + m_num_docs_processed : int + Number of documents finished processing.This is incremented in size of chunks. + m_r : list + Acts as normaliser in lazy updation of lambda attribute. + m_rhot : float + Assigns weight to the information obtained from the mini-chunk and its value it between 0 and 1. + m_status_up_to_date : bool + Flag to indicate whether lambda and Elogbeta have been updated(T) or not(F). + m_timestamp : numpy.ndarray + Helps to keep track and perform lazy updates on lambda. + m_updatect : int + Keeps track of current time and is incremented everytime + :meth:`~gensim.models.hdpmodel.HdpModel.update_lambda()` is called. + m_var_sticks : numpy.ndarray + Array of values for stick. + m_varphi_ss : numpy.ndarray + Used to Update top level sticks. + m_W : int + Length of dictionary for the input corpus. 
""" @@ -127,18 +213,46 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None, random_state=None): - """ - `gamma`: first level concentration - `alpha`: second level concentration - `eta`: the topic Dirichlet - `T`: top level truncation level - `K`: second level truncation level - `kappa`: learning rate - `tau`: slow down parameter - `max_time`: stop training after this many seconds - `max_chunks`: stop after having processed this many chunks (wrap around - corpus beginning in another corpus pass, if there are not enough chunks - in the corpus) + """Fully initialises the hdp model. + + Parameters + ---------- + corpus : list of list of tuple of ints; [ [ (int,int) ]] + Corpus of input dataset on which the model will be trained. + id2word : :class:`~gensim.corpora.dictionary.Dictionary` + Dictionary for the input corpus. + max_chunks : None, optional + Upper bound on how many chunks to process.It wraps around corpus beginning in another corpus pass, + if there are not enough chunks in the corpus + max_time : None, optional + Upper bound on time(in seconds) for which model will be trained. + chunksize : int, optional + Tells the number of documents to process at a time. + kappa : float, optional + Learning rate + tau : float, optional + Slow down parameter + K : int, optional + Second level truncation level + T : int, optional + Top level truncation level + alpha : int, optional + Second level concentration + gamma : int, optional + First level concentration + eta : float, optional + The topic Dirichlet + scale : float, optional + Weights information from the mini-chunk of corpus to calculate rhot. + var_converge : float, optional + Lower bound on the right side of convergence. Used when updating variational parameters for a + single document. 
+ outputdir : str, optional + Stores topic and options information in the specified directory. + random_state : :class:`~np.random.RandomState`, optional + Adds a little random jitter to randomize results around same alpha when trying to fetch a closest + corrsponding lda model from :meth:`~gensim.models.hdpmodel.HdpModel.suggested_lda_model()` + """ self.corpus = corpus self.id2word = id2word @@ -192,6 +306,24 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None, self.update(corpus) def inference(self, chunk): + """Infers the gamma value on a trained corpus. + + Parameters + ---------- + chunk : list of tuple of ints; [ [ (int,int) ]] + Bag of words representation for a corpus. + + Returns + ------- + numpy.ndarray + gamma value. + + Raises + ------ + RuntimeError + Need to train model first to do inference. + + """ if self.lda_alpha is None or self.lda_beta is None: raise RuntimeError("model must be trained to perform inference") chunk = list(chunk) @@ -208,6 +340,20 @@ def inference(self, chunk): return gamma def __getitem__(self, bow, eps=0.01): + """Accessor method for generating topic distribution of given document. + + Parameters + ---------- + bow : sequence of list of tuple of ints; [ (int,int) ] + Bag-of-words representation of the document to get topics for. + eps : float, optional + Ignore topics with probability below `eps`. + + Returns + ------- + topic distribution for the given document `bow`, as a list of `(topic_id, topic_probability)` 2-tuples. + + """ is_corpus, corpus = utils.is_corpus(bow) if is_corpus: return self._apply(corpus) @@ -217,6 +363,15 @@ def __getitem__(self, bow, eps=0.01): return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) if topicvalue >= eps] def update(self, corpus): + """Train the model with new documents, by EM-iterating over `corpus` until + any of the conditions is satisfied(time limit expired,chunk limit reached or whole corpus processed ). 
+ + Parameters + ---------- + corpus : list of list of tuple of ints; [ [ (int,int) ]] + The corpus on which Hdp model will be updated. + + """ save_freq = max(1, int(10000 / self.chunksize)) # save every 10k docs, roughly chunks_processed = 0 start_time = time.clock() @@ -244,6 +399,25 @@ def update(self, corpus): logger.info('PROGRESS: finished document %i of %i', self.m_num_docs_processed, self.m_D) def update_finished(self, start_time, chunks_processed, docs_processed): + """Flag to determine whether the Hdp model has been updated with the new corpus or not. + + Parameters + ---------- + start_time : float + Indicates the current processor time as a floating point number expressed in seconds. The resolution is + typically better on Windows than on Unix by one microsecond due to differing implementation of underlying + function calls. + chunks_processed : int + Indicates progress of the update in terms of the number of chunks processed. + docs_processed : int + Indicates number of documents finished processing.This is incremented in size of chunks. + + Returns + ------- + bool + True if Hdp model is updated, False otherwise. + + """ return ( # chunk limit reached (self.max_chunks and chunks_processed == self.max_chunks) or @@ -255,6 +429,24 @@ def update_finished(self, start_time, chunks_processed, docs_processed): (not self.max_chunks and not self.max_time and docs_processed >= self.m_D)) def update_chunk(self, chunk, update=True, opt_o=True): + """Performs lazy update on necessary columns of lambda and variational inference for documents in the chunk. + + Parameters + ---------- + chunk : list of list of tuple of ints; [ [ (int,int) ]] + The chunk of corpus on which Hdp model will be updated. + update : bool, optional + Flag to determine whether to update lambda(T) or not (F). + opt_o : bool, optional + Passed as argument to :meth:`~gensim.models.hdpmodel.HdpModel.update_lambda()` to determine whether + the topics need to be ordered(T) or not(F). 
+ + Returns + ------- + tuple of (float,int) + A tuple of likelihood and sum of all the word counts from each document in the corpus. + + """ # Find the unique words in this chunk... unique_words = dict() word_list = [] @@ -297,8 +489,29 @@ def update_chunk(self, chunk, update=True, opt_o=True): return score, count def doc_e_step(self, ss, Elogsticks_1st, unique_words, doc_word_ids, doc_word_counts, var_converge): - """ - e step for a single doc + """Performs e step for a single doc. + + Parameters + ---------- + ss : :class:`~gensim.models.hdpmodel.SuffStats` + Suffstats for all document(s) in the chunk. + Elogsticks_1st : numpy.ndarray + Computed Elogsticks value by stick-breaking process. + unique_words : int + Number of unique words in the chunk. + doc_word_ids : tuple of int + Word ids of for a single document. + doc_word_counts : tuple of int + Word counts of all words in a single document. + var_converge : float, optional + Lower bound on the right side of convergence. Used when updating variational parameters for a + single document. + + Returns + ------- + float + Computed value of likelihood for a single document. + """ chunkids = [unique_words[id] for id in doc_word_ids] @@ -382,6 +595,19 @@ def doc_e_step(self, ss, Elogsticks_1st, unique_words, doc_word_ids, doc_word_co return likelihood def update_lambda(self, sstats, word_list, opt_o): + """Updates appropriate columns of lambda and top level sticks based on documents. + + Parameters + ---------- + sstats : :class:`~gensim.models.hdpmodel.SuffStats` + Suffstats for all document(s) in the chunk. + word_list : list of int + Contains word id of all the unique words in the chunk of documents on which update is being performed. + opt_o : bool, optional + Flag to determine whether to invoke a call to :meth:`~gensim.models.hdpmodel.HdpModel.optimal_ordering()`. + This decides whether the topics need to be ordered(T) or not(F). 
+ + """ self.m_status_up_to_date = False # rhot will be between 0 and 1, and says how much to weight # the information we got from this mini-chunk. @@ -412,9 +638,7 @@ def update_lambda(self, sstats, word_list, opt_o): self.m_var_sticks[1] = np.flipud(np.cumsum(var_phi_sum)) + self.m_gamma def optimal_ordering(self): - """ - ordering the topics - """ + """Performs ordering on the topics.""" idx = matutils.argsort(self.m_lambda_sum, reverse=True) self.m_varphi_ss = self.m_varphi_ss[idx] self.m_lambda = self.m_lambda[idx, :] @@ -422,12 +646,10 @@ def optimal_ordering(self): self.m_Elogbeta = self.m_Elogbeta[idx, :] def update_expectations(self): - """ - Since we're doing lazy updates on lambda, at any given moment - the current state of lambda may not be accurate. This function - updates all of the elements of lambda and Elogbeta - so that if (for example) we want to print out the - topics we've learned we'll get the correct behavior. + """Since we're doing lazy updates on lambda, at any given moment the current state of lambda may not be + accurate. This function updates all of the elements of lambda and Elogbeta so that if (for example) we want to + print out the topics we've learned we'll get the correct behavior. + """ for w in xrange(self.m_W): self.m_lambda[:, w] *= np.exp(self.m_r[-1] - self.m_r[self.m_timestamp[w]]) @@ -438,11 +660,28 @@ def update_expectations(self): self.m_status_up_to_date = True def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=None): - """ - Print the `num_words` most probable words for topic `topic_id`. - - Set `formatted=True` to return the topics as a list of strings, or - `False` as lists of (weight, word) pairs. + """Print the `num_words` most probable words for topic `topic_id`. + + Parameters + ---------- + topic_id : int + Acts as a representative index for a particular topic. + topn : int, optional + Number of most probable words to show from given `topic_id`. 
+ log : bool, optional + Logs a message with level INFO on the logger object. + formatted : bool, optional + Flag to determine whether to return the topics as a list of strings(T), or as lists of + (weight, word) pairs(F). + num_words : int, optional + Number of most probable words to show from given `topic_id`. + + .. note:: The parameter `num_words` is deprecated, will be removed in 4.0.0, please use `topn` instead. + + Returns + ------- + list of tuple of (unicode,numpy.float64) or list of str + Topic terms output displayed whose format depends on `formatted` parameter. """ if num_words is not None: # deprecated num_words is used @@ -458,21 +697,38 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No return hdp_formatter.show_topic(topic_id, topn, log, formatted) def get_topics(self): - """ - Returns: - np.ndarray: `num_topics` x `vocabulary_size` array of floats which represents - the term topic matrix learned during inference. + """Returns the term topic matrix learned during inference. + + Returns + ------- + np.ndarray + `num_topics` x `vocabulary_size` array of floats + """ topics = self.m_lambda + self.m_eta return topics / topics.sum(axis=1)[:, None] def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): - """ - Print the `num_words` most probable words for `num_topics` number of topics. - Set `num_topics=-1` to print all topics. - - Set `formatted=True` to return the topics as a list of strings, or - `False` as lists of (weight, word) pairs. + """Print the `num_words` most probable words for `num_topics` number of topics. + Set `num_topics=-1` to print all topics.Set `formatted=True` to return the topics as a list of strings, or + `False` as lists of (word, weight) pairs. + + Parameters + ---------- + num_topics : int, optional + Number of topics for which most probable `num_words` words will be fetched. 
+ num_words : int, optional + Number of most probable words to show from `num_topics` number of topics. + log : bool, optional + Logs a message with level INFO on the logger object. + formatted : bool, optional + Flag to determine whether to return the topics as a list of strings(T), or as lists of + (word, weight) pairs(F). + + Returns + ------- + list of tuple of (unicode,numpy.float64) or list of str + Output format for topic terms depends on the value of `formatted` parameter. """ if not self.m_status_up_to_date: @@ -482,7 +738,16 @@ def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): return hdp_formatter.show_topics(num_topics, num_words, log, formatted) def save_topics(self, doc_count=None): - """legacy method; use `self.save()` instead""" + """Saves all the topics discovered. + + .. note:: This is a legacy method; use `self.save()` instead. + + Parameters + ---------- + doc_count : int, optional + Indicates number of documents finished processing and are to be saved. + + """ if not self.outputdir: logger.error("cannot store topics without having specified an output directory") @@ -496,7 +761,11 @@ def save_topics(self, doc_count=None): np.savetxt(fname, betas) def save_options(self): - """legacy method; use `self.save()` instead""" + """Writes all the values of the attributes for the current model in options.dat file. + + .. note:: This is a legacy method; use `self.save()` instead. + + """ if not self.outputdir: logger.error("cannot store options without having specified an output directory") return @@ -515,8 +784,13 @@ def save_options(self): fout.write('gamma: %s\n' % str(self.m_gamma)) def hdp_to_lda(self): - """ - Compute the LDA almost equivalent HDP. + """Only returns corresponding alpha, beta values of a LDA almost equivalent to current HDP. + + Returns + ------- + tuple of numpy.ndarray + Tuple of numpy arrays of alpha and beta. 
+ """ # alpha sticks = self.m_var_sticks[0] / (self.m_var_sticks[0] + self.m_var_sticks[1]) @@ -534,10 +808,14 @@ def hdp_to_lda(self): return alpha, beta def suggested_lda_model(self): - """ - Returns closest corresponding ldamodel object corresponding to current hdp model. - The hdp_to_lda method only returns corresponding alpha, beta values, and this method returns a trained ldamodel. - The num_topics is m_T (default is 150) so as to preserve the matrice shapes when we assign alpha and beta. + """Returns a trained ldamodel object which is closest to the current hdp model.The num_topics is m_T + (default is 150) so as to preserve the matrice shapes when we assign alpha and beta. + + Returns + ------- + :class:`~gensim.models.ldamodel.LdaModel` + Closest corresponding LdaModel to current HdpModel. + """ alpha, beta = self.hdp_to_lda() ldam = ldamodel.LdaModel( @@ -547,6 +825,19 @@ def suggested_lda_model(self): return ldam def evaluate_test_corpus(self, corpus): + """Evaluates the model on test corpus. + + Parameters + ---------- + corpus : list of list of tuple of ints; [ [ (int,int) ]] + The corpus on which Hdp model will be tested. + + Returns + ------- + float + The value of total likelihood obtained by evaluating the model for all documents in the test corpus. + + """ logger.info('TEST: evaluating test corpus') if self.lda_alpha is None or self.lda_beta is None: self.lda_alpha, self.lda_beta = self.hdp_to_lda() @@ -571,9 +862,34 @@ def evaluate_test_corpus(self, corpus): class HdpTopicFormatter(object): + """Helper class to format the output of topics and most probable words for display.""" + (STYLE_GENSIM, STYLE_PRETTY) = (1, 2) def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None): + """Initialises the :class:`gensim.models.hdpmodel.HdpTopicFormatter` and stores topic data in sorted order. + + Parameters + ---------- + dictionary : :class:`~gensim.corpora.dictionary.Dictionary`,optional + Dictionary for the input corpus. 
+ topic_data : numpy.ndarray, optional + The term topic matrix. + topic_file : file, str, or pathlib.Path + File, filename, or generator to read. If the filename extension is .gz or .bz2, the file is first + decompressed. Note that generators should return byte strings for Python 3k. + style : bool, optional + Flag to determine whether to return the topics as a list of strings(T), or as lists of (word, weight) + pairs(F). + data: numpy.ndarray + Sorted topic data in descending order of sum of probabilities for all words in corresponding topic. + + Raises + ------ + ValueError + Either no dictionary or no topic data. + + """ if dictionary is None: raise ValueError('no dictionary!') @@ -597,9 +913,44 @@ def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None self.style = style def print_topics(self, num_topics=10, num_words=10): + """Gives the most probable `num_words` words from `num_topics` topics. + + Parameters + ---------- + num_topics : int, optional + Top `num_topics` to be printed. + num_words : int, optional + Top `num_words` most probable words to be printed from each topic. + + Returns + ------- + list of tuple of (unicode,numpy.float64) or list of str + Output format for `num_words` words from `num_topics` topics depends on the value of `self.style` attribute. + + """ return self.show_topics(num_topics, num_words, True) def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): + """Gives the most probable `num_words` words from `num_topics` topics. + + Parameters + ---------- + num_topics : int, optional + Top `num_topics` to be printed. + num_words : int, optional + Top `num_words` most probable words to be printed from each topic. + log : bool, optional + Logs a message with level INFO on the logger object. + formatted : bool, optional + Flag to determine whether to return the topics as a list of strings(T), or as lists of + (word, weight) pairs(F). 
+ + Returns + ------- + list of tuple of (int ,list of tuple of (unicode,numpy.float64) or list of str) + Output format for terms from `num_topics` topics depends on the value of `self.style` attribute. + + """ shown = [] if num_topics < 0: num_topics = len(self.data) @@ -628,6 +979,27 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): return shown def print_topic(self, topic_id, topn=None, num_words=None): + """Prints the `topn` most probable words from topic id `topic_id`. + + Note + ---- + The parameter `num_words` is deprecated, will be removed in 4.0.0, please use `topn` instead. + + Parameters + ---------- + topic_id : int + Acts as a representative index for a particular topic. + topn : int, optional + Number of most probable words to show from given `topic_id`. + num_words : int, optional + Number of most probable words to show from given `topic_id`. + + Returns + ------- + list of tuple of (unicode,numpy.float64) or list of str + Output format for terms from a single topic depends on the value of `formatted` parameter. + + """ if num_words is not None: # deprecated num_words is used warnings.warn( "The parameter `num_words` is deprecated, will be removed in 4.0.0, please use `topn` instead." @@ -637,6 +1009,32 @@ def print_topic(self, topic_id, topn=None, num_words=None): return self.show_topic(topic_id, topn, formatted=True) def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=None,): + """Gives the most probable `num_words` words for the id `topic_id`. + + Note + ---- + The parameter `num_words` is deprecated, will be removed in 4.0.0, please use `topn` instead. + + Parameters + ---------- + topic_id : int + Acts as a representative index for a particular topic. + topn : int, optional + Number of most probable words to show from given `topic_id`. + log : bool, optional + Logs a message with level INFO on the logger object. 
+ formatted : bool, optional + Flag to determine whether to return the topics as a list of strings(T), or as lists of + (word, weight) pairs(F). + num_words : int, optional + Number of most probable words to show from given `topic_id`. + + Returns + ------- + list of tuple of (unicode,numpy.float64) or list of str + Output format for terms from a single topic depends on the value of `self.style` attribute. + + """ if num_words is not None: # deprecated num_words is used warnings.warn( "The parameter `num_words` is deprecated, will be removed in 4.0.0, please use `topn` instead." @@ -664,9 +1062,39 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No return topic[1] def show_topic_terms(self, topic_data, num_words): + """Gives the topic terms along with their probabilities for a single topic data. + + Parameters + ---------- + topic_data : list of tuple of (unicode,numpy.float64) + Contains probabilities for each word id belonging to a single topic. + num_words : int + Number of words for which probabilities are to be extracted from the given single topic data. + + Returns + ------- + list of tuple of (unicode,numpy.float64) + A sequence of topic terms and their probabilities. + + """ return [(self.dictionary[wid], weight) for (weight, wid) in topic_data[:num_words]] def format_topic(self, topic_id, topic_terms): + """Formats the display for a single topic in two different ways. + + Parameters + ---------- + topic_id : int + Acts as a representative index for a particular topic. + topic_terms : list of tuple of (unicode,numpy.float64) + Contains the most probable words from a single topic. + + Returns + ------- + list of tuple of (unicode,numpy.float64) or list of str + Output format for topic terms depends on the value of `self.style` attribute. 
+ + """ if self.STYLE_GENSIM == self.style: fmt = ' + '.join(['%.3f*%s' % (weight, word) for (word, weight) in topic_terms]) else: From 430f7724d82f347843be6d7d8cee06888cad6057 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 12 Mar 2018 17:26:16 +0500 Subject: [PATCH 07/16] Fix lda_worker (+ small fix for lsi_worker) --- gensim/models/lda_worker.py | 46 +++++++++++++------------------------ gensim/models/lsi_worker.py | 9 ++++++-- 2 files changed, 23 insertions(+), 32 deletions(-) mode change 100644 => 100755 gensim/models/lda_worker.py diff --git a/gensim/models/lda_worker.py b/gensim/models/lda_worker.py old mode 100644 new mode 100755 index 09b40ccb07..c085158778 --- a/gensim/models/lda_worker.py +++ b/gensim/models/lda_worker.py @@ -4,17 +4,15 @@ # Copyright (C) 2011 Radim Rehurek # Licensed under the GNU LGPL v2.1 - http://www.gnu.org/licenses/lgpl.html -""":class:`~gensim.models.lda_worker.Worker` ("slave") process used in -computing distributed :class:`~gensim.models.ldamodel.LdaModel`. +""":class:`~gensim.models.lda_worker.Worker` ("slave") process used in computing +distributed :class:`~gensim.models.ldamodel.LdaModel`. -Run this script on every node in your cluster. If you wish, you may even -run it multiple times on a single machine,to make better use of multiple -cores (just beware that memory footprint increases accordingly). +Run this script on every node in your cluster. If you wish, you may even run it multiple times on a single machine, +to make better use of multiple cores (just beware that memory footprint increases accordingly). Warnings -------- Requires installed `Pyro4 `_. -Distributed version works only in local network. How to use distributed :class:`~gensim.models.ldamodel.LdaModel` @@ -42,26 +40,19 @@ python -m gensim.models.lda_dispatcher & -#. Run :class:`~gensim.models.lsimodel.LsiModel` in distributed mode :: +#. 
Run :class:`~gensim.models.lsimodel.LdaModel` in distributed mode :: >>> from gensim.test.utils import common_corpus,common_dictionary - >>> from gensim.models import LsiModel + >>> from gensim.models import LdaModel >>> - >>> model = LdaModel(common_corpus, id2word=common_dictionary, - distributed=True) - -#. You can then infer topic distributions on new, unseen documents, with - - >>> doc_lda = model[doc_bow] - The model can be updated (trained) with new documents via - >>> lda.update(other_corpus) + >>> model = LdaModel(common_corpus, id2word=common_dictionary, distributed=True) Command line arguments ---------------------- .. program-output:: python -m gensim.models.lda_worker --help - :ellipsis: 0, -3 + :ellipsis: 0, -7 """ from __future__ import with_statement @@ -90,14 +81,10 @@ class Worker(object): - """Used as a Pyro class with exposed methods. + """Used as a Pyro4 class with exposed methods. Exposes every non-private method and property of the class automatically to be available for remote access. - Attributes - ---------- - model : :class:`~gensim.models.ldamodel.LdaModel` - """ def __init__(self): @@ -130,12 +117,13 @@ def initialize(self, myid, dispatcher, **model_params): @Pyro4.expose @Pyro4.oneway def requestjob(self): - """Request jobs from the dispatcher, in a perpetual loop until `getstate()` is called. + """Request jobs from the dispatcher, in a perpetual loop until :meth:`~gensim.models.lda_worker.Worker.getstate` + is called. Raises ------ RuntimeError - Worker has to be initialised before receiving jobs. + If `self.model` is None (i.e. worker non initialized). """ if self.model is None: @@ -161,9 +149,8 @@ def processjob(self, job): Parameters ---------- - job : {iterable of list of (int, float), scipy.sparse.csc} - Stream of document vectors or sparse matrix of - shape (`num_terms`, `num_documents`). + job : iterable of list of (int, float) + Corpus in BoW format. 
""" logger.debug("starting to process job #%i", self.jobsdone) @@ -186,7 +173,7 @@ def getstate(self): Returns ------- - result : `~gensim.models.ldamodel.LdaState` + result : :class:`~gensim.models.ldamodel.LdaState` The current state. """ @@ -223,8 +210,7 @@ def exit(self): def main(): - """Set up argument parser,logger and launches pyro daemon.""" - parser = argparse.ArgumentParser(description=__doc__) + parser = argparse.ArgumentParser(description=__doc__[:-130], formatter_class=argparse.RawTextHelpFormatter) parser.add_argument("--host", help="Nameserver hostname (default: %(default)s)", default=None) parser.add_argument("--port", help="Nameserver port (default: %(default)s)", default=None, type=int) parser.add_argument( diff --git a/gensim/models/lsi_worker.py b/gensim/models/lsi_worker.py index 5f4ccc5c2f..2a4a66bb9e 100755 --- a/gensim/models/lsi_worker.py +++ b/gensim/models/lsi_worker.py @@ -112,8 +112,13 @@ def initialize(self, myid, dispatcher, **model_params): @Pyro4.expose @Pyro4.oneway def requestjob(self): - """Request jobs from the dispatcher, in a perpetual loop until - :meth:`~gensim.models.lsi_worker.Worker.getstate()` is called. + """Request jobs from the dispatcher, in a perpetual loop until :meth:`~gensim.models.lsi_worker.Worker.getstate` + is called. + + Raises + ------ + RuntimeError + If `self.model` is None (i.e. worker non initialized). 
""" if self.model is None: From 725a102ea089347ea4eb9a414a4104136ad10870 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 12 Mar 2018 17:47:45 +0500 Subject: [PATCH 08/16] Fix lda_dispatcher (+ small fix for lsi) --- gensim/models/lda_dispatcher.py | 70 ++++++++++----------------------- gensim/models/lda_worker.py | 2 +- gensim/models/lsi_dispatcher.py | 9 ++--- 3 files changed, 25 insertions(+), 56 deletions(-) mode change 100644 => 100755 gensim/models/lda_dispatcher.py diff --git a/gensim/models/lda_dispatcher.py b/gensim/models/lda_dispatcher.py old mode 100644 new mode 100755 index d10f9a0095..c6865981ab --- a/gensim/models/lda_dispatcher.py +++ b/gensim/models/lda_dispatcher.py @@ -10,14 +10,12 @@ Notes ----- -The dispatches expects to find worker scripts already running. Make sure -you run as many workers as you like on your machines **before** launching -the dispatcher. +The dispatches expects to find worker scripts already running. Make sure you run as many workers as you like on +your machines **before** launching the dispatcher. Warnings -------- Requires installed `Pyro4 `_. -Distributed version works only in local network. How to use distributed :class:`~gensim.models.ldamodel.LdaModel` @@ -47,24 +45,17 @@ #. Run :class:`~gensim.models.ldamodel.LdaModel` in distributed mode :: - >>> from gensim.test.utils import common_corpus, common_dictionary + >>> from gensim.test.utils import common_corpus,common_dictionary >>> from gensim.models import LdaModel >>> - >>> model = LdaModel(common_corpus, id2word=common_dictionary, - distributed=True) - -#. You can then infer topic distributions on new, unseen documents, with - - >>> doc_lda = model[doc_bow] - The model can be updated (trained) with new documents via - >>> lda.update(other_corpus) + >>> model = LdaModel(common_corpus, id2word=common_dictionary, distributed=True) Command line arguments ---------------------- .. 
program-output:: python -m gensim.models.lda_dispatcher --help - :ellipsis: 0, -5 + :ellipsis: 0, -7 """ @@ -107,18 +98,6 @@ class Dispatcher(object): """Dispatcher object that communicates and coordinates individual workers. - Attributes - ---------- - callback : :class: `~Pyro4.core.Proxy` - A proxy for some remote object.Intercepts method calls and \ - dispatches them to the remote object. - jobs : :class: `~Queue.Queue` - Constructs a FIFO queue. - lock_update : :class: `~threading.Lock` - This class implements primitive lock objects. Once a thread has \ - acquired a lock, subsequent attempts to acquire it block, until it is \ - released; any thread may release it. - Warnings -------- There should never be more than one dispatcher running at any one time. @@ -135,14 +114,10 @@ def __init__(self, maxsize=MAX_JOBS_QUEUE, ns_conf=None): ---------- maxsize : int, optional Maximum number of jobs to be kept pre-fetched in the queue. - ns_conf : dict of {str:(str,optional),str:(int,optional), \ - str:(bool:optional),str:(str,optional)},optional - Sets up the name server configuration for the pyro daemon server \ - of dispatcher.This also helps to keep track of your objects in \ - your netword by using logical object names instead of exact \ - object name(or id) and its location. - workers : dict of { int : :class: `~Pyro4.core.Proxy` } - Locates all available workers and store their proxies, for subsequent RMI calls. + ns_conf : dict of (str, object) + Sets up the name server configuration for the pyro daemon server of dispatcher. + This also helps to keep track of your objects in your network by using logical object names + instead of exact object name(or id) and its location. """ self.maxsize = maxsize @@ -156,13 +131,12 @@ def initialize(self, **model_params): Parameters ---------- **model_params - Keyword parameters used to initialize individual workers, - see:class:`~gensim.models.ldamodel.LdaModel`. 
+ Keyword parameters used to initialize individual workers, see :class:`~gensim.models.ldamodel.LdaModel`. Raises ------ RuntimeError - No workers found.Need to have atleast one worker running. + When no workers are found (the :mod:`gensim.models.lda_worker` script must be ran beforehand). """ self.jobs = Queue(maxsize=self.maxsize) @@ -211,7 +185,7 @@ def getjob(self, worker_id): Returns ------- - iterable of iterable of (int, float) + iterable of list of (int, float) The corpus in BoW format. """ @@ -226,7 +200,7 @@ def putjob(self, job): Parameters ---------- - job : iterable of iterable of (int, float) + job : iterable of list of (int, float) The corpus in BoW format. """ @@ -273,8 +247,7 @@ def reset(self, state): Parameters ---------- state : :class:`~gensim.models.ldamodel.LdaState` - Encapsulates information for distributed computation - of LdaModel objects. + State of :class:`~gensim.models.lda.LdaModel`. """ for workerid, worker in iteritems(self.workers): @@ -288,13 +261,11 @@ def reset(self, state): @Pyro4.oneway @utils.synchronous('lock_update') def jobdone(self, workerid): - """Workers use callback to notify when their job is done. + """Callback used by workers to notify when their job is done. - The job done event is logged and then control is asynchronously - transfered back to the worker(who can then request another job). - In this way, control flow basically oscillates between - :meth:`gensim.models.lda_dispatcher.Dispatcher.jobdone` and - :meth:`gensim.models.lda_worker.Worker.requestjob`. + The job done event is logged and then control is asynchronously transfered back to the worker + (who can then request another job). In this way, control flow basically oscillates between + :meth:`gensim.models.lda_dispatcher.Dispatcher.jobdone` and :meth:`gensim.models.lda_worker.Worker.requestjob`. 
Parameters ---------- @@ -319,18 +290,17 @@ def jobsdone(self): @Pyro4.oneway def exit(self): - """Terminate all registered workers and then the dispatcher.""" + """Terminate all workers and then the dispatcher.""" for workerid, worker in iteritems(self.workers): logger.info("terminating worker %s", workerid) worker.exit() logger.info("terminating dispatcher") os._exit(0) # exit the whole process (not just this thread ala sys.exit()) -# endclass Dispatcher def main(): """Set up argument parser,logger and launches pyro daemon.""" - parser = argparse.ArgumentParser(description=__doc__) + parser = argparse.ArgumentParser(description=__doc__[:-135], formatter_class=argparse.RawTextHelpFormatter) parser.add_argument( "--maxsize", help="How many jobs (=chunks of N documents) to keep 'pre-fetched' in a queue (default: %(default)s)", diff --git a/gensim/models/lda_worker.py b/gensim/models/lda_worker.py index c085158778..56314e8388 100755 --- a/gensim/models/lda_worker.py +++ b/gensim/models/lda_worker.py @@ -40,7 +40,7 @@ python -m gensim.models.lda_dispatcher & -#. Run :class:`~gensim.models.lsimodel.LdaModel` in distributed mode :: +#. Run :class:`~gensim.models.ldamodel.LdaModel` in distributed mode :: >>> from gensim.test.utils import common_corpus,common_dictionary >>> from gensim.models import LdaModel diff --git a/gensim/models/lsi_dispatcher.py b/gensim/models/lsi_dispatcher.py index e4c06ef307..af435999e2 100755 --- a/gensim/models/lsi_dispatcher.py +++ b/gensim/models/lsi_dispatcher.py @@ -127,7 +127,7 @@ def initialize(self, **model_params): Raises ------ RuntimeError - When no workers are found (the `gensim.scripts.lsi_worker` script must be ran beforehand). + When no workers are found (the :mod:`gensim.model.lsi_worker` script must be ran beforehand). 
""" self.jobs = Queue(maxsize=self.maxsize) @@ -192,7 +192,7 @@ def putjob(self, job): Parameters ---------- - job : iterable of iterable of (int, float) + job : iterable of list of (int, float) The corpus in BoW format. """ @@ -246,8 +246,7 @@ def jobdone(self, workerid): The job done event is logged and then control is asynchronously transfered back to the worker (who can then request another job). In this way, control flow basically oscillates between - :meth:`gensim.models.lsi_dispatcher.Dispatcher.jobdone` and - :meth:`gensim.models.lsi_worker.Worker.requestjob`. + :meth:`gensim.models.lsi_dispatcher.Dispatcher.jobdone` and :meth:`gensim.models.lsi_worker.Worker.requestjob`. Parameters ---------- @@ -273,7 +272,7 @@ def jobsdone(self): @Pyro4.oneway def exit(self): - """Terminate all registered workers and then the dispatcher.""" + """Terminate all workers and then the dispatcher.""" for workerid, worker in iteritems(self.workers): logger.info("terminating worker %s", workerid) worker.exit() From 7bae16eefba9cf25c489241fbbef8898dafddaf2 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 12 Mar 2018 20:08:56 +0500 Subject: [PATCH 09/16] partial fix for hdpmodel --- gensim/models/hdpmodel.py | 114 +++++++++++++++++--------------------- 1 file changed, 51 insertions(+), 63 deletions(-) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index b6674e5e14..ec1de7a6e8 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -10,49 +10,38 @@ # -"""This module encapsulates functionality for the online Hierarchical Dirichlet Process algorithm. +"""Module for `online Hierarchical Dirichlet Processing +`_. -It allows both model estimation from a training corpus and inference of topic -distribution on new, unseen documents. +The core estimation code is directly adapted from the `blei-lab/online-hdp `_ +from `Wang, Paisley, Blei: Online Variational Inference for the Hierarchical Dirichlet Process, JMLR (2011) +`_. 
-The core estimation code is directly adapted from the `onlinelhdp.py` script -by C. Wang see -**Wang, Paisley, Blei: Online Variational Inference for the Hierarchical Dirichlet -Process, JMLR (2011).** +Examples +-------- -http://jmlr.csail.mit.edu/proceedings/papers/v15/wang11a/wang11a.pdf -The algorithm: +#. Train :class:`~gensim.models.hdpmodel.HdpModel` - * is **streamed**: training documents come in sequentially, no random access, - * runs in **constant memory** w.r.t. the number of documents: size of the - training corpus does not affect memory footprint - -How to use :class:`~gensim.models.hdpmodel.HdpModel` ----------------------------------------------------------------- - - -#. Run :class:`~gensim.models.hdpmodel.HdpModel` :: - - >>> from gensim.test.utils import common_corpus,common_dictionary - >>> from gensim.models import hdpmodel - >>> - >>> hdp = HdpModel(common_corpus, common_dictionary) +>>> from gensim.test.utils import common_corpus, common_dictionary +>>> from gensim.models import HdpModel +>>> +>>> hdp = HdpModel(common_corpus, common_dictionary) #. You can then infer topic distributions on new, unseen documents, with - >>> doc_hdp = hdp[doc_bow] +>>> unseen_document = [(1, 3.), (2, 4)] +>>> doc_hdp = hdp[unseen_document] #. To print 20 topics with top 10 most probable words. - >>> hdp.print_topics(num_topics=20, num_words=10) +>>> topic_info = hdp.print_topics(num_topics=20, num_words=10) #. The model can be updated (trained) with new documents via - >>> hdp.update(other_corpus) +>>> hdp.update([[(1, 2)], [(1, 1), (4, 5)]]) """ - from __future__ import with_statement import logging @@ -74,7 +63,7 @@ def expect_log_sticks(sticks): - """For stick-breaking hdp, return the E[log(sticks)]. + """For stick-breaking hdp, get the :math:`\mathbb{E}[log(sticks)]`. Parameters ---------- @@ -84,7 +73,7 @@ def expect_log_sticks(sticks): Returns ------- numpy.ndarray - Computed Elogsticks value. + Computed :math:`\mathbb{E}[log(sticks)]`. 
""" dig_sum = psi(np.sum(sticks, 0)) @@ -117,7 +106,7 @@ def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100): Returns ------- tuple of numpy.ndarrays - Returns a tuple of (likelihood,gamma). + Returns a tuple of (:math:`likelihood`, :math:`\\gamma`). """ gamma = np.ones(len(alpha)) @@ -133,7 +122,7 @@ def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100): expElogtheta = np.exp(Elogtheta) phinorm = np.dot(expElogtheta, betad) + 1e-100 meanchange = np.mean(abs(gamma - lastgamma)) - if (meanchange < meanchangethresh): + if meanchange < meanchangethresh: break likelihood = np.sum(counts * np.log(phinorm)) @@ -141,13 +130,14 @@ def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100): likelihood += np.sum(gammaln(gamma) - gammaln(alpha)) likelihood += gammaln(np.sum(alpha)) - gammaln(np.sum(gamma)) - return (likelihood, gamma) + return likelihood, gamma class SuffStats(object): """Stores suff stats for document(s).""" + def __init__(self, T, Wt, Dt): - """Initialises the suff stats for document(s) in the corpus. + """ Parameters ---------- @@ -156,7 +146,7 @@ def __init__(self, T, Wt, Dt): Wt : int Length of words in the documents. Dt : int - chunk size. + Chunk size. """ self.m_chunksize = Dt @@ -170,64 +160,62 @@ def set_zero(self): class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): - """The constructor estimates Hierachical Dirichlet Process model parameters based on a training corpus. + """`Hierarchical Dirichlet Process model `_ Attributes ---------- lda_alpha : numpy.ndarray - Lda equivalent value of alpha. + Same as :math:`\\alpha` from :class:`gensim.models.ldamodel.LdaModel`. lda_beta : numpy.ndarray - Lda equivalent value of beta. + Same as :math:`\\beta` from from :class:`gensim.models.ldamodel.LdaModel`. m_D : int Number of documents in the corpus. 
    m_Elogbeta : numpy.ndarray:
-        Stores value of dirchlet excpectation, i.e., Computed
-        :math:`E[log \\theta]` for a vector :math:`\\theta \sim Dir(\\alpha)`.
-    m_lambda : numpy.ndarray or scalar
+        Stores value of dirichlet expectation, i.e., computed :math:`E[log \\theta]` for a vector
+        :math:`\\theta \sim Dir(\\alpha)`.
+    m_lambda : {numpy.ndarray, float}
         Drawn samples from the parameterized gamma distribution.
-    m_lambda_sum : numpy.ndarray or scalar
-        An array with the same shape as m_lambda, with the specified axis (1) removed.
+    m_lambda_sum : {numpy.ndarray, float}
+        An array with the same shape as `m_lambda`, with the specified axis (1) removed.
     m_num_docs_processed : int
         Number of documents finished processing.This is incremented in size of chunks.
     m_r : list
-        Acts as normaliser in lazy updation of lambda attribute.
+        Acts as normaliser in lazy updating of `m_lambda` attribute.
     m_rhot : float
         Assigns weight to the information obtained from the mini-chunk and its value it between 0 and 1.
     m_status_up_to_date : bool
-        Flag to indicate whether lambda and Elogbeta have been updated(T) or not(F).
+        Flag to indicate whether `lambda` and :math:`E[log \\theta]` have been updated if True, otherwise not.
     m_timestamp : numpy.ndarray
         Helps to keep track and perform lazy updates on lambda.
     m_updatect : int
-        Keeps track of current time and is incremented everytime
-        :meth:`~gensim.models.hdpmodel.HdpModel.update_lambda()` is called.
+        Keeps track of current time and is incremented every time :meth:`~gensim.models.hdpmodel.HdpModel.update_lambda`
+        is called.
     m_var_sticks : numpy.ndarray
         Array of values for stick.
     m_varphi_ss : numpy.ndarray
-        Used to Update top level sticks.
+        Used to update top level sticks.
     m_W : int
         Length of dictionary for the input corpus.
""" - def __init__(self, corpus, id2word, max_chunks=None, max_time=None, chunksize=256, kappa=1.0, tau=64.0, K=15, T=150, alpha=1, gamma=1, eta=0.01, scale=1.0, var_converge=0.0001, outputdir=None, random_state=None): - """Fully initialises the hdp model. - + """ Parameters ---------- - corpus : list of list of tuple of ints; [ [ (int,int) ]] - Corpus of input dataset on which the model will be trained. + corpus : iterable of list of (int, float) + Corpus in BoW format. id2word : :class:`~gensim.corpora.dictionary.Dictionary` Dictionary for the input corpus. - max_chunks : None, optional - Upper bound on how many chunks to process.It wraps around corpus beginning in another corpus pass, - if there are not enough chunks in the corpus - max_time : None, optional - Upper bound on time(in seconds) for which model will be trained. + max_chunks : int, optional + Upper bound on how many chunks to process. It wraps around corpus beginning in another corpus pass, + if there are not enough chunks in the corpusю + max_time : int, optional + Upper bound on time (in seconds) for which model will be trained. chunksize : int, optional - Tells the number of documents to process at a time. + Number of documents in one chunck kappa : float, optional Learning rate tau : float, optional @@ -306,22 +294,22 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None, self.update(corpus) def inference(self, chunk): - """Infers the gamma value on a trained corpus. + """Infers the gamma value based for `chunk`. Parameters ---------- - chunk : list of tuple of ints; [ [ (int,int) ]] - Bag of words representation for a corpus. + chunk : iterable of list of (int, float) + Corpus in BoW format. Returns ------- numpy.ndarray - gamma value. + Gamma value. Raises ------ RuntimeError - Need to train model first to do inference. + If model doesn't trained yet. 
""" if self.lda_alpha is None or self.lda_beta is None: From b1458a8d139cf574d727de66abe620b13d77e219 Mon Sep 17 00:00:00 2001 From: gyanesh-m Date: Mon, 19 Mar 2018 12:28:01 +0530 Subject: [PATCH 10/16] Fix changes reviewed, except description for model --- gensim/models/hdpmodel.py | 102 +++++++++++++++++++++----------------- 1 file changed, 56 insertions(+), 46 deletions(-) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index ec1de7a6e8..1ced5bb5b5 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -134,7 +134,9 @@ def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100): class SuffStats(object): - """Stores suff stats for document(s).""" + """Stores sufficient statistics for the current chunk of document(s) whenever Hdp model is updated with new corpus. + These stats are used when updating lambda and top level sticks. The statistics include number of documents in the + chunk, length of words in the documents and top level truncation level.""" def __init__(self, T, Wt, Dt): """ @@ -216,10 +218,10 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None, Upper bound on time (in seconds) for which model will be trained. chunksize : int, optional Number of documents in one chunck - kappa : float, optional - Learning rate - tau : float, optional - Slow down parameter + kappa: float,optional + Learning parameter which acts as exponential decay factor to influence extent of learning from each batch. + tau: float, optional + Learning parameter which down-weights early iterations of documents. K : int, optional Second level truncation level T : int, optional @@ -237,9 +239,9 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None, single document. outputdir : str, optional Stores topic and options information in the specified directory. 
- random_state : :class:`~np.random.RandomState`, optional + random_state : {None, int, array_like, :class:`~np.random.RandomState`, optional} Adds a little random jitter to randomize results around same alpha when trying to fetch a closest - corrsponding lda model from :meth:`~gensim.models.hdpmodel.HdpModel.suggested_lda_model()` + corrsponding lda model from :meth:`~gensim.models.hdpmodel.HdpModel.suggested_lda_model` """ self.corpus = corpus @@ -304,7 +306,7 @@ def inference(self, chunk): Returns ------- numpy.ndarray - Gamma value. + First level concentration, i.e., Gamma value. Raises ------ @@ -332,13 +334,14 @@ def __getitem__(self, bow, eps=0.01): Parameters ---------- - bow : sequence of list of tuple of ints; [ (int,int) ] + bow : iterable of list of (int, float) Bag-of-words representation of the document to get topics for. eps : float, optional Ignore topics with probability below `eps`. Returns ------- + list of (int, float) topic distribution for the given document `bow`, as a list of `(topic_id, topic_probability)` 2-tuples. """ @@ -352,7 +355,10 @@ def __getitem__(self, bow, eps=0.01): def update(self, corpus): """Train the model with new documents, by EM-iterating over `corpus` until - any of the conditions is satisfied(time limit expired,chunk limit reached or whole corpus processed ). + any of the conditions is satisfied + * time limit expired + * chunk limit reached + * whole corpus processed Parameters ---------- @@ -403,7 +409,7 @@ def update_finished(self, start_time, chunks_processed, docs_processed): Returns ------- bool - True if Hdp model is updated, False otherwise. + If True Hdp model is updated, False otherwise. """ return ( @@ -424,10 +430,10 @@ def update_chunk(self, chunk, update=True, opt_o=True): chunk : list of list of tuple of ints; [ [ (int,int) ]] The chunk of corpus on which Hdp model will be updated. update : bool, optional - Flag to determine whether to update lambda(T) or not (F). 
+ If True then update lambda, False don't update lambda. opt_o : bool, optional - Passed as argument to :meth:`~gensim.models.hdpmodel.HdpModel.update_lambda()` to determine whether - the topics need to be ordered(T) or not(F). + Passed as argument to :meth:`~gensim.models.hdpmodel.HdpModel.update_lambda`. If True then the topics will + be ordered, False otherwise. Returns ------- @@ -592,8 +598,8 @@ def update_lambda(self, sstats, word_list, opt_o): word_list : list of int Contains word id of all the unique words in the chunk of documents on which update is being performed. opt_o : bool, optional - Flag to determine whether to invoke a call to :meth:`~gensim.models.hdpmodel.HdpModel.optimal_ordering()`. - This decides whether the topics need to be ordered(T) or not(F). + If True invokes a call to :meth:`~gensim.models.hdpmodel.HdpModel.optimal_ordering` to order the topics, + False otherwise. """ self.m_status_up_to_date = False @@ -657,14 +663,15 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No topn : int, optional Number of most probable words to show from given `topic_id`. log : bool, optional - Logs a message with level INFO on the logger object. + If True logs a message with level INFO on the logger object, False otherwise formatted : bool, optional - Flag to determine whether to return the topics as a list of strings(T), or as lists of - (weight, word) pairs(F). + If True return the topics as a list of strings, False return the topics as lists of (weight, word) pairs. num_words : int, optional Number of most probable words to show from given `topic_id`. - .. note:: The parameter `num_words` is deprecated, will be removed in 4.0.0, please use `topn` instead. + Notes + ----- + The parameter `num_words` is deprecated, will be removed in 4.0.0, please use `topn` instead. 
Returns ------- @@ -685,7 +692,7 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No return hdp_formatter.show_topic(topic_id, topn, log, formatted) def get_topics(self): - """Returns the term topic matrix learned during inference. + """Get the term topic matrix learned during inference. Returns ------- @@ -708,10 +715,9 @@ def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): num_words : int, optional Number of most probable words to show from `num_topics` number of topics. log : bool, optional - Logs a message with level INFO on the logger object. + If True logs a message with level INFO on the logger object, False otherwise. formatted : bool, optional - Flag to determine whether to return the topics as a list of strings(T), or as lists of - (word, weight) pairs(F). + If True return the topics as a list of strings, False return the topics as lists of (word, weight) pairs. Returns ------- @@ -728,7 +734,9 @@ def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): def save_topics(self, doc_count=None): """Saves all the topics discovered. - .. note:: This is a legacy method; use `self.save()` instead. + Notes + ----- + This is a legacy method; use `self.save()` instead. Parameters ---------- @@ -751,7 +759,9 @@ def save_topics(self, doc_count=None): def save_options(self): """Writes all the values of the attributes for the current model in options.dat file. - .. note:: This is a legacy method; use `self.save()` instead. + Notes + ----- + This is a legacy method; use `self.save()` instead. 
""" if not self.outputdir: @@ -850,12 +860,13 @@ def evaluate_test_corpus(self, corpus): class HdpTopicFormatter(object): - """Helper class to format the output of topics and most probable words for display.""" + """Helper class for :class:`gensim.models.hdpmodel.HdpModel` to format the output of topics and most probable words + for display.""" (STYLE_GENSIM, STYLE_PRETTY) = (1, 2) def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None): - """Initialises the :class:`gensim.models.hdpmodel.HdpTopicFormatter` and stores topic data in sorted order. + """Initialise the :class:`gensim.models.hdpmodel.HdpTopicFormatter` and store topic data in sorted order. Parameters ---------- @@ -867,8 +878,7 @@ def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None File, filename, or generator to read. If the filename extension is .gz or .bz2, the file is first decompressed. Note that generators should return byte strings for Python 3k. style : bool, optional - Flag to determine whether to return the topics as a list of strings(T), or as lists of (word, weight) - pairs(F). + If True return the topics as a list of strings, False return the topics as lists of (word, weight) pairs. data: numpy.ndarray Sorted topic data in descending order of sum of probabilities for all words in corresponding topic. @@ -901,7 +911,7 @@ def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None self.style = style def print_topics(self, num_topics=10, num_words=10): - """Gives the most probable `num_words` words from `num_topics` topics. + """Give the most probable `num_words` words from `num_topics` topics. Parameters ---------- @@ -919,7 +929,7 @@ def print_topics(self, num_topics=10, num_words=10): return self.show_topics(num_topics, num_words, True) def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): - """Gives the most probable `num_words` words from `num_topics` topics. 
+ """Give the most probable `num_words` words from `num_topics` topics. Parameters ---------- @@ -928,10 +938,10 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): num_words : int, optional Top `num_words` most probable words to be printed from each topic. log : bool, optional - Logs a message with level INFO on the logger object. + If True logs a message with level INFO on the logger object, False otherwise. formatted : bool, optional - Flag to determine whether to return the topics as a list of strings(T), or as lists of - (word, weight) pairs(F). + If True return the topics as a list of strings, False as lists of + (word, weight) pairs. Returns ------- @@ -967,10 +977,10 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): return shown def print_topic(self, topic_id, topn=None, num_words=None): - """Prints the `topn` most probable words from topic id `topic_id`. + """Print the `topn` most probable words from topic id `topic_id`. - Note - ---- + Notes + ----- The parameter `num_words` is deprecated, will be removed in 4.0.0, please use `topn` instead. Parameters @@ -997,10 +1007,10 @@ def print_topic(self, topic_id, topn=None, num_words=None): return self.show_topic(topic_id, topn, formatted=True) def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=None,): - """Gives the most probable `num_words` words for the id `topic_id`. + """Give the most probable `num_words` words for the id `topic_id`. - Note - ---- + Notes + ----- The parameter `num_words` is deprecated, will be removed in 4.0.0, please use `topn` instead. Parameters @@ -1010,10 +1020,10 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No topn : int, optional Number of most probable words to show from given `topic_id`. log : bool, optional - Logs a message with level INFO on the logger object. + If True logs a message with level INFO on the logger object, False otherwise. 
formatted : bool, optional - Flag to determine whether to return the topics as a list of strings(T), or as lists of - (word, weight) pairs(F). + If True return the topics as a list of strings, False as lists of + (word, weight) pairs. num_words : int, optional Number of most probable words to show from given `topic_id`. @@ -1050,7 +1060,7 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No return topic[1] def show_topic_terms(self, topic_data, num_words): - """Gives the topic terms along with their probabilities for a single topic data. + """Give the topic terms along with their probabilities for a single topic data. Parameters ---------- @@ -1068,7 +1078,7 @@ def show_topic_terms(self, topic_data, num_words): return [(self.dictionary[wid], weight) for (weight, wid) in topic_data[:num_words]] def format_topic(self, topic_id, topic_terms): - """Formats the display for a single topic in two different ways. + """Format the display for a single topic in two different ways. Parameters ---------- From ce47b1de4211f2dc2b268ef83af3cb000013bd96 Mon Sep 17 00:00:00 2001 From: gyanesh-m Date: Tue, 20 Mar 2018 00:35:33 +0530 Subject: [PATCH 11/16] Add description for hdpmodel --- gensim/models/hdpmodel.py | 84 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 84 insertions(+) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 1ced5bb5b5..f8ac6a7808 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -164,6 +164,90 @@ def set_zero(self): class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): """`Hierarchical Dirichlet Process model `_ + ** HIERARCHICAL DIRICHLET PROCESS ** + + Topic models promise to help summarize and organize large archives of texts that cannot be easily analyzed by hand. + Hierarchical Dirichlet process (HDP) is a powerful mixed-membership model for the unsupervised analysis of grouped + data. 
Unlike its finite counterpart, latent Dirichlet allocation, the HDP topic model infers the number of topics + from the data. Here we have used Online HDP, which provides the speed of online variational Bayes with the modeling + flexibility of the HDP. The idea behind Online variational Bayes in general is to optimize the variational + objective function with stochastic optimization.The challenge we face is that the existing coordinate ascent + variational Bayes algorithms for the HDP require complicated approximation methods or numerical optimization. This + model utilises stick breaking construction of Hdp which enables it to allow for coordinate-ascent variational Bayes + without numerical approximation. + + ** STICK BREAKING CONSTRUCTION ** + + To understand the HDP model we need to understand how it is modelled using the stick breaking construction. A very + good analogy to understand the stick breaking construction is chinese restaurant franchise. + + .. _Chinese restaurant franchise image: + + https://drive.google.com/file/d/1ci9xKyclN0xrhXqO4H8TsonPLiZFvANS/view?usp=sharing + + For this assume that there is a restaurant franchise(corpus) which has a large number of restaurants(documents,j) + under it. They have a global menu of dishes(topics, .. mat::\Phi_{k}) which they serve. Also, a single dish + (topic, .. mat::\Phi_{k} ) is only served at a single table t for all the customers(words,.. math::\theta _{j,i} ) + who sit at that table. So, when a customer enters the restaurant he/she has the choice to make where he/she wants + to sit. He/she can choose to sit at a table where some customers are already sitting , or he/she can choose to sit + at a new table. Here the probability of choosing each option is not same. + + Now, in this the global menu of dishes correspond to the global atoms .. mat::\Phi_{k} , and each restaurant + correspond to a single document j. 
So the number of dishes served in a particular restaurant correspond to the + number of topics in a particular document. And the number of people sitting at each table correspond to the number + of words belonging to each topic inside the document j. + + Now, coming on to the stick breaking construction, the concept understood from the chinese restaurant franchise is + easily carried over to the stick breaking construction for hdp. + + .. _Stick breaking construction of two level Hdp image: + + https://drive.google.com/file/d/1j1_OQohRX93Bi9Ashrb3hQFTUIXstU5b/view?usp=sharing + + A two level hierarchical dirichlet process is a collection of dirichlet processes .. math::G_{j} , one for each + group, which share a base distribution .. math::G_{0} , which is also a dirichlet process. Also, all .. math::G_{j} + ’s share the same set of atoms, .. mat::\Phi_{k} s, and only the atom weights( .. math::\pi _{jt} ) differs. + There will be multiple document-level atoms .. math::\psi_{jt} which map to the same corpus-level atom + .. mat::\Phi_{k} . Here, the .. math::\beta signify the weights given to each of the topics globally. Also, each + factor .. math::\theta _{j,i} is distributed according to .. math::G_{j}, i.e., it takes on the value of + .. mat::\Phi_{k} with probability .. math::\pi _{jt} . + .. math::C_{j,t} is an indicator variable whose value k signifies the index of .. mat::\Phi. This helps to map + .. math::\psi_{jt} to .. mat::\Phi_{k}. + The top level(corpus level) stick proportions correspond the values of .. math::\beta s, bottom level (document + level) stick proportions correspond to the values of .. math::\pi s. The truncation level for the corpus(K) and + document(T) corresponds to the number of .. math::\beta and .. math::\pi s which are in existence. + + Now, whenever coordinate ascent updates are to be performed, they happen at two level. The document level as well + as corpus level. + + At document level, we update the following: + 1. 
The parameters to the document level sticks, i.e, a and b parameters of .. math::\beta distribution of the + variable .. math::\pi _{jt} . + 2. The parameters to per word topic indicators, .. math::Z_{j,n}. Here .. math::Z_{j,n} selects topic + parameter .. math::\psi_{jt}. + 3. The parameters to per document topic indices( .. mat::\Phi_{jtk} ). + + At corpus level, we update the following: + 1. The parameters to the top level sticks, i.e., the parameters of the .. math::\beta distribution for the + corpus level .. math::\beta s, which signify the topic distribution at corpus level. + 2. The parameters to the topics .. mat::\Phi_{k} . + + Now coming on to the steps involved, procedure for online variational inference for the Hdp model is as follows: + 1. We initialise the corpus level parameters, topic parameters randomly and set current time to 1. + 2. Fetch a random document j from the corpus. + 3. Compute all the parameters required for document level updates. + 4. Compute natural gradients of corpus level parameters. + 5. Initialise the learning rate as a function of kappa, tau and current time. Also, + increment current time by 1 each time it reaches this step. + 6. Update corpus level parameters. + + Repeat 2 to 6 until stopping condition is not met. 
+ + Here the stopping condition corresponds to + * time limit expired + * chunk limit reached + * whole corpus processed + Attributes ---------- lda_alpha : numpy.ndarray From 9b68b1625fcf44e64d1a6abb06ed7677da77137e Mon Sep 17 00:00:00 2001 From: gyanesh-m Date: Tue, 20 Mar 2018 01:48:21 +0530 Subject: [PATCH 12/16] fix minor error --- gensim/models/hdpmodel.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index f8ac6a7808..2c37a650cc 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -165,7 +165,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): """`Hierarchical Dirichlet Process model `_ ** HIERARCHICAL DIRICHLET PROCESS ** - + Topic models promise to help summarize and organize large archives of texts that cannot be easily analyzed by hand. Hierarchical Dirichlet process (HDP) is a powerful mixed-membership model for the unsupervised analysis of grouped data. Unlike its finite counterpart, latent Dirichlet allocation, the HDP topic model infers the number of topics @@ -177,7 +177,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): without numerical approximation. ** STICK BREAKING CONSTRUCTION ** - + To understand the HDP model we need to understand how it is modelled using the stick breaking construction. A very good analogy to understand the stick breaking construction is chinese restaurant franchise. @@ -201,7 +201,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): easily carried over to the stick breaking construction for hdp. .. _Stick breaking construction of two level Hdp image: - + https://drive.google.com/file/d/1j1_OQohRX93Bi9Ashrb3hQFTUIXstU5b/view?usp=sharing A two level hierarchical dirichlet process is a collection of dirichlet processes .. 
math::G_{j} , one for each From 72d5f0fb081f3c6025500a943816426ef2afbfd9 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 2 Apr 2018 17:25:27 +0500 Subject: [PATCH 13/16] fix hdpmodel[1] --- gensim/models/hdpmodel.py | 54 ++++++++++++++++++--------------------- 1 file changed, 25 insertions(+), 29 deletions(-) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 2c37a650cc..afc035f657 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -21,23 +21,23 @@ -------- -#. Train :class:`~gensim.models.hdpmodel.HdpModel` +Train :class:`~gensim.models.hdpmodel.HdpModel` >>> from gensim.test.utils import common_corpus, common_dictionary >>> from gensim.models import HdpModel >>> >>> hdp = HdpModel(common_corpus, common_dictionary) -#. You can then infer topic distributions on new, unseen documents, with +You can then infer topic distributions on new, unseen documents, with >>> unseen_document = [(1, 3.), (2, 4)] >>> doc_hdp = hdp[unseen_document] -#. To print 20 topics with top 10 most probable words. +To print 20 topics with top 10 most probable words. >>> topic_info = hdp.print_topics(num_topics=20, num_words=10) -#. The model can be updated (trained) with new documents via +The model can be updated (trained) with new documents via >>> hdp.update([[(1, 2)], [(1, 1), (4, 5)]]) @@ -944,9 +944,7 @@ def evaluate_test_corpus(self, corpus): class HdpTopicFormatter(object): - """Helper class for :class:`gensim.models.hdpmodel.HdpModel` to format the output of topics and most probable words - for display.""" - + """Helper class for :class:`gensim.models.hdpmodel.HdpModel` to format the output of topics.""" (STYLE_GENSIM, STYLE_PRETTY) = (1, 2) def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None): @@ -958,18 +956,16 @@ def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None Dictionary for the input corpus. topic_data : numpy.ndarray, optional The term topic matrix. 
- topic_file : file, str, or pathlib.Path + topic_file : {file-like object, str, pathlib.Path} File, filename, or generator to read. If the filename extension is .gz or .bz2, the file is first decompressed. Note that generators should return byte strings for Python 3k. style : bool, optional - If True return the topics as a list of strings, False return the topics as lists of (word, weight) pairs. - data: numpy.ndarray - Sorted topic data in descending order of sum of probabilities for all words in corresponding topic. + If True - get the topics as a list of strings, otherwise - get the topics as lists of (word, weight) pairs. Raises ------ ValueError - Either no dictionary or no topic data. + Either dictionary is None or both `topic_data` and `topic_file` is None. """ if dictionary is None: @@ -996,6 +992,7 @@ def __init__(self, dictionary=None, topic_data=None, topic_file=None, style=None def print_topics(self, num_topics=10, num_words=10): """Give the most probable `num_words` words from `num_topics` topics. + Alias for :meth:`~gensim.models.hdpmodel.HdpTopicFormatter.show_topics`. Parameters ---------- @@ -1006,7 +1003,7 @@ def print_topics(self, num_topics=10, num_words=10): Returns ------- - list of tuple of (unicode,numpy.float64) or list of str + list of (str, numpy.float) **or** list of str Output format for `num_words` words from `num_topics` topics depends on the value of `self.style` attribute. """ @@ -1022,14 +1019,13 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): num_words : int, optional Top `num_words` most probable words to be printed from each topic. log : bool, optional - If True logs a message with level INFO on the logger object, False otherwise. + If True - log a message with level INFO on the logger object. formatted : bool, optional - If True return the topics as a list of strings, False as lists of - (word, weight) pairs. 
+ If True - get the topics as a list of strings, otherwise as lists of (word, weight) pairs. Returns ------- - list of tuple of (int ,list of tuple of (unicode,numpy.float64) or list of str) + list of (int, list of (str, numpy.float) **or** list of str) Output format for terms from `num_topics` topics depends on the value of `self.style` attribute. """ @@ -1063,8 +1059,8 @@ def show_topics(self, num_topics=10, num_words=10, log=False, formatted=True): def print_topic(self, topic_id, topn=None, num_words=None): """Print the `topn` most probable words from topic id `topic_id`. - Notes - ----- + Warnings + -------- The parameter `num_words` is deprecated, will be removed in 4.0.0, please use `topn` instead. Parameters @@ -1074,11 +1070,11 @@ def print_topic(self, topic_id, topn=None, num_words=None): topn : int, optional Number of most probable words to show from given `topic_id`. num_words : int, optional - Number of most probable words to show from given `topic_id`. + DEPRECATED, USE `topn` INSTEAD. Returns ------- - list of tuple of (unicode,numpy.float64) or list of str + list of (str, numpy.float) **or** list of str Output format for terms from a single topic depends on the value of `formatted` parameter. """ @@ -1093,8 +1089,8 @@ def print_topic(self, topic_id, topn=None, num_words=None): def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=None,): """Give the most probable `num_words` words for the id `topic_id`. - Notes - ----- + Warnings + -------- The parameter `num_words` is deprecated, will be removed in 4.0.0, please use `topn` instead. Parameters @@ -1109,11 +1105,11 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No If True return the topics as a list of strings, False as lists of (word, weight) pairs. num_words : int, optional - Number of most probable words to show from given `topic_id`. + DEPRECATED, USE `topn` INSTEAD. 
Returns ------- - list of tuple of (unicode,numpy.float64) or list of str + list of (str, numpy.float) **or** list of str Output format for terms from a single topic depends on the value of `self.style` attribute. """ @@ -1148,14 +1144,14 @@ def show_topic_terms(self, topic_data, num_words): Parameters ---------- - topic_data : list of tuple of (unicode,numpy.float64) + topic_data : list of (str, numpy.float) Contains probabilities for each word id belonging to a single topic. num_words : int Number of words for which probabilities are to be extracted from the given single topic data. Returns ------- - list of tuple of (unicode,numpy.float64) + list of (str, numpy.float) A sequence of topic terms and their probabilities. """ @@ -1168,12 +1164,12 @@ def format_topic(self, topic_id, topic_terms): ---------- topic_id : int Acts as a representative index for a particular topic. - topic_terms : list of tuple of (unicode,numpy.float64) + topic_terms : list of (str, numpy.float) Contains the most probable words from a single topic. Returns ------- - list of tuple of (unicode,numpy.float64) or list of str + list of (str, numpy.float) **or** list of str Output format for topic terms depends on the value of `self.style` attribute. """ From 66d7b180c6c4bc8f52421d5f87188604292b3479 Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 2 Apr 2018 18:54:50 +0500 Subject: [PATCH 14/16] fix hdpmodel[2] --- gensim/models/hdpmodel.py | 107 +++++++++++++++++++------------------- 1 file changed, 54 insertions(+), 53 deletions(-) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index afc035f657..8bc0a29f1a 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -105,8 +105,8 @@ def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100): Returns ------- - tuple of numpy.ndarrays - Returns a tuple of (:math:`likelihood`, :math:`\\gamma`). + (numpy.ndarray, numpy.ndarray) + Computed (:math:`likelihood`, :math:`\\gamma`). 
""" gamma = np.ones(len(alpha)) @@ -136,8 +136,9 @@ def lda_e_step(doc_word_ids, doc_word_counts, alpha, beta, max_iter=100): class SuffStats(object): """Stores sufficient statistics for the current chunk of document(s) whenever Hdp model is updated with new corpus. These stats are used when updating lambda and top level sticks. The statistics include number of documents in the - chunk, length of words in the documents and top level truncation level.""" + chunk, length of words in the documents and top level truncation level. + """ def __init__(self, T, Wt, Dt): """ @@ -164,8 +165,6 @@ def set_zero(self): class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): """`Hierarchical Dirichlet Process model `_ - ** HIERARCHICAL DIRICHLET PROCESS ** - Topic models promise to help summarize and organize large archives of texts that cannot be easily analyzed by hand. Hierarchical Dirichlet process (HDP) is a powerful mixed-membership model for the unsupervised analysis of grouped data. Unlike its finite counterpart, latent Dirichlet allocation, the HDP topic model infers the number of topics @@ -176,77 +175,79 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): model utilises stick breaking construction of Hdp which enables it to allow for coordinate-ascent variational Bayes without numerical approximation. - ** STICK BREAKING CONSTRUCTION ** + **Stick breaking construction** To understand the HDP model we need to understand how it is modelled using the stick breaking construction. A very - good analogy to understand the stick breaking construction is chinese restaurant franchise. + good analogy to understand the stick breaking construction is `chinese restaurant franchise + `_. - .. _Chinese restaurant franchise image: - https://drive.google.com/file/d/1ci9xKyclN0xrhXqO4H8TsonPLiZFvANS/view?usp=sharing - - For this assume that there is a restaurant franchise(corpus) which has a large number of restaurants(documents,j) - under it. 
They have a global menu of dishes(topics, .. mat::\Phi_{k}) which they serve. Also, a single dish - (topic, .. mat::\Phi_{k} ) is only served at a single table t for all the customers(words,.. math::\theta _{j,i} ) - who sit at that table. So, when a customer enters the restaurant he/she has the choice to make where he/she wants - to sit. He/she can choose to sit at a table where some customers are already sitting , or he/she can choose to sit + For this assume that there is a restaurant franchise (`corpus`) which has a large number of restaurants + (`documents`, `j`) under it. They have a global menu of dishes (`topics`, :math:`\Phi_{k}`) which they serve. + Also, a single dish (`topic`, :math:`\Phi_{k}`) is only served at a single table `t` for all the customers + (`words`, :math:`\\theta_{j,i}`) who sit at that table. + So, when a customer enters the restaurant he/she has the choice to make where he/she wants to sit. + He/she can choose to sit at a table where some customers are already sitting , or he/she can choose to sit at a new table. Here the probability of choosing each option is not same. - Now, in this the global menu of dishes correspond to the global atoms .. mat::\Phi_{k} , and each restaurant - correspond to a single document j. So the number of dishes served in a particular restaurant correspond to the + Now, in this the global menu of dishes correspond to the global atoms :math:`\Phi_{k}`, and each restaurant + correspond to a single document `j`. So the number of dishes served in a particular restaurant correspond to the number of topics in a particular document. And the number of people sitting at each table correspond to the number - of words belonging to each topic inside the document j. + of words belonging to each topic inside the document `j`. Now, coming on to the stick breaking construction, the concept understood from the chinese restaurant franchise is - easily carried over to the stick breaking construction for hdp. 
+ easily carried over to the stick breaking construction for hdp (`"Figure 1" from "Online Variational Inference + for the Hierarchical Dirichlet Process" `_). - .. _Stick breaking construction of two level Hdp image: + A two level hierarchical dirichlet process is a collection of dirichlet processes :math:`G_{j}` , one for each + group, which share a base distribution :math:`G_{0}`, which is also a dirichlet process. Also, all :math:`G_{j}` + share the same set of atoms, :math:`\Phi_{k}`, and only the atom weights :math:`\pi _{jt}` differs. - https://drive.google.com/file/d/1j1_OQohRX93Bi9Ashrb3hQFTUIXstU5b/view?usp=sharing + There will be multiple document-level atoms :math:`\psi_{jt}` which map to the same corpus-level atom + :math:`\Phi_{k}`. Here, the :math:`\\beta` signify the weights given to each of the topics globally. Also, each + factor :math:`\\theta_{j,i}` is distributed according to :math:`G_{j}`, i.e., it takes on the value of + :math:`\Phi_{k}` with probability :math:`\pi _{jt}`. :math:`C_{j,t}` is an indicator variable whose value `k` + signifies the index of :math:`\Phi`. This helps to map :math:`\psi_{jt}` to :math:`\Phi_{k}`. - A two level hierarchical dirichlet process is a collection of dirichlet processes .. math::G_{j} , one for each - group, which share a base distribution .. math::G_{0} , which is also a dirichlet process. Also, all .. math::G_{j} - ’s share the same set of atoms, .. mat::\Phi_{k} s, and only the atom weights( .. math::\pi _{jt} ) differs. - There will be multiple document-level atoms .. math::\psi_{jt} which map to the same corpus-level atom - .. mat::\Phi_{k} . Here, the .. math::\beta signify the weights given to each of the topics globally. Also, each - factor .. math::\theta _{j,i} is distributed according to .. math::G_{j}, i.e., it takes on the value of - .. mat::\Phi_{k} with probability .. math::\pi _{jt} . - .. math::C_{j,t} is an indicator variable whose value k signifies the index of .. mat::\Phi. 
This helps to map - .. math::\psi_{jt} to .. mat::\Phi_{k}. - The top level(corpus level) stick proportions correspond the values of .. math::\beta s, bottom level (document - level) stick proportions correspond to the values of .. math::\pi s. The truncation level for the corpus(K) and - document(T) corresponds to the number of .. math::\beta and .. math::\pi s which are in existence. + The top level (`corpus` level) stick proportions correspond the values of :math:`\\beta`, + bottom level (`document` level) stick proportions correspond to the values of :math:`\pi`. + The truncation level for the corpus (`K`) and document (`T`) corresponds to the number of :math:`\\beta` + and :math:`\pi` which are in existence. Now, whenever coordinate ascent updates are to be performed, they happen at two level. The document level as well as corpus level. At document level, we update the following: - 1. The parameters to the document level sticks, i.e, a and b parameters of .. math::\beta distribution of the - variable .. math::\pi _{jt} . - 2. The parameters to per word topic indicators, .. math::Z_{j,n}. Here .. math::Z_{j,n} selects topic - parameter .. math::\psi_{jt}. - 3. The parameters to per document topic indices( .. mat::\Phi_{jtk} ). + + #. The parameters to the document level sticks, i.e, a and b parameters of :math:`\\beta` distribution of the + variable :math:`\pi _{jt}`. + #. The parameters to per word topic indicators, :math:`Z_{j,n}`. Here :math:`Z_{j,n}` selects topic parameter + :math:`\psi_{jt}`. + #. The parameters to per document topic indices :math:`\Phi_{jtk}`. At corpus level, we update the following: - 1. The parameters to the top level sticks, i.e., the parameters of the .. math::\beta distribution for the - corpus level .. math::\beta s, which signify the topic distribution at corpus level. - 2. The parameters to the topics .. mat::\Phi_{k} . + + #. 
The parameters to the top level sticks, i.e., the parameters of the :math:`\\beta` distribution for the + corpus level :math:`\\beta`, which signify the topic distribution at corpus level. + #. The parameters to the topics :math:`\Phi_{k}`. Now coming on to the steps involved, procedure for online variational inference for the Hdp model is as follows: - 1. We initialise the corpus level parameters, topic parameters randomly and set current time to 1. - 2. Fetch a random document j from the corpus. - 3. Compute all the parameters required for document level updates. - 4. Compute natural gradients of corpus level parameters. - 5. Initialise the learning rate as a function of kappa, tau and current time. Also, - increment current time by 1 each time it reaches this step. - 6. Update corpus level parameters. + + 1. We initialise the corpus level parameters, topic parameters randomly and set current time to 1. + 2. Fetch a random document j from the corpus. + 3. Compute all the parameters required for document level updates. + 4. Compute natural gradients of corpus level parameters. + 5. Initialise the learning rate as a function of kappa, tau and current time. Also, increment current time by 1 + each time it reaches this step. + 6. Update corpus level parameters. Repeat 2 to 6 until stopping condition is not met. Here the stopping condition corresponds to - * time limit expired - * chunk limit reached - * whole corpus processed + + * time limit expired + * chunk limit reached + * whole corpus processed Attributes ---------- @@ -257,7 +258,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel): m_D : int Number of documents in the corpus. m_Elogbeta : numpy.ndarray: - Stores value of dirichlet expectationn, i.e., compute :math:`E[log \\theta]` for a vector + Stores value of dirichlet expectation, i.e., compute :math:`E[log \\theta]` for a vector :math:`\\theta \sim Dir(\\alpha)`. 
m_lambda : {numpy.ndarray, float}
         Drawn samples from the parameterized gamma distribution.
@@ -274,7 +275,7 @@ class HdpModel(interfaces.TransformationABC, basemodel.BaseTopicModel):
     m_timestamp : numpy.ndarray
         Helps to keep track and perform lazy updates on lambda.
     m_updatect : int
-        Keeps track of current time and is incremented everytime :meth:`~gensim.models.hdpmodel.HdpModel.update_lambda`
+        Keeps track of current time and is incremented every time :meth:`~gensim.models.hdpmodel.HdpModel.update_lambda`
         is called.
     m_var_sticks : numpy.ndarray
         Array of values for stick.

From 2e8249b68a2f8caaaecea44a8ac752d16c756797 Mon Sep 17 00:00:00 2001
From: ivan
Date: Mon, 2 Apr 2018 19:52:25 +0500
Subject: [PATCH 15/16] fix hdpmodel[3]

---
 gensim/models/hdpmodel.py | 122 +++++++++++++++++++-------------------
 1 file changed, 61 insertions(+), 61 deletions(-)

diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py
index 8bc0a29f1a..1fc914f46c 100755
--- a/gensim/models/hdpmodel.py
+++ b/gensim/models/hdpmodel.py
@@ -290,6 +290,7 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
                  gamma=1, eta=0.01, scale=1.0, var_converge=0.0001,
                  outputdir=None, random_state=None):
         """
+
         Parameters
         ----------
         corpus : iterable of list of (int, float)
@@ -298,11 +299,11 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
             Dictionary for the input corpus.
         max_chunks : int, optional
             Upper bound on how many chunks to process. It wraps around corpus beginning in another corpus pass,
-            if there are not enough chunks in the corpusю
+            if there are not enough chunks in the corpus.
         max_time : int, optional
             Upper bound on time (in seconds) for which model will be trained.
         chunksize : int, optional
-            Number of documents in one chunck
+            Number of documents in one chunk.
         kappa: float,optional
             Learning parameter which acts as exponential decay factor to influence extent of learning from each batch.
tau: float, optional
@@ -326,7 +327,7 @@ def __init__(self, corpus, id2word, max_chunks=None, max_time=None,
             Stores topic and options information in the specified directory.
         random_state : {None, int, array_like, :class:`~np.random.RandomState`, optional}
             Adds a little random jitter to randomize results around same alpha when trying to fetch a closest
-            corrsponding lda model from :meth:`~gensim.models.hdpmodel.HdpModel.suggested_lda_model`
+            corresponding lda model from :meth:`~gensim.models.hdpmodel.HdpModel.suggested_lda_model`
 
         """
         self.corpus = corpus
@@ -419,15 +420,16 @@ def __getitem__(self, bow, eps=0.01):
 
         Parameters
         ----------
-        bow : iterable of list of (int, float)
-            Bag-of-words representation of the document to get topics for.
+        bow : {iterable of list of (int, float), list of (int, float)}
+            BoW representation of the document/corpus to get topics for.
         eps : float, optional
             Ignore topics with probability below `eps`.
 
         Returns
         -------
-        list of (int, float)
-            topic distribution for the given document `bow`, as a list of `(topic_id, topic_probability)` 2-tuples.
+        list of (int, float) **or** :class:`gensim.interfaces.TransformedCorpus`
+            Topic distribution for the given document/corpus `bow`, as a list of `(topic_id, topic_probability)` or
+            transformed corpus.
 
         """
         is_corpus, corpus = utils.is_corpus(bow)
@@ -439,16 +441,16 @@ def __getitem__(self, bow, eps=0.01):
 
         return [(topicid, topicvalue) for topicid, topicvalue in enumerate(topic_dist) if topicvalue >= eps]
 
     def update(self, corpus):
-        """Train the model with new documents, by EM-iterating over `corpus` until
-        any of the conditions is satisfied
+        """Train the model with new documents, by EM-iterating over `corpus` until any of the conditions is satisfied.
+
             * time limit expired
             * chunk limit reached
             * whole corpus processed
 
         Parameters
         ----------
-        corpus : list of list of tuple of ints; [ [ (int,int) ]]
-            The corpus on which Hdp model will be updated.
+ corpus : iterable of list of (int, float) + Corpus in BoW format. """ save_freq = max(1, int(10000 / self.chunksize)) # save every 10k docs, roughly @@ -478,14 +480,14 @@ def update(self, corpus): logger.info('PROGRESS: finished document %i of %i', self.m_num_docs_processed, self.m_D) def update_finished(self, start_time, chunks_processed, docs_processed): - """Flag to determine whether the Hdp model has been updated with the new corpus or not. + """Flag to determine whether the model has been updated with the new corpus or not. Parameters ---------- start_time : float - Indicates the current processor time as a floating point number expressed in seconds. The resolution is - typically better on Windows than on Unix by one microsecond due to differing implementation of underlying - function calls. + Indicates the current processor time as a floating point number expressed in seconds. + The resolution is typically better on Windows than on Unix by one microsecond due to differing + implementation of underlying function calls. chunks_processed : int Indicates progress of the update in terms of the number of chunks processed. docs_processed : int @@ -494,7 +496,7 @@ def update_finished(self, start_time, chunks_processed, docs_processed): Returns ------- bool - If True Hdp model is updated, False otherwise. + If True - model is updated, False otherwise. """ return ( @@ -512,17 +514,17 @@ def update_chunk(self, chunk, update=True, opt_o=True): Parameters ---------- - chunk : list of list of tuple of ints; [ [ (int,int) ]] - The chunk of corpus on which Hdp model will be updated. + chunk : iterable of list of (int, float) + Corpus in BoW format. update : bool, optional - If True then update lambda, False don't update lambda. + If True - call :meth:`~gensim.models.hdpmodel.HdpModel.update_lambda`. opt_o : bool, optional - Passed as argument to :meth:`~gensim.models.hdpmodel.HdpModel.update_lambda`. If True then the topics will - be ordered, False otherwise. 
+ Passed as argument to :meth:`~gensim.models.hdpmodel.HdpModel.update_lambda`. + If True then the topics will be ordered, False otherwise. Returns ------- - tuple of (float,int) + (float, int) A tuple of likelihood and sum of all the word counts from each document in the corpus. """ @@ -568,23 +570,23 @@ def update_chunk(self, chunk, update=True, opt_o=True): return score, count def doc_e_step(self, ss, Elogsticks_1st, unique_words, doc_word_ids, doc_word_counts, var_converge): - """Performs e step for a single doc. + """Performs E step for a single doc. Parameters ---------- ss : :class:`~gensim.models.hdpmodel.SuffStats` - Suffstats for all document(s) in the chunk. + Stats for all document(s) in the chunk. Elogsticks_1st : numpy.ndarray Computed Elogsticks value by stick-breaking process. - unique_words : int + unique_words : dict of (int, int) Number of unique words in the chunk. - doc_word_ids : tuple of int + doc_word_ids : iterable of int Word ids of for a single document. - doc_word_counts : tuple of int + doc_word_counts : iterable of int Word counts of all words in a single document. - var_converge : float, optional - Lower bound on the right side of convergence. Used when updating variational parameters for a - single document. + var_converge : float + Lower bound on the right side of convergence. Used when updating variational parameters for a single + document. Returns ------- @@ -674,17 +676,16 @@ def doc_e_step(self, ss, Elogsticks_1st, unique_words, doc_word_ids, doc_word_co return likelihood def update_lambda(self, sstats, word_list, opt_o): - """Updates appropriate columns of lambda and top level sticks based on documents. + """Update appropriate columns of lambda and top level sticks based on documents. Parameters ---------- sstats : :class:`~gensim.models.hdpmodel.SuffStats` - Suffstats for all document(s) in the chunk. + Statistic for all document(s) in the chunk. 
word_list : list of int Contains word id of all the unique words in the chunk of documents on which update is being performed. opt_o : bool, optional - If True invokes a call to :meth:`~gensim.models.hdpmodel.HdpModel.optimal_ordering` to order the topics, - False otherwise. + If True - invokes a call to :meth:`~gensim.models.hdpmodel.HdpModel.optimal_ordering` to order the topics. """ self.m_status_up_to_date = False @@ -748,19 +749,19 @@ def show_topic(self, topic_id, topn=20, log=False, formatted=False, num_words=No topn : int, optional Number of most probable words to show from given `topic_id`. log : bool, optional - If True logs a message with level INFO on the logger object, False otherwise + If True - logs a message with level INFO on the logger object. formatted : bool, optional - If True return the topics as a list of strings, False return the topics as lists of (weight, word) pairs. + If True - get the topics as a list of strings, otherwise - get the topics as lists of (weight, word) pairs. num_words : int, optional - Number of most probable words to show from given `topic_id`. + DEPRECATED, USE `topn` INSTEAD. - Notes - ----- + Warnings + -------- The parameter `num_words` is deprecated, will be removed in 4.0.0, please use `topn` instead. Returns ------- - list of tuple of (unicode,numpy.float64) or list of str + list of (str, numpy.float) **or** list of str Topic terms output displayed whose format depends on `formatted` parameter. """ @@ -790,23 +791,21 @@ def get_topics(self): def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): """Print the `num_words` most probable words for `num_topics` number of topics. - Set `num_topics=-1` to print all topics.Set `formatted=True` to return the topics as a list of strings, or - `False` as lists of (word, weight) pairs. Parameters ---------- num_topics : int, optional - Number of topics for which most probable `num_words` words will be fetched. 
+ Number of topics for which most probable `num_words` words will be fetched, if -1 - print all topics. num_words : int, optional Number of most probable words to show from `num_topics` number of topics. log : bool, optional - If True logs a message with level INFO on the logger object, False otherwise. + If True - log a message with level INFO on the logger object. formatted : bool, optional - If True return the topics as a list of strings, False return the topics as lists of (word, weight) pairs. + If True - get the topics as a list of strings, otherwise - get the topics as lists of (weight, word) pairs. Returns ------- - list of tuple of (unicode,numpy.float64) or list of str + list of (str, numpy.float) **or** list of str Output format for topic terms depends on the value of `formatted` parameter. """ @@ -817,11 +816,11 @@ def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): return hdp_formatter.show_topics(num_topics, num_words, log, formatted) def save_topics(self, doc_count=None): - """Saves all the topics discovered. + """Save discovered topics. - Notes - ----- - This is a legacy method; use `self.save()` instead. + Warnings + -------- + This method is deprecated, use :meth:`~gensim.models.hdpmodel.HdpModel.save` instead. Parameters ---------- @@ -842,11 +841,11 @@ def save_topics(self, doc_count=None): np.savetxt(fname, betas) def save_options(self): - """Writes all the values of the attributes for the current model in options.dat file. + """Writes all the values of the attributes for the current model in "options.dat" file. - Notes - ----- - This is a legacy method; use `self.save()` instead. + Warnings + -------- + This method is deprecated, use :meth:`~gensim.models.hdpmodel.HdpModel.save` instead. 
""" if not self.outputdir: @@ -867,12 +866,12 @@ def save_options(self): fout.write('gamma: %s\n' % str(self.m_gamma)) def hdp_to_lda(self): - """Only returns corresponding alpha, beta values of a LDA almost equivalent to current HDP. + """Get corresponding alpha and beta values of a LDA almost equivalent to current HDP. Returns ------- - tuple of numpy.ndarray - Tuple of numpy arrays of alpha and beta. + (numpy.ndarray, numpy.ndarray) + Alpha and Beta arrays. """ # alpha @@ -891,8 +890,9 @@ def hdp_to_lda(self): return alpha, beta def suggested_lda_model(self): - """Returns a trained ldamodel object which is closest to the current hdp model.The num_topics is m_T - (default is 150) so as to preserve the matrice shapes when we assign alpha and beta. + """Get a trained ldamodel object which is closest to the current hdp model. + + The `num_topics=m_T`, so as to preserve the matrices shapes when we assign alpha and beta. Returns ------- @@ -912,8 +912,8 @@ def evaluate_test_corpus(self, corpus): Parameters ---------- - corpus : list of list of tuple of ints; [ [ (int,int) ]] - The corpus on which Hdp model will be tested. + corpus : iterable of list of (int, float) + Test corpus in BoW format. Returns ------- From 0256d7cff5715c8ee65dbfe06ac68cc5129f343b Mon Sep 17 00:00:00 2001 From: ivan Date: Mon, 2 Apr 2018 19:56:58 +0500 Subject: [PATCH 16/16] fix hdpmodel[4] --- gensim/models/hdpmodel.py | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/gensim/models/hdpmodel.py b/gensim/models/hdpmodel.py index 1fc914f46c..6d0bfbce56 100755 --- a/gensim/models/hdpmodel.py +++ b/gensim/models/hdpmodel.py @@ -14,13 +14,12 @@ `_. The core estimation code is directly adapted from the `blei-lab/online-hdp `_ -from `Wang, Paisley, Blei: Online Variational Inference for the Hierarchical Dirichlet Process, JMLR (2011) +from `Wang, Paisley, Blei: "Online Variational Inference for the Hierarchical Dirichlet Process", JMLR (2011) `_. 
Examples -------- - Train :class:`~gensim.models.hdpmodel.HdpModel` >>> from gensim.test.utils import common_corpus, common_dictionary @@ -56,6 +55,8 @@ from gensim.matutils import dirichlet_expectation from gensim.models import basemodel, ldamodel +from gensim.utils import deprecated + logger = logging.getLogger(__name__) meanchangethresh = 0.00001 @@ -815,6 +816,7 @@ def show_topics(self, num_topics=20, num_words=20, log=False, formatted=True): hdp_formatter = HdpTopicFormatter(self.id2word, betas) return hdp_formatter.show_topics(num_topics, num_words, log, formatted) + @deprecated("This method will be removed in 4.0.0, use `save` instead.") def save_topics(self, doc_count=None): """Save discovered topics. @@ -840,6 +842,7 @@ def save_topics(self, doc_count=None): betas = self.m_lambda + self.m_eta np.savetxt(fname, betas) + @deprecated("This method will be removed in 4.0.0, use `save` instead.") def save_options(self): """Writes all the values of the attributes for the current model in "options.dat" file.