Rebuilt docs

Includes docs for new module types added (Gensim) and example pipeline.
markgw · Aug 12, 2020 · 1600358 · 1600358
1 parent ee8eac6
commit 1600358
Show file tree

Hide file tree

Showing 17 changed files with 382 additions and 0 deletions.
diff --git a/docs/api/pimlico.datatypes.corpora.rst b/docs/api/pimlico.datatypes.corpora.rst
@@ -19,6 +19,7 @@ Submodules
    pimlico.datatypes.corpora.grouped
    pimlico.datatypes.corpora.ints
    pimlico.datatypes.corpora.json
+   pimlico.datatypes.corpora.strings
    pimlico.datatypes.corpora.table
    pimlico.datatypes.corpora.tokenized
    pimlico.datatypes.corpora.word_annotations

diff --git a/docs/api/pimlico.datatypes.corpora.strings.rst b/docs/api/pimlico.datatypes.corpora.strings.rst
@@ -0,0 +1,7 @@
+strings
+=======
+
+.. automodule:: pimlico.datatypes.corpora.strings
+    :members:
+    :undoc-members:
+    :show-inheritance:
diff --git a/docs/example_config/index.rst b/docs/example_config/index.rst
@@ -18,6 +18,7 @@ Available pipelines
    :titlesonly:
 
    empty.rst
+   topic_modelling.train_tms.rst
    simple.custom_module.rst
    simple.tokenize.rst
 

diff --git a/docs/example_config/module_list.tsv b/docs/example_config/module_list.tsv
@@ -1,3 +1,4 @@
 example-pipeline-empty-test	
+example-pipeline-train-tms-example	pimlico.modules.corpora.store, pimlico.modules.spacy.tokenize, pimlico.modules.text.normalize, pimlico.modules.corpora.vocab_builder, pimlico.modules.corpora.vocab_mapper, pimlico.modules.gensim.lda, pimlico.modules.gensim.ldaseq, pimlico.modules.gensim.ldaseq_doc_topics
 example-pipeline-custom-module-example	pimlico.modules.spacy.tokenize, pimlico.modules.corpora.vocab_builder
 example-pipeline-tokenize-example	pimlico.modules.text.simple_tokenize
diff --git a/docs/example_config/topic_modelling.train_tms.rst b/docs/example_config/topic_modelling.train_tms.rst
@@ -0,0 +1,130 @@
+.. _example-pipeline-train-tms-example:
+
+train\_tms\_example
+~~~~~~~~~~~~~~~~~~~
+
+
+
+This is an example Pimlico pipeline.
+
+The complete config file for this example pipeline is below. `Source file <https://github.com/markgw/pimlico/blob/master/examples/topic_modelling/train_tms.conf>`_
+
+An example pipeline that loads some textual data and trains topic
+models on it using Gensim.
+
+See the ``src/`` subdirectory for the code of the custom module types used in this pipeline.
+
+Pipeline config
+===============
+
+.. code-block:: ini
+   
+   [pipeline]
+   name=train_tms_example
+   release=latest
+   # We need a path to Python code here, since we use a custom module type
+   python_path=src/
+   
+   [vars]
+   # Here we define where the example input corpus can be found
+   corpus_path=%(pimlico_root)s/examples/data/input/ubuntu_dialogue/dialogues_small.json
+   
+   # Read in the raw text from the JSON files
+   [input_text]
+   type=tm_example.modules.input.ubuntu_dialogue
+   path=%(corpus_path)s
+   # Just use a small number of documents so we can train fast
+   # You should use a much bigger corpus for a real model
+   limit=600
+   
+   # Also read in a label for each document consisting of the year+month from
+   # the timestamp
+   [input_labels]
+   type=tm_example.modules.input.ubuntu_dialogue_months
+   path=%(corpus_path)s
+   limit=600
+   
+   [store_labels]
+   type=pimlico.modules.corpora.store
+   input=input_labels
+   
+   # Tokenize the text using spaCy's tokenizer
+   [tokenize]
+   type=pimlico.modules.spacy.tokenize
+   input=input_text
+   
+   # Apply simple text normalization
+   # In a real topic modelling application, you might want to do lemmatization
+   #  or other types of more sophisticated normalization here
+   [normalize]
+   type=pimlico.modules.text.normalize
+   case=lower
+   min_word_length=3
+   remove_empty=T
+   remove_only_punct=T
+   
+   # Build a vocabulary from the words used in the corpus
+   # This is used to map words in the corpus to IDs
+   [vocab]
+   type=pimlico.modules.corpora.vocab_builder
+   input=normalize
+   # Only include words that occur at least 5 times
+   threshold=5
+   
+   [ids]
+   type=pimlico.modules.corpora.vocab_mapper
+   input_vocab=vocab
+   input_text=normalize
+   # Skip any OOV words (below the threshold)
+   oov=ignore
+   
+   # First train a plain LDA model using Gensim
+   [lda]
+   type=pimlico.modules.gensim.lda
+   input_vocab=vocab
+   input_corpus=ids
+   tfidf=T
+   # Small number of topics: you probably want more in practice
+   num_topics=5
+   passes=10
+   
+   # Also train a dynamic topic model (DTM), with a separate model
+   # for each month
+   [dtm]
+   type=pimlico.modules.gensim.ldaseq
+   input_corpus=ids
+   input_labels=input_labels
+   input_vocab=vocab
+   # Small number of topics: you probably want more in practice
+   num_topics=5
+   # Apply TF-IDF transformation to bags of words before training
+   tfidf=T
+   # Speed up training for this demo by reducing iterations
+   em_min_iter=3
+   em_max_iter=8
+   
+   # Apply stationary DTM inference to all of the documents
+   # This doesn't need to be run on the same document set we trained on:
+   #  we do that here just as an example
+   [dtm_infer]
+   type=pimlico.modules.gensim.ldaseq_doc_topics
+   input_corpus=ids
+   input_labels=input_labels
+   input_model=dtm
+
+Modules
+=======
+
+
+The following Pimlico module types are used in this pipeline:
+
+ * :mod:`pimlico.modules.corpora.store`
+ * :mod:`pimlico.modules.spacy.tokenize`
+ * :mod:`pimlico.modules.text.normalize`
+ * :mod:`pimlico.modules.corpora.vocab_builder`
+ * :mod:`pimlico.modules.corpora.vocab_mapper`
+ * :mod:`pimlico.modules.gensim.lda`
+ * :mod:`pimlico.modules.gensim.ldaseq`
+ * :mod:`pimlico.modules.gensim.ldaseq_doc_topics`
+
+
diff --git a/docs/modules/pimlico.modules.corpora.store.rst b/docs/modules/pimlico.modules.corpora.store.rst
@@ -49,6 +49,13 @@ This is an example of how this module can be used in a pipeline config file.
    input_corpus=module_a.some_output
    
 
+Example pipelines
+=================
+
+This module is used by the following :ref:`example pipelines <example-pipelines>`. They are examples of how the module can be used together with other modules in a larger pipeline.
+
+ * :ref:`example-pipeline-train-tms-example`
+
 Test pipelines
 ==============
 

diff --git a/docs/modules/pimlico.modules.corpora.vocab_builder.rst b/docs/modules/pimlico.modules.corpora.vocab_builder.rst
@@ -84,6 +84,7 @@ Example pipelines
 
 This module is used by the following :ref:`example pipelines <example-pipelines>`. They are examples of how the module can be used together with other modules in a larger pipeline.
 
+ * :ref:`example-pipeline-train-tms-example`
  * :ref:`example-pipeline-custom-module-example`
 
 Test pipelines

diff --git a/docs/modules/pimlico.modules.corpora.vocab_mapper.rst b/docs/modules/pimlico.modules.corpora.vocab_mapper.rst
@@ -80,6 +80,13 @@ This example usage includes more options.
    oov=value
    row_length_bytes=2
 
+Example pipelines
+=================
+
+This module is used by the following :ref:`example pipelines <example-pipelines>`. They are examples of how the module can be used together with other modules in a larger pipeline.
+
+ * :ref:`example-pipeline-train-tms-example`
+
 Test pipelines
 ==============
 

diff --git a/docs/modules/pimlico.modules.gensim.lda.rst b/docs/modules/pimlico.modules.gensim.lda.rst
@@ -122,3 +122,10 @@ This example usage includes more options.
    tfidf=F
    update_every=1
 
+Example pipelines
+=================
+
+This module is used by the following :ref:`example pipelines <example-pipelines>`. They are examples of how the module can be used together with other modules in a larger pipeline.
+
+ * :ref:`example-pipeline-train-tms-example`
+
diff --git a/docs/modules/pimlico.modules.gensim.ldaseq_doc_topics.rst b/docs/modules/pimlico.modules.gensim.ldaseq_doc_topics.rst
@@ -0,0 +1,77 @@
+LDA\-seq \(DTM\) document topic analysis
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. py:module:: pimlico.modules.gensim.ldaseq_doc_topics
+
++------------+------------------------------------------+
+| Path       | pimlico.modules.gensim.ldaseq_doc_topics |
++------------+------------------------------------------+
+| Executable | yes                                      |
++------------+------------------------------------------+
+
+Takes a trained DTM model and produces the topic vector for every document in a corpus.
+
+The corpus is given as integer-list documents, which contain the integer IDs of the words
+in each sentence of each document. It is assumed that the corpus uses the same vocabulary
+to map to integer IDs as the DTM model's training corpus, so no further mapping needs to
+be done.
+
+We also require a corpus of labels to say what time slice each document is in. These
+should be from the same set of labels that the DTM model was trained on, so that each
+document label can be mapped to a trained slice.
+
+Does not support Python 2 since Gensim has dropped Python 2 support.
+
+
+*This module does not support Python 2, so can only be used when Pimlico is being run under Python 3*
+
+Inputs
+======
+
++--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| Name   | Type(s)                                                                                                                                                                 |
++========+=========================================================================================================================================================================+
+| corpus | :class:`grouped_corpus <pimlico.datatypes.corpora.grouped.GroupedCorpus>` <:class:`IntegerListsDocumentType <pimlico.datatypes.corpora.ints.IntegerListsDocumentType>`> |
++--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| labels | :class:`grouped_corpus <pimlico.datatypes.corpora.grouped.GroupedCorpus>` <:class:`LabelDocumentType <pimlico.datatypes.corpora.strings.LabelDocumentType>`>            |
++--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| model  | :class:`ldaseq_model <pimlico.datatypes.gensim.GensimLdaSeqModel>`                                                                                                      |
++--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
+
+Outputs
+=======
+
++---------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
+| Name    | Type(s)                                                                                                                                                       |
++=========+===============================================================================================================================================================+
+| vectors | :class:`grouped_corpus <pimlico.datatypes.corpora.grouped.GroupedCorpus>` <:class:`VectorDocumentType <pimlico.datatypes.corpora.floats.VectorDocumentType>`> |
++---------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+
+
+Example config
+==============
+
+This is an example of how this module can be used in a pipeline config file.
+
+.. code-block:: ini
+   
+   [my_ldaseq_doc_topics_module]
+   type=pimlico.modules.gensim.ldaseq_doc_topics
+   input_corpus=module_a.some_output
+   input_labels=module_a.some_output
+   input_model=module_a.some_output
+   
+
+Example pipelines
+=================
+
+This module is used by the following :ref:`example pipelines <example-pipelines>`. They are examples of how the module can be used together with other modules in a larger pipeline.
+
+ * :ref:`example-pipeline-train-tms-example`
+
+Test pipelines
+==============
+
+This module is used by the following :ref:`test pipelines <test-pipelines>`. They are a further source of examples of the module's usage.
+
+ * :ref:`test-config-gensim-dtm_infer.conf`
+
diff --git a/docs/modules/pimlico.modules.gensim.rst b/docs/modules/pimlico.modules.gensim.rst
@@ -16,3 +16,4 @@ from `Gensim <https://radimrehurek.com/gensim/>`_.
 
    pimlico.modules.gensim.lda
    pimlico.modules.gensim.lda_doc_topics
+   pimlico.modules.gensim.ldaseq_doc_topics
diff --git a/docs/modules/pimlico.modules.spacy.tokenize.rst b/docs/modules/pimlico.modules.spacy.tokenize.rst
@@ -69,6 +69,7 @@ Example pipelines
 
 This module is used by the following :ref:`example pipelines <example-pipelines>`. They are examples of how the module can be used together with other modules in a larger pipeline.
 
+ * :ref:`example-pipeline-train-tms-example`
  * :ref:`example-pipeline-custom-module-example`
 
 Test pipelines

diff --git a/docs/modules/pimlico.modules.text.normalize.rst b/docs/modules/pimlico.modules.text.normalize.rst
@@ -88,6 +88,13 @@ This example usage includes more options.
    remove_only_punct=F
    remove_punct=F
 
+Example pipelines
+=================
+
+This module is used by the following :ref:`example pipelines <example-pipelines>`. They are examples of how the module can be used together with other modules in a larger pipeline.
+
+ * :ref:`example-pipeline-train-tms-example`
+
 Test pipelines
 ==============
 

diff --git a/docs/test_config/gensim.dtm_infer.conf.rst b/docs/test_config/gensim.dtm_infer.conf.rst
@@ -0,0 +1,63 @@
+.. _test-config-gensim-dtm_infer.conf:
+
+dtm\_infer
+~~~~~~~~~~
+
+
+
+This is one of the test pipelines included in Pimlico's repository.
+See :ref:`test-pipelines` for more details.
+
+Config file
+===========
+
+The complete config file for this test pipeline:
+
+
+.. code-block:: ini
+   
+   # Take a trained DTM model and perform inference on other docs
+   #
+   # For a fuller example (on which this test is based), see
+   # :doc:`the topic model training example </example_config/topic_modelling.train_tms>`.
+   
+   [pipeline]
+   name=dtm_infer
+   release=latest
+   
+   # Load word IDs
+   [ids]
+   type=pimlico.datatypes.corpora.GroupedCorpus
+   data_point_type=IntegerListsDocumentType
+   dir=%(test_data_dir)s/datasets/corpora/ids_ubuntu
+   
+   # Load slice labels
+   [labels]
+   type=pimlico.datatypes.corpora.GroupedCorpus
+   data_point_type=LabelDocumentType
+   dir=%(test_data_dir)s/datasets/corpora/labels_ubuntu
+   
+   # Load a trained DTM model
+   [dtm]
+   type=pimlico.datatypes.gensim.GensimLdaSeqModel
+   dir=%(test_data_dir)s/datasets/dtm_model
+   
+   # Apply stationary DTM inference to all of the documents
+   # This doesn't need to be run on the same document set we trained on:
+   #  we do that here just as an example
+   [dtm_infer]
+   type=pimlico.modules.gensim.ldaseq_doc_topics
+   input_corpus=ids
+   input_labels=labels
+   input_model=dtm
+
+
+Modules
+=======
+
+
+The following Pimlico module types are used in this pipeline:
+
+ * :mod:`pimlico.modules.gensim.ldaseq_doc_topics`
+
+