-
Notifications
You must be signed in to change notification settings - Fork 1
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Includes docs for new module types added (Gensim) and example pipeline.
- Loading branch information
Showing
17 changed files
with
382 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,7 @@ | ||
strings | ||
======= | ||
|
||
.. automodule:: pimlico.datatypes.corpora.strings | ||
:members: | ||
:undoc-members: | ||
:show-inheritance: |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,3 +1,4 @@ | ||
example-pipeline-empty-test | ||
example-pipeline-train-tms-example pimlico.modules.corpora.store, pimlico.modules.spacy.tokenize, pimlico.modules.text.normalize, pimlico.modules.corpora.vocab_builder, pimlico.modules.corpora.vocab_mapper, pimlico.modules.gensim.lda, pimlico.modules.gensim.ldaseq, pimlico.modules.gensim.ldaseq_doc_topics | ||
example-pipeline-custom-module-example pimlico.modules.spacy.tokenize, pimlico.modules.corpora.vocab_builder | ||
example-pipeline-tokenize-example pimlico.modules.text.simple_tokenize |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,130 @@ | ||
.. _example-pipeline-train-tms-example: | ||
|
||
train\_tms\_example | ||
~~~~~~~~~~~~~~~~~~~ | ||
|
||
|
||
|
||
This is an example Pimlico pipeline. | ||
|
||
The complete config file for this example pipeline is below. `Source file <https://github.com/markgw/pimlico/blob/master/examples/topic_modelling/train_tms.conf>`_ | ||
|
||
An example pipeline that loads some textual data and trains topic | ||
models on it using Gensim. | ||
|
||
See the ``src/`` subdirectory for the code of the custom module types used by this pipeline. | ||
|
||
Pipeline config | ||
=============== | ||
|
||
.. code-block:: ini | ||
[pipeline] | ||
name=train_tms_example | ||
release=latest | ||
# We need a path to Python code here, since we use a custom module type | ||
python_path=src/ | ||
[vars] | ||
# Here we define where the example input corpus can be found | ||
corpus_path=%(pimlico_root)s/examples/data/input/ubuntu_dialogue/dialogues_small.json | ||
# Read in the raw text from the JSON files | ||
[input_text] | ||
type=tm_example.modules.input.ubuntu_dialogue | ||
path=%(corpus_path)s | ||
# Just use a small number of documents so we can train fast | ||
# You should use a much bigger corpus for a real model | ||
limit=600 | ||
# Also read in a label for each document consisting of the year+month from | ||
# the timestamp | ||
[input_labels] | ||
type=tm_example.modules.input.ubuntu_dialogue_months | ||
path=%(corpus_path)s | ||
limit=600 | ||
[store_labels] | ||
type=pimlico.modules.corpora.store | ||
input=input_labels | ||
# Tokenize the text using spaCy's tokenizer | ||
[tokenize] | ||
type=pimlico.modules.spacy.tokenize | ||
input=input_text | ||
# Apply simple text normalization | ||
# In a real topic modelling application, you might want to do lemmatization | ||
# or other types of more sophisticated normalization here | ||
[normalize] | ||
type=pimlico.modules.text.normalize | ||
case=lower | ||
min_word_length=3 | ||
remove_empty=T | ||
remove_only_punct=T | ||
# Build a vocabulary from the words used in the corpus | ||
# This is used to map words in the corpus to IDs | ||
[vocab] | ||
type=pimlico.modules.corpora.vocab_builder | ||
input=normalize | ||
# Only include words that occur at least 5 times | ||
threshold=5 | ||
[ids] | ||
type=pimlico.modules.corpora.vocab_mapper | ||
input_vocab=vocab | ||
input_text=normalize | ||
# Skip any OOV words (below the threshold) | ||
oov=ignore | ||
# First train a plain LDA model using Gensim | ||
[lda] | ||
type=pimlico.modules.gensim.lda | ||
input_vocab=vocab | ||
input_corpus=ids | ||
tfidf=T | ||
# Small number of topics: you probably want more in practice | ||
num_topics=5 | ||
passes=10 | ||
# Also train a dynamic topic model (DTM), with a separate model | ||
# for each month | ||
[dtm] | ||
type=pimlico.modules.gensim.ldaseq | ||
input_corpus=ids | ||
input_labels=input_labels | ||
input_vocab=vocab | ||
# Small number of topics: you probably want more in practice | ||
num_topics=5 | ||
# Apply TF-IDF transformation to bags of words before training | ||
tfidf=T | ||
# Speed up training for this demo by reducing iterations | ||
em_min_iter=3 | ||
em_max_iter=8 | ||
# Apply stationary DTM inference to all of the documents | ||
# This doesn't need to be run on the same document set we trained on: | ||
# we do that here just as an example | ||
[dtm_infer] | ||
type=pimlico.modules.gensim.ldaseq_doc_topics | ||
input_corpus=ids | ||
input_labels=input_labels | ||
input_model=dtm | ||
Modules | ||
======= | ||
|
||
|
||
The following Pimlico module types are used in this pipeline: | ||
|
||
* :mod:`pimlico.modules.corpora.store` | ||
* :mod:`pimlico.modules.spacy.tokenize` | ||
* :mod:`pimlico.modules.text.normalize` | ||
* :mod:`pimlico.modules.corpora.vocab_builder` | ||
* :mod:`pimlico.modules.corpora.vocab_mapper` | ||
* :mod:`pimlico.modules.gensim.lda` | ||
* :mod:`pimlico.modules.gensim.ldaseq` | ||
* :mod:`pimlico.modules.gensim.ldaseq_doc_topics` | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,77 @@ | ||
LDA\-seq \(DTM\) document topic analysis | ||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ | ||
|
||
.. py:module:: pimlico.modules.gensim.ldaseq_doc_topics | ||
+------------+------------------------------------------+ | ||
| Path | pimlico.modules.gensim.ldaseq_doc_topics | | ||
+------------+------------------------------------------+ | ||
| Executable | yes | | ||
+------------+------------------------------------------+ | ||
|
||
Takes a trained DTM model and produces the topic vector for every document in a corpus. | ||
|
||
The corpus is given as integer list documents, which contain the integer IDs of the words | ||
in each sentence of each document. It is assumed that the corpus uses the same vocabulary | ||
to map to integer IDs as the DTM model's training corpus, so no further mapping needs to | ||
be done. | ||
|
||
We also require a corpus of labels to say what time slice each document is in. These | ||
should be from the same set of labels that the DTM model was trained on, so that each | ||
document label can be mapped to a trained slice. | ||
|
||
Does not support Python 2 since Gensim has dropped Python 2 support. | ||
|
||
|
||
*This module does not support Python 2, so can only be used when Pimlico is being run under Python 3* | ||
|
||
Inputs | ||
====== | ||
|
||
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ||
| Name | Type(s) | | ||
+========+=========================================================================================================================================================================+ | ||
| corpus | :class:`grouped_corpus <pimlico.datatypes.corpora.grouped.GroupedCorpus>` <:class:`IntegerListsDocumentType <pimlico.datatypes.corpora.ints.IntegerListsDocumentType>`> | | ||
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ||
| labels | :class:`grouped_corpus <pimlico.datatypes.corpora.grouped.GroupedCorpus>` <:class:`LabelDocumentType <pimlico.datatypes.corpora.strings.LabelDocumentType>`> | | ||
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ||
| model | :class:`ldaseq_model <pimlico.datatypes.gensim.GensimLdaSeqModel>` | | ||
+--------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ||
|
||
Outputs | ||
======= | ||
|
||
+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ||
| Name | Type(s) | | ||
+=========+===============================================================================================================================================================+ | ||
| vectors | :class:`grouped_corpus <pimlico.datatypes.corpora.grouped.GroupedCorpus>` <:class:`VectorDocumentType <pimlico.datatypes.corpora.floats.VectorDocumentType>`> | | ||
+---------+---------------------------------------------------------------------------------------------------------------------------------------------------------------+ | ||
|
||
Example config | ||
============== | ||
|
||
This is an example of how this module can be used in a pipeline config file. | ||
|
||
.. code-block:: ini | ||
[my_ldaseq_doc_topics_module] | ||
type=pimlico.modules.gensim.ldaseq_doc_topics | ||
input_corpus=module_a.some_output | ||
input_labels=module_a.some_output | ||
input_model=module_a.some_output | ||
Example pipelines | ||
================= | ||
|
||
This module is used by the following :ref:`example pipelines <example-pipelines>`. They are examples of how the module can be used together with other modules in a larger pipeline. | ||
|
||
* :ref:`example-pipeline-train-tms-example` | ||
|
||
Test pipelines | ||
============== | ||
|
||
This module is used by the following :ref:`test pipelines <test-pipelines>`. They are a further source of examples of the module's usage. | ||
|
||
* :ref:`test-config-gensim-dtm_infer.conf` | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
.. _test-config-gensim-dtm_infer.conf: | ||
|
||
dtm\_infer | ||
~~~~~~~~~~ | ||
|
||
|
||
|
||
This is one of the test pipelines included in Pimlico's repository. | ||
See :ref:`test-pipelines` for more details. | ||
|
||
Config file | ||
=========== | ||
|
||
The complete config file for this test pipeline: | ||
|
||
|
||
.. code-block:: ini | ||
# Take a trained DTM model and perform inference on other docs | ||
# | ||
# For a fuller example (on which this test is based), see | ||
# :doc:`the topic model training example </example_config/topic_modelling.train_tms>`. | ||
[pipeline] | ||
name=dtm_infer | ||
release=latest | ||
# Load word IDs | ||
[ids] | ||
type=pimlico.datatypes.corpora.GroupedCorpus | ||
data_point_type=IntegerListsDocumentType | ||
dir=%(test_data_dir)s/datasets/corpora/ids_ubuntu | ||
# Load slice labels | ||
[labels] | ||
type=pimlico.datatypes.corpora.GroupedCorpus | ||
data_point_type=LabelDocumentType | ||
dir=%(test_data_dir)s/datasets/corpora/labels_ubuntu | ||
# Load a trained DTM model | ||
[dtm] | ||
type=pimlico.datatypes.gensim.GensimLdaSeqModel | ||
dir=%(test_data_dir)s/datasets/dtm_model | ||
# Apply stationary DTM inference to all of the documents | ||
# This doesn't need to be run on the same document set we trained on: | ||
# we do that here just as an example | ||
[dtm_infer] | ||
type=pimlico.modules.gensim.ldaseq_doc_topics | ||
input_corpus=ids | ||
input_labels=labels | ||
input_model=dtm | ||
Modules | ||
======= | ||
|
||
|
||
The following Pimlico module types are used in this pipeline: | ||
|
||
* :mod:`pimlico.modules.gensim.ldaseq_doc_topics` | ||
|
||
|
Oops, something went wrong.