Rebuilt docs

markgw · Aug 5, 2019 · c28ba18 · c28ba18
1 parent 4adbf4e
commit c28ba18
Show file tree

Hide file tree

Showing 19 changed files with 229 additions and 102 deletions.
diff --git a/docs/modules/pimlico.modules.corpora.group.rst b/docs/modules/pimlico.modules.corpora.group.rst
@@ -56,10 +56,10 @@ Options
 +------------------+------------------------------------------------------------------------------------------------------+--------+
 | Name             | Description                                                                                          | Type   |
 +==================+======================================================================================================+========+
-| archive_size     | Number of documents to include in each archive (default: 1k)                                         | int    |
-+------------------+------------------------------------------------------------------------------------------------------+--------+
 | archive_basename | Base name to use for archive tar files. The archive number is appended to this. (Default: 'archive') | string |
 +------------------+------------------------------------------------------------------------------------------------------+--------+
+| archive_size     | Number of documents to include in each archive (default: 1k)                                         | int    |
++------------------+------------------------------------------------------------------------------------------------------+--------+
 
 Example config
 ==============
@@ -80,8 +80,8 @@ This example usage includes more options.
    [my_group_module]
    type=pimlico.modules.corpora.group
    input_documents=module_a.some_output
-   archive_size=1000
    archive_basename=archive
+   archive_size=1000
 
 Test pipelines
 ==============

diff --git a/docs/modules/pimlico.modules.corpora.interleave.rst b/docs/modules/pimlico.modules.corpora.interleave.rst
@@ -49,10 +49,10 @@ Options
 +------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+--------+
 | Name             | Description                                                                                                                                     | Type   |
 +==================+=================================================================================================================================================+========+
-| archive_size     | Documents are regrouped into new archives. Number of documents to include in each archive (default: 1k)                                         | string |
-+------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+--------+
 | archive_basename | Documents are regrouped into new archives. Base name to use for archive tar files. The archive number is appended to this. (Default: 'archive') | string |
 +------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+--------+
+| archive_size     | Documents are regrouped into new archives. Number of documents to include in each archive (default: 1k)                                         | string |
++------------------+-------------------------------------------------------------------------------------------------------------------------------------------------+--------+
 
 Example config
 ==============
@@ -73,8 +73,8 @@ This example usage includes more options.
    [my_interleave_module]
    type=pimlico.modules.corpora.interleave
    input_corpora=module_a.some_output
-   archive_size=1000
    archive_basename=archive
+   archive_size=1000
 
 Test pipelines
 ==============

diff --git a/docs/modules/pimlico.modules.corpora.shuffle.rst b/docs/modules/pimlico.modules.corpora.shuffle.rst
@@ -54,10 +54,10 @@ Options
 +====================+============================================================================================================================================================================================================================================================================================================================+========+
 | archive_basename   | Basename to use for archives in the output corpus. Default: 'archive'                                                                                                                                                                                                                                                      | string |
 +--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
-| keep_archive_names | By default, it is assumed that all doc names are unique to the whole corpus, so the same doc names are used once the documents are put into their new archives. If doc names are only unique within the input archives, use this and the input archive names will be included in the output document names. Default: False | bool   |
-+--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
 | bin_size           | Target expected size of temporary bins into which documents are shuffled. The actual size may vary, but they will on average have this size. Default: 100                                                                                                                                                                  | int    |
 +--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
+| keep_archive_names | By default, it is assumed that all doc names are unique to the whole corpus, so the same doc names are used once the documents are put into their new archives. If doc names are only unique within the input archives, use this and the input archive names will be included in the output document names. Default: False | bool   |
++--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
 | num_bins           | Directly set the number of temporary bins to put document into. If set, bin_size is ignored                                                                                                                                                                                                                                | int    |
 +--------------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------+
 
@@ -81,8 +81,8 @@ This example usage includes more options.
    type=pimlico.modules.corpora.shuffle
    input_corpus=module_a.some_output
    archive_basename=archive
-   keep_archive_names=F
    bin_size=100
+   keep_archive_names=F
    num_bins=0
 
 Test pipelines

diff --git a/docs/modules/pimlico.modules.corpora.subset.rst b/docs/modules/pimlico.modules.corpora.subset.rst
@@ -47,10 +47,10 @@ Options
 +--------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
 | Name         | Description                                                                                                                                                                                                                                  | Type |
 +==============+==============================================================================================================================================================================================================================================+======+
-| size         | (required) Number of documents to include                                                                                                                                                                                                    | int  |
-+--------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
 | offset       | Number of documents to skip at the beginning of the corpus (default: 0, start at beginning)                                                                                                                                                  | int  |
 +--------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
+| size         | (required) Number of documents to include                                                                                                                                                                                                    | int  |
++--------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
 | skip_invalid | Skip over any invalid documents so that the output subset contains the chosen number of (valid) documents (or as many as possible) and no invalid ones. By default, invalid documents are passed through and counted towards the subset size | bool |
 +--------------+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------+
 
@@ -73,8 +73,8 @@ This example usage includes more options.
    [my_subset_module]
    type=pimlico.modules.corpora.subset
    input_corpus=module_a.some_output
-   size=100
    offset=0
+   size=100
    skip_invalid=T
 
 Test pipelines

diff --git a/docs/modules/pimlico.modules.corpora.vocab_builder.rst b/docs/modules/pimlico.modules.corpora.vocab_builder.rst
@@ -40,17 +40,17 @@ Options
 +-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+
 | Name      | Description                                                                                                                                                                                                                                                                                                                                                  | Type                            |
 +===========+==============================================================================================================================================================================================================================================================================================================================================================+=================================+
+| include   | Ensure that certain words are always included in the vocabulary, even if they don't make it past the various filters, or are never seen in the corpus. Give as a comma-separated list                                                                                                                                                                        | comma-separated list of strings |
++-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+
 | limit     | Limit vocab size to this number of most common entries (after other filters)                                                                                                                                                                                                                                                                                 | int                             |
 +-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+
-| threshold | Minimum number of occurrences required of a term to be included                                                                                                                                                                                                                                                                                              | int                             |
+| max_prop  | Include terms that occur in max this proportion of documents                                                                                                                                                                                                                                                                                                 | float                           |
 +-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+
 | oov       | Use the final index to represent chars that will be out of vocabulary after applying threshold/limit filters. Applied even if the count is 0. Represent OOVs using the given string in the vocabulary                                                                                                                                                        | string                          |
 +-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+
 | prune_at  | Prune the dictionary if it reaches this size. Setting a lower value avoids getting stuck with too big a dictionary to be able to prune and slowing things down, but means that the final pruning will less accurately reflect the true corpus stats. Should be considerably higher than limit (if used). Set to 0 to disable. Default: 2M (Gensim's default) | int                             |
 +-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+
-| include   | Ensure that certain words are always included in the vocabulary, even if they don't make it past the various filters, or are never seen in the corpus. Give as a comma-separated list                                                                                                                                                                        | comma-separated list of strings |
-+-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+
-| max_prop  | Include terms that occur in max this proportion of documents                                                                                                                                                                                                                                                                                                 | float                           |
+| threshold | Minimum number of occurrences required of a term to be included                                                                                                                                                                                                                                                                                              | int                             |
 +-----------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------+
 
 Example config
@@ -72,11 +72,11 @@ This example usage includes more options.
    [my_vocab_builder_module]
    type=pimlico.modules.corpora.vocab_builder
    input_text=module_a.some_output
+   include=word1,word2,... 
    limit=10k
-   threshold=100
    oov=value
    prune_at=2000000
-   include=word1,word2,... 
+   threshold=100
 
 Test pipelines
 ==============