Added documentation

common-voice · Dec 10, 2018 · 6782024 · 6782024
1 parent 7dc160c
commit 6782024
Show file tree

Hide file tree

Showing 20 changed files with 197 additions and 53 deletions.
diff --git a/docs/index.rst b/docs/index.rst
@@ -4,24 +4,6 @@ CorporaCreator
 
 This is the documentation of **CorporaCreator**.
 
-.. note::
-
-    This is the main page of your project's `Sphinx`_ documentation.
-    It is formatted in `reStructuredText`_. Add additional pages
-    by creating rst-files in ``docs`` and adding them to the `toctree`_ below.
-    Use then `references`_ in order to link them from this page, e.g.
-    :ref:`authors` and :ref:`changes`.
-
-    It is also possible to refer to the documentation of other Python packages
-    with the `Python domain syntax`_. By default you can reference the
-    documentation of `Sphinx`_, `Python`_, `NumPy`_, `SciPy`_, `matplotlib`_,
-    `Pandas`_, `Scikit-Learn`_. You can add more by extending the
-    ``intersphinx_mapping`` in your Sphinx's ``conf.py``.
-
-    The pretty useful extension `autodoc`_ is activated by default and lets
-    you include documentation from docstrings. Docstrings can be written in
-    `Google style`_ (recommended!), `NumPy style`_ and `classical style`_.
-
 
 Contents
 ========

diff --git a/src/corporacreator/argparse.py b/src/corporacreator/argparse.py
@@ -28,7 +28,7 @@ def parse_args(args):
     """Parse command line parameters
 
     Args:
-      args ([str]): command line parameters as list of strings
+      args ([str]): Command line parameters as list of strings
 
     Returns:
       :obj:`argparse.Namespace`: command line parameters namespace

diff --git a/src/corporacreator/corpora.py b/src/corporacreator/corpora.py
@@ -11,11 +11,22 @@
 
 
 class Corpora:
+    """Corpora representing all Common Voice datasets.
+
+    Args:
+      args ([str]): Command line parameters as list of strings
+
+    Attributes:
+        args ([str]): Command line parameters as list of strings
+        corpora ([:class:`corporacreator.Corpus`]): List of :class:`corporacreator.Corpus` instances
+    """
     def __init__(self, args):
         self.args = args
         self.corpora = []
 
     def create(self):
+        """Creates a :class:`corporacreator.Corpus` for each locale.
+        """
         _logger.info("Creating corpora...")
         corpora_data = self._parse_tsv()
         for locale in corpora_data.locale.unique():
@@ -56,6 +67,11 @@ def _parse_tsv(self):
         return corpora_data
 
     def save(self, directory):
+        """Saves this :class:`corporacreator.Corpora` in `directory`.
+
+        Args:
+          directory (str): Directory into which this :class:`corporacreator.Corpora` is saved.
+        """
         if not os.path.exists(directory):
             os.mkdir(directory)
         _logger.info("Saving corpora...")

diff --git a/src/corporacreator/corpus.py b/src/corporacreator/corpus.py
@@ -10,17 +10,30 @@
 _logger = logging.getLogger(__name__)
 
 class Corpus:
+    """Corpus representing a Common Voice dataset for a given locale.
+
+    Args:
+      args ([str]): Command line parameters as list of strings
+      locale (str): Locale this :class:`corporacreator.Corpus` represents
+      corpus_data (:class:`pandas.DataFrame`): `pandas.DataFrame` containing the corpus data
+
+    Attributes:
+        args ([str]): Command line parameters as list of strings
+        locale (str): Locale of this :class:`corporacreator.Corpus`
+        corpus_data (:class:`pandas.DataFrame`): `pandas.DataFrame` containing the corpus data
+    """
     def __init__(self, args, locale, corpus_data):
         self.args = args
         self.locale = locale
         self.corpus_data = corpus_data
 
     def create(self):
+        """Creates a :class:`corporacreator.Corpus` for `self.locale`.
+        """
         _logger.debug("Creating %s corpus..." % self.locale)
         self._pre_process_corpus_data()
         self._partition_corpus_data()
         self._post_process_valid_data()
-        # Do it here....
         _logger.debug("Created %s corpora." % self.locale)
 
     def _pre_process_corpus_data(self):
@@ -74,6 +87,11 @@ def _calculate_data_set_sizes(self, total_size):
         return train_size, dev_size, test_size
 
     def save(self, directory):
+        """Saves this :class:`corporacreator.Corpus` in `directory`.
+
+        Args:
+          directory (str): Directory into which this :class:`corporacreator.Corpus` is saved.
+        """
         directory = os.path.join(directory, self.locale)
         if not os.path.exists(directory):
             os.mkdir(directory)

diff --git a/src/corporacreator/preprocessors/br.py b/src/corporacreator/preprocessors/br.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def br(corpus_data):
+def br(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up br data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/ca.py b/src/corporacreator/preprocessors/ca.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def ca(corpus_data):
+def ca(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up ca data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/cv.py b/src/corporacreator/preprocessors/cv.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def cv(corpus_data):
+def cv(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up cv data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/cy.py b/src/corporacreator/preprocessors/cy.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def cy(corpus_data):
+def cy(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up cy data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/de.py b/src/corporacreator/preprocessors/de.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def de(corpus_data):
+def de(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up de data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/en.py b/src/corporacreator/preprocessors/en.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def en(corpus_data):
+def en(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up en data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/fr.py b/src/corporacreator/preprocessors/fr.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def fr(corpus_data):
+def fr(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up fr data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/gaIE.py b/src/corporacreator/preprocessors/gaIE.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def gaIE(corpus_data):
+def gaIE(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up ga-IE data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/it.py b/src/corporacreator/preprocessors/it.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def it(corpus_data):
+def it(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up it data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/kab.py b/src/corporacreator/preprocessors/kab.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def kab(corpus_data):
+def kab(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up kab data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/ky.py b/src/corporacreator/preprocessors/ky.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def ky(corpus_data):
+def ky(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up ky data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/sl.py b/src/corporacreator/preprocessors/sl.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def sl(corpus_data):
+def sl(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up sl data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/tr.py b/src/corporacreator/preprocessors/tr.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def tr(corpus_data):
+def tr(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up tr data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/tt.py b/src/corporacreator/preprocessors/tt.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def tt(corpus_data):
+def tt(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up tt data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/preprocessors/zhTW.py b/src/corporacreator/preprocessors/zhTW.py
@@ -1,5 +1,13 @@
 import pandas as pd
 
-def zhTW(corpus_data):
+def zhTW(sentence):
+    """Cleans up the passed sentence, removing or reformatting invalid data.
+
+    Args:
+      sentence (str): Sentence to be cleaned up.
+
+    Returns:
+      (str): Cleaned up sentence.
+    """
     # TODO: Clean up zh-TW data
-    return corpus_data
+    return sentence
diff --git a/src/corporacreator/statistics.py b/src/corporacreator/statistics.py
@@ -1,7 +1,15 @@
-def sample_size(train_size):
-    z_score = 2.58 # Corresponds to confidence level 99%
+def sample_size(population_size):
+    """Calculates the sample size.
+
+    Calculates the sample size required to draw from a population of size `population_size`
+    with a confidence level of 99% and a margin of error of 1%.
+
+    Args:
+      population_size (int): The population size to draw from.
+    """
     margin_of_error = 0.01
     fraction_picking = 0.50
+    z_score = 2.58 # Corresponds to confidence level 99%
     numerator = (z_score**2 * fraction_picking * (1 - fraction_picking)) / (margin_of_error**2)
-    denominator = 1 + (z_score**2 * fraction_picking * (1 - fraction_picking)) / (margin_of_error**2 * train_size)
+    denominator = 1 + (z_score**2 * fraction_picking * (1 - fraction_picking)) / (margin_of_error**2 * population_size)
     return numerator / denominator