Skip to content

Commit

Permalink
Added documentation
Browse files Browse the repository at this point in the history
  • Loading branch information
kdavis-mozilla committed Dec 10, 2018
1 parent 7dc160c commit 6782024
Show file tree
Hide file tree
Showing 20 changed files with 197 additions and 53 deletions.
18 changes: 0 additions & 18 deletions docs/index.rst
Expand Up @@ -4,24 +4,6 @@ CorporaCreator

This is the documentation of **CorporaCreator**.

.. note::

This is the main page of your project's `Sphinx`_ documentation.
It is formatted in `reStructuredText`_. Add additional pages
by creating rst-files in ``docs`` and adding them to the `toctree`_ below.
Use then `references`_ in order to link them from this page, e.g.
:ref:`authors` and :ref:`changes`.

It is also possible to refer to the documentation of other Python packages
with the `Python domain syntax`_. By default you can reference the
documentation of `Sphinx`_, `Python`_, `NumPy`_, `SciPy`_, `matplotlib`_,
`Pandas`_, `Scikit-Learn`_. You can add more by extending the
``intersphinx_mapping`` in your Sphinx's ``conf.py``.

The pretty useful extension `autodoc`_ is activated by default and lets
you include documentation from docstrings. Docstrings can be written in
`Google style`_ (recommended!), `NumPy style`_ and `classical style`_.


Contents
========
Expand Down
2 changes: 1 addition & 1 deletion src/corporacreator/argparse.py
Expand Up @@ -28,7 +28,7 @@ def parse_args(args):
"""Parse command line parameters
Args:
args ([str]): command line parameters as list of strings
args ([str]): Command line parameters as list of strings
Returns:
:obj:`argparse.Namespace`: command line parameters namespace
Expand Down
16 changes: 16 additions & 0 deletions src/corporacreator/corpora.py
Expand Up @@ -11,11 +11,22 @@


class Corpora:
"""Corpora representing all Common Voice datasets.
Args:
args ([str]): Command line parameters as list of strings
Attributes:
args ([str]): Command line parameters as list of strings
corpora ([:class:`corporacreator.Corpus`]): List of :class:`corporacreator.Corpus` instances
"""
def __init__(self, args):
self.args = args
self.corpora = []

def create(self):
"""Creates a :class:`corporacreator.Corpus` for each locale.
"""
_logger.info("Creating corpora...")
corpora_data = self._parse_tsv()
for locale in corpora_data.locale.unique():
Expand Down Expand Up @@ -56,6 +67,11 @@ def _parse_tsv(self):
return corpora_data

def save(self, directory):
"""Saves this :class:`corporacreator.Corpora` in `directory`.
Args:
directory (str): Directory into which this :class:`corporacreator.Corpora` is saved.
"""
if not os.path.exists(directory):
os.mkdir(directory)
_logger.info("Saving corpora...")
Expand Down
20 changes: 19 additions & 1 deletion src/corporacreator/corpus.py
Expand Up @@ -10,17 +10,30 @@
_logger = logging.getLogger(__name__)

class Corpus:
"""Corpus representing a Common Voice dataset for a given locale.
Args:
args ([str]): Command line parameters as list of strings
locale (str): Locale this :class:`corporacreator.Corpus` represents
corpus_data (:class:`pandas.DataFrame`): `pandas.DataFrame` containing the corpus data
Attributes:
args ([str]): Command line parameters as list of strings
locale (str): Locale of this :class:`corporacreator.Corpus`
corpus_data (:class:`pandas.DataFrame`): `pandas.DataFrame` containing the corpus data
"""
def __init__(self, args, locale, corpus_data):
self.args = args
self.locale = locale
self.corpus_data = corpus_data

def create(self):
"""Creates a :class:`corporacreator.Corpus` for `self.locale`.
"""
_logger.debug("Creating %s corpus..." % self.locale)
self._pre_process_corpus_data()
self._partition_corpus_data()
self._post_process_valid_data()
# Do it here....
_logger.debug("Created %s corpora." % self.locale)

def _pre_process_corpus_data(self):
Expand Down Expand Up @@ -74,6 +87,11 @@ def _calculate_data_set_sizes(self, total_size):
return train_size, dev_size, test_size

def save(self, directory):
"""Saves this :class:`corporacreator.Corpus` in `directory`.
Args:
directory (str): Directory into which this :class:`corporacreator.Corpus` is saved.
"""
directory = os.path.join(directory, self.locale)
if not os.path.exists(directory):
os.mkdir(directory)
Expand Down
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/br.py
@@ -1,5 +1,13 @@
import pandas as pd

def br(corpus_data):
def br(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up br data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/ca.py
@@ -1,5 +1,13 @@
import pandas as pd

def ca(corpus_data):
def ca(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up ca data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/cv.py
@@ -1,5 +1,13 @@
import pandas as pd

def cv(corpus_data):
def cv(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up cv data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/cy.py
@@ -1,5 +1,13 @@
import pandas as pd

def cy(corpus_data):
def cy(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up cy data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/de.py
@@ -1,5 +1,13 @@
import pandas as pd

def de(corpus_data):
def de(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up de data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/en.py
@@ -1,5 +1,13 @@
import pandas as pd

def en(corpus_data):
def en(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up en data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/fr.py
@@ -1,5 +1,13 @@
import pandas as pd

def fr(corpus_data):
def fr(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up fr data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/gaIE.py
@@ -1,5 +1,13 @@
import pandas as pd

def gaIE(corpus_data):
def gaIE(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up ga-IE data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/it.py
@@ -1,5 +1,13 @@
import pandas as pd

def it(corpus_data):
def it(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up it data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/kab.py
@@ -1,5 +1,13 @@
import pandas as pd

def kab(corpus_data):
def kab(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up kab data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/ky.py
@@ -1,5 +1,13 @@
import pandas as pd

def ky(corpus_data):
def ky(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up ky data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/sl.py
@@ -1,5 +1,13 @@
import pandas as pd

def sl(corpus_data):
def sl(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up sl data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/tr.py
@@ -1,5 +1,13 @@
import pandas as pd

def tr(corpus_data):
def tr(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up tr data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/tt.py
@@ -1,5 +1,13 @@
import pandas as pd

def tt(corpus_data):
def tt(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up tt data
return corpus_data
return sentence
12 changes: 10 additions & 2 deletions src/corporacreator/preprocessors/zhTW.py
@@ -1,5 +1,13 @@
import pandas as pd

def zhTW(corpus_data):
def zhTW(sentence):
"""Cleans up the passed sentence, removing or reformatting invalid data.
Args:
sentence (str): Sentence to be cleaned up.
Returns:
(str): Cleaned up sentence.
"""
# TODO: Clean up zh-TW data
return corpus_data
return sentence
14 changes: 11 additions & 3 deletions src/corporacreator/statistics.py
@@ -1,7 +1,15 @@
def sample_size(train_size):
z_score = 2.58 # Corresponds to confidence level 99%
def sample_size(population_size):
"""Calculates the sample size.
Calculates the sample size required when drawing from a population of size
`population_size` with a confidence level of 99% and a margin of error of 1%.
Args:
population_size (int): The population size to draw from.
Returns:
(float): The required sample size; the value is not rounded to an integer.
"""
margin_of_error = 0.01
fraction_picking = 0.50
z_score = 2.58 # Corresponds to confidence level 99%
numerator = (z_score**2 * fraction_picking * (1 - fraction_picking)) / (margin_of_error**2)
denominator = 1 + (z_score**2 * fraction_picking * (1 - fraction_picking)) / (margin_of_error**2 * train_size)
denominator = 1 + (z_score**2 * fraction_picking * (1 - fraction_picking)) / (margin_of_error**2 * population_size)
return numerator / denominator

0 comments on commit 6782024

Please sign in to comment.