Merge remote-tracking branch 'origin/master'
# Conflicts:
#	README.rst
#	persephone/experiment.py
#	persephone/model.py
oadams committed Mar 31, 2018
2 parents 5f4ee1e + 8ca2bbb commit feaacde
Showing 22 changed files with 331 additions and 493 deletions.
363 changes: 8 additions & 355 deletions README.rst

Large diffs are not rendered by default.

78 changes: 75 additions & 3 deletions docs/api.rst
@@ -1,19 +1,91 @@
The Persephone API
==================

In this section we discuss the application programming interface (API) exposed by
Persephone. We begin with descriptions of the fundamental classes included in
the tool. Model training pipelines are described by instantiating these
classes. Consider the following example for a preliminary look at how this
works::

    # Create a corpus from data that has already been preprocessed.
    # Among other things, this will divide the corpus into training,
    # validation and test sets.
    from persephone.corpus import Corpus, ReadyCorpus
    corpus = ReadyCorpus(tgt_dir="/path/to/preprocessed/data",
                         feat_type="fbank",
                         label_type="phonemes")

    # Create an object that reads the corpus data in batches.
    from persephone.corpus_reader import CorpusReader
    corpus_reader = CorpusReader(corpus, batch_size=64)

    # Create a neural network model (LSTM/CTC model) and train
    # it on the corpus.
    from persephone.rnn_ctc import Model
    model = Model("/path/to/experiment/directory",
                  corpus_reader,
                  num_layers=3,
                  num_hidden=250)
    model.train()

This will train and evaluate a model, storing information related to the
specific experiment in ``/path/to/experiment/directory``.

In the next section we take a closer look at the classes that comprise this
example, and reveal additional functionality, such as loading speech and
transcriptions from `ELAN <https://tla.mpi.nl/tools/tla-tools/elan/>`_ files
and specifying how the raw transcription text is preprocessed.

On the horizon, but not yet implemented, is a way of describing these
pipelines and the interaction between classes that is compatible with the
YAML files of the `eXtensible Neural Machine Translation toolkit (XNMT)
<https://github.com/neulab/xnmt>`_.

Fundamental classes
-------------------

The four key classes are the `Utterance`, `Corpus`, `CorpusReader`, and `Model`
classes. `Utterance` instances make up `Corpus` instances, which are loaded by
`CorpusReader` instances and fed into `Model` instances.
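
This composition can be pictured with plain-Python stand-ins (a hypothetical
sketch for illustration only, not the real persephone classes or their actual
signatures):

```python
from typing import List, NamedTuple

# Hypothetical stand-ins that mirror the composition described above;
# the real persephone classes have richer constructors and behaviour.
class Utterance(NamedTuple):
    wav_path: str
    labels: List[str]

class Corpus:
    """Holds Utterance instances (the real class also splits them
    into training, validation and test sets)."""
    def __init__(self, utterances: List[Utterance]):
        self.utterances = utterances

class CorpusReader:
    """Yields the corpus in fixed-size batches."""
    def __init__(self, corpus: Corpus, batch_size: int):
        self.corpus = corpus
        self.batch_size = batch_size

    def batches(self):
        utts = self.corpus.utterances
        for i in range(0, len(utts), self.batch_size):
            yield utts[i:i + self.batch_size]

class Model:
    """Consumes batches produced by a CorpusReader."""
    def __init__(self, corpus_reader: CorpusReader):
        self.corpus_reader = corpus_reader

    def train(self):
        # Stand-in "training": just count the batches consumed.
        return sum(1 for _ in self.corpus_reader.batches())

corpus = Corpus([Utterance(f"wav/utt_{i}.wav", ["l", "e"]) for i in range(5)])
model = Model(CorpusReader(corpus, batch_size=2))
num_batches = model.train()  # 5 utterances in batches of 2 -> 3 batches
```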

.. autoclass:: persephone.utterance.Utterance

.. autoclass:: persephone.corpus.Corpus
   :members: __init__, from_elan

.. There is support for creating Corpus objects from ELAN files::

       # Create a corpus from ELAN input files.
       from persephone.corpus import Corpus
       corpus = Corpus.from_elan(org_dir="/path/to/input/data",
                                 tgt_dir="/path/to/preprocessed/data",
                                 utterance_filter=function_to_call,
                                 label_segmenter=something,
                                 tier_prefixes=("xv", "rf"))

.. autoclass:: persephone.corpus.ReadyCorpus
   :members: __init__, determine_labels

.. autoclass:: persephone.corpus_reader.CorpusReader
   :members: __init__

.. autoclass:: persephone.model.Model
   :members: __init__, train, transcribe

Preprocessing
-------------

.. autofunction:: persephone.preprocess.elan.utterances_from_dir
.. autoclass:: persephone.preprocess.labels.LabelSegmenter
.. autofunction:: persephone.preprocess.wav.extract_wavs

Models
------

.. autoclass:: persephone.rnn_ctc.Model
   :members:

Distance measurements
---------------------

1 change: 1 addition & 0 deletions docs/doc-requirements.txt
@@ -1,2 +1,3 @@
sphinx-autodoc-typehints==1.2.5
sphinx_rtd_theme
persephone
21 changes: 0 additions & 21 deletions docs/logging.ini

This file was deleted.

60 changes: 27 additions & 33 deletions docs/quickstart.rst
@@ -51,7 +51,7 @@ Ensure Python 3 is installed.

You will also need to install some system dependencies. For your
convenience we have an install script for dependencies for Ubuntu. To
install the Ubuntu binaries, run ``./ubuntu_bootstrap.sh`` to install
ffmpeg packages. On MacOS we suggest installing via Homebrew with
``brew install ffmpeg``.

@@ -177,34 +177,24 @@ recognition systems. Providing a simple and flexible interface to your
data is currently the most important priority for Persephone. This is a
work in progress.

Current data formatting requirements:

* Audio files are stored in ``<your-corpus>/wav/``. The WAV format is supported. Persephone will automatically convert wavs to be 16bit mono 16000Hz.

* Transcriptions are stored in text files in ``<your-corpus>/label/``

* Each audio file is short (ideally no longer than 10 seconds). There is a script added by Ben Foley, ``persephone/scripts/split_eafs.py``, to split audio files into utterance-length units based on ELAN input files.

* Each audio file in ``wav/`` has a corresponding transcription file in ``label/`` with the same *prefix* (the bit of the filename before the extension). For example, if there is ``wav/utterance_one.wav`` then there should be ``label/utterance_one.<extension>``. ``<extension>`` can be whatever you want, but it should describe how the labelling is done. For example, if it is phonemic then ``wav/utterance_one.phonemes`` is a meaningful filename.

* Each transcript file includes a space-delimited list of *labels* that the model should learn to transcribe. For example:

- ``data/na_example/label/crdo-NRU_F4_ACCOMP_PFV.0.phonemes`` contains ``l e dz ɯ z e l e dz ɯ z e``
- ``data/na_example/label/crdo-NRU_F4_ACCOMP_PFV.0.phonemes_and_tones`` might contain: ``l e ˧ dz ɯ ˥ z e ˩ | l e ˧ dz ɯ ˥ z e ˩``

* Persephone is agnostic to what your chosen labels are. It simply tries to figure out how to map speech to that labelling. These labels can be multiple characters long: the spaces demarcate labels. Labels can be any unicode character(s).

* Spaces are used to delimit the units that the tool predicts. Typically these units are phonemes or tones, however they could also just be orthographic characters (though performance is likely to be a bit lower: consider trying to transcribe "$100"). The model can't tell the difference between digraphs and unigraphs as long as they're tokenized in this format, demarcated with spaces.
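
These layout and labelling conventions can be sanity-checked with a short
stdlib-only script. This is a sketch under the assumptions listed above
(``check_corpus_layout`` is a hypothetical helper, not part of persephone;
persephone's own loading code is authoritative):

```python
import tempfile
from pathlib import Path

def check_corpus_layout(corpus_dir: Path):
    """Ensure every file in wav/ has a label/ file with the same prefix,
    and return the set of labels found across all transcription files."""
    wav_prefixes = {p.stem for p in (corpus_dir / "wav").glob("*.wav")}
    label_files = list((corpus_dir / "label").glob("*"))
    label_prefixes = {p.stem for p in label_files}
    missing = wav_prefixes - label_prefixes
    if missing:
        raise ValueError(f"wav files with no transcription: {missing}")
    # Labels are whatever space-delimited tokens appear in the files;
    # multi-character tokens such as "dz" count as single labels.
    labels = set()
    for label_file in label_files:
        labels.update(label_file.read_text(encoding="utf-8").split())
    return labels

# Demonstrate on a throwaway corpus directory.
root = Path(tempfile.mkdtemp())
(root / "wav").mkdir()
(root / "label").mkdir()
(root / "wav" / "utterance_one.wav").touch()
(root / "label" / "utterance_one.phonemes").write_text("l e dz ɯ z e",
                                                       encoding="utf-8")
labels = check_corpus_layout(root)
```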

If your data observes this format then you can load it via the
``ReadyCorpus`` class. If your data does not observe this format, you
@@ -248,11 +238,15 @@ the available utterances in neither of these text files.
On choosing an appropriate label granularity
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^

Question:

Suprasegmentals like tone, glottalization, nasalization, and
length are all phonemic in the language I am using. Do they belong in
one grouping or separately?

Answer:

I'm wary of making sweeping claims about the best approach to
handle all these sorts of phenomena that will realise themselves
differently between languages, since I'm neither a linguist nor do I
have a strong understanding of what features the model will learn each
5 changes: 0 additions & 5 deletions docs/settings.ini

This file was deleted.

14 changes: 14 additions & 0 deletions persephone/__init__.py
@@ -1 +1,15 @@
__version__ = "0.2.0"

import sys
import logging

def handle_unhandled_exception(exc_type, exc_value, exc_traceback):
    """Handler for unhandled exceptions that will write to the logs"""
    if issubclass(exc_type, KeyboardInterrupt):
        # call the default excepthook saved at __excepthook__
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return
    logger = logging.getLogger(__name__)  # type: ignore
    logger.critical("Unhandled exception", exc_info=(exc_type, exc_value, exc_traceback))

sys.excepthook = handle_unhandled_exception
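
The hook added here follows the standard ``sys.excepthook`` protocol. A
self-contained check of the same logic (a stand-alone copy for illustration;
it does not import persephone, and the handler class is ours):

```python
import logging
import sys

records = []

class ListHandler(logging.Handler):
    """Collects log records so the hook's behaviour can be inspected."""
    def emit(self, record):
        records.append(record)

logger = logging.getLogger("excepthook_demo")
logger.addHandler(ListHandler())
logger.propagate = False  # keep the demo quiet on stderr

def handle_unhandled_exception(exc_type, exc_value, exc_traceback):
    if issubclass(exc_type, KeyboardInterrupt):
        # Defer to the interpreter's original hook for Ctrl-C.
        sys.__excepthook__(exc_type, exc_value, exc_traceback)
        return
    logger.critical("Unhandled exception",
                    exc_info=(exc_type, exc_value, exc_traceback))

# Simulate what the interpreter would do with an uncaught ValueError.
try:
    raise ValueError("boom")
except ValueError:
    handle_unhandled_exception(*sys.exc_info())
```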
7 changes: 3 additions & 4 deletions persephone/config.py
@@ -10,7 +10,7 @@
"""
import configparser
import os
from pkg_resources import Requirement, resource_filename

config_file = configparser.ConfigParser()
config_file.read('settings.ini')
@@ -23,9 +23,9 @@

# For Kunwinjku data:
BKW_PATH = config_file.get("PATHS", "BKW_PATH",
                           fallback=os.path.join(CORPORA_BASE_PATH, "BKW-speaker-ids_2/"))
EN_WORDS_PATH = config_file.get("PATHS", "EN_WORDS_PATH",
                                fallback=os.path.join(CORPORA_BASE_PATH, "english-words/words.txt"))

# The directory where the preprocessed data will be held.
TGT_DIR = config_file.get("PATHS", "TARGET", fallback="./data")
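
The ``fallback=`` arguments used throughout this module mean that a missing
``settings.ini``, section, or key silently yields the default value. A minimal
stdlib illustration of that behaviour (generic key names, not persephone's):

```python
import configparser
import os
import tempfile

config_file = configparser.ConfigParser()
# Reading a path that does not exist is not an error; read() simply
# returns an empty list of successfully parsed files.
missing_ini = os.path.join(tempfile.mkdtemp(), "settings.ini")
read_ok = config_file.read(missing_ini)

# With no [PATHS] section present, the fallback value is returned.
target = config_file.get("PATHS", "TARGET", fallback="./data")
```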
@@ -48,7 +48,6 @@
OPENFST_BIN_PATH = config_file.get("PATHS", "OPEN_FST_BIN_PATH", fallback="/home/oadams/tools/openfst-1.6.2/src/bin")

# Fetch the path of the logging.ini file installed by setuptools.
logging_ini_path = resource_filename(Requirement.parse("persephone"), "persephone/logging.ini")

LOGGING_INI_PATH = config_file.get("PATHS", "log_ini_path", fallback=logging_ini_path)
11 changes: 6 additions & 5 deletions persephone/context_manager.py
@@ -2,14 +2,15 @@

import os


class cd:
    """Context manager for changing the current working directory"""
    def __init__(self, new_path):
        self.new_path = os.path.expanduser(new_path)

    def __enter__(self):
        self.saved_path = os.getcwd()
        os.chdir(self.new_path)

    def __exit__(self, etype, value, traceback):
        os.chdir(self.saved_path)
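
A usage sketch of this context manager (the class is reproduced here so the
example stands alone, using the renamed ``new_path``/``saved_path`` attributes):

```python
import os
import tempfile

class cd:
    """Context manager for changing the current working directory."""
    def __init__(self, new_path):
        self.new_path = os.path.expanduser(new_path)

    def __enter__(self):
        self.saved_path = os.getcwd()
        os.chdir(self.new_path)

    def __exit__(self, etype, value, traceback):
        # Restore the original directory even if the body raised.
        os.chdir(self.saved_path)

before = os.getcwd()
target = tempfile.mkdtemp()
with cd(target):
    inside = os.getcwd()   # now inside the temporary directory
after = os.getcwd()        # back where we started
```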
