Merge branch 'master' into bugfix

persephone-tools · Jul 21, 2018 · 9f12ec6 · 9f12ec6
2 parents dc184cf + 4248eb9
commit 9f12ec6
Show file tree

Hide file tree

Showing 10 changed files with 474 additions and 419 deletions.
diff --git a/changelog.md b/changelog.md
@@ -0,0 +1,26 @@
+# Changelog
+All notable changes to this project will be documented in this file.
+
+The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
+and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).
+
+## [Unreleased]
+
+### Added
+- Changelog
+
+## [0.3.1] - 2018-07-14
+
+### Fixed
+- Documentation for running the tutorial
+- Pathlib handling for parameters
+
+## [0.3.0] - 2018-07-14
+
+### Added
+- More mypy type annotations
+- More test coverage
+
+### Removed
+- Removed `ReadyCorpus` in PR #163 (https://github.com/persephone-tools/persephone/pull/163)
+
diff --git a/persephone/corpus.py b/persephone/corpus.py
@@ -263,7 +263,8 @@ def from_elan(cls: Type[CorpusT], org_dir: Path, tgt_dir: Path,
             raise ValueError("A label segmenter must be provided via label_segmenter")
 
         # In case path is supplied as a string, make it a Path
-        self.tgt_dir = Path(tgt_dir)
+        if isinstance(tgt_dir, str):
+            tgt_dir = Path(tgt_dir)
 
         # Read utterances from org_dir.
         utterances = elan.utterances_from_dir(org_dir,

diff --git a/persephone/experiment.py b/persephone/experiment.py
@@ -5,21 +5,23 @@
 import git
 from git import Repo
 
+from typing import Optional
+
 import persephone
 from . import config
 from . import rnn_ctc
 from .corpus_reader import CorpusReader
 from .utils import is_git_directory_clean
 
-EXP_DIR = config.EXP_DIR
+EXP_DIR = config.EXP_DIR # type: str
 
-def get_exp_dir_num(parent_dir):
+def get_exp_dir_num(parent_dir: str) -> int:
     """ Gets the number of the current experiment directory."""
     return max([int(fn.split(".")[0])
                 for fn in os.listdir(parent_dir) if fn.split(".")[0].isdigit()]
                     + [-1])
 
-def _prepare_directory(directory_path):
+def _prepare_directory(directory_path: str) -> str:
     """
     Prepare the directory structure required for the experiment
     :returns: returns the name of the newly created directory
@@ -31,7 +33,7 @@ def _prepare_directory(directory_path):
         os.makedirs(exp_dir)
     return exp_dir
 
-def prep_sub_exp_dir(parent_dir):
+def prep_sub_exp_dir(parent_dir: str) -> str:
     """ Prepares an experiment subdirectory
     :parent_dir: the parent directory
     :returns: returns the name of the newly created subdirectory

diff --git a/persephone/model.py b/persephone/model.py
@@ -6,14 +6,15 @@
 import os
 from pathlib import Path
 import sys
-from typing import Union, Sequence, Set, List
+from typing import Optional, Union, Sequence, Set, List
 
 import tensorflow as tf
 
 from .preprocess import labels
 from . import utils
 from . import config
 from .exceptions import PersephoneException
+from .corpus_reader import CorpusReader
 
 OPENFST_PATH = config.OPENFST_BIN_PATH
 
@@ -103,8 +104,11 @@ class Model:
         saved_model_path: Path to where the Tensorflow model is being saved on disk.
     """
 
-    def __init__(self, exp_dir, corpus_reader) -> None:
-        self.exp_dir = exp_dir
+    def __init__(self, exp_dir: Union[Path, str], corpus_reader: CorpusReader) -> None:
+        if isinstance(exp_dir, Path):
+            self.exp_dir = str(exp_dir) # type: str
+        else:
+            self.exp_dir = exp_dir # type: str
         self.corpus_reader = corpus_reader
         self.log_softmax = None
         self.batch_x = None
@@ -114,9 +118,9 @@ def __init__(self, exp_dir, corpus_reader) -> None:
         self.ler = None
         self.dense_decoded = None
         self.dense_ref = None
-        self.saved_model_path = None
+        self.saved_model_path = "" # type: str
 
-    def transcribe(self, restore_model_path=None) -> None:
+    def transcribe(self, restore_model_path: Optional[str]=None) -> None:
         """ Transcribes an untranscribed dataset. Similar to eval() except
         no reference transcription is assumed, thus no LER is calculated.
         """
@@ -158,7 +162,7 @@ def transcribe(self, restore_model_path=None) -> None:
                         print(" ".join(hyp), file=hyps_f)
                         print("", file=hyps_f)
 
-    def eval(self, restore_model_path=None) -> None:
+    def eval(self, restore_model_path: Optional[str]=None) -> None:
         """ Evaluates the model on a test set."""
 
         saver = tf.train.Saver()
@@ -197,15 +201,15 @@ def eval(self, restore_model_path=None) -> None:
             with open(os.path.join(hyps_dir, "test_per"), "w") as per_f:
                 print("Test PER: %f, tf LER: %f" % (test_per, test_ler), file=per_f)
 
-    def output_best_scores(self, best_epoch_str):
+    def output_best_scores(self, best_epoch_str: str) -> None:
         """Output best scores to the filesystem"""
         BEST_SCORES_FILENAME = "best_scores.txt"
         with open(os.path.join(self.exp_dir, BEST_SCORES_FILENAME), "w") as best_f:
             print(best_epoch_str, file=best_f, flush=True)
 
     def train(self, early_stopping_steps: int = 10, min_epochs: int = 30,
               max_valid_ler: float = 1.0, max_train_ler: float = 0.3,
-              max_epochs: int = 100, restore_model_path=None) -> None:
+              max_epochs: int = 100, restore_model_path: Optional[str]=None) -> None:
         """ Train the model.
 
             min_epochs: minimum number of epochs to run training for.

diff --git a/persephone/rnn_ctc.py b/persephone/rnn_ctc.py
@@ -23,7 +23,7 @@ def write_desc(self) -> None:
             for key, val in self.__dict__.items():
                 print("%s=%s" % (key, val), file=desc_f)
 
-    def __init__(self, exp_dir, corpus_reader, num_layers: int = 3,
+    def __init__(self, exp_dir: str, corpus_reader, num_layers: int = 3,
                  hidden_size: int=250, beam_width: int = 100,
                  decoding_merge_repeated: bool = True) -> None:
         super().__init__(exp_dir, corpus_reader)
@@ -71,15 +71,15 @@ def __init__(self, exp_dir, corpus_reader, num_layers: int = 3,
                 # For feeding into the next layer
                 layer_input = self.outputs_concat
 
-        self.outputs = tf.reshape(self.outputs_concat, [-1, self.hidden_size*2]) #type: ignore
+        self.outputs = tf.reshape(self.outputs_concat, [-1, self.hidden_size*2]) # pylint: disable=no-member
 
         # Single-variable names are appropriate for weights and biases.
         # pylint: disable=invalid-name
         W = tf.Variable(tf.truncated_normal([hidden_size*2, vocab_size],
                 stddev=np.sqrt(2.0 / (2*hidden_size)))) #type: ignore
         b = tf.Variable(tf.zeros([vocab_size])) #type: ignore
         self.logits = tf.matmul(self.outputs, W) + b #type: ignore
-        self.logits = tf.reshape(self.logits, [batch_size, -1, vocab_size]) #type: ignore
+        self.logits = tf.reshape(self.logits, [batch_size, -1, vocab_size]) # pylint: disable=no-member
         # igormq made it time major, because of an optimization in ctc_loss.
         self.logits = tf.transpose(self.logits, (1, 0, 2), name="logits") #type: ignore
 

diff --git a/stubs/numpy/LICENSE b/stubs/numpy/LICENSE
@@ -0,0 +1,30 @@
+Copyright (c) 2005-2017, NumPy Developers.
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are
+met:
+
+    * Redistributions of source code must retain the above copyright
+       notice, this list of conditions and the following disclaimer.
+
+    * Redistributions in binary form must reproduce the above
+       copyright notice, this list of conditions and the following
+       disclaimer in the documentation and/or other materials provided
+       with the distribution.
+
+    * Neither the name of the NumPy Developers nor the names of any
+       contributors may be used to endorse or promote products derived
+       from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+"AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.