Commit cfe5096
Merge pull request #147 from customprogrammingsolutions/refactor-model
[MRG] Clean up Model
oadams committed May 28, 2018
2 parents aa69575 + 4da960c commit cfe5096
Showing 13 changed files with 383 additions and 196 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -6,6 +6,7 @@ install:
- pip install .
- pip install pytest-cov
- pip install python-coveralls
+  - pip install -r test_requirements.txt
script:
- pylint -E persephone
- mypy persephone
17 changes: 12 additions & 5 deletions persephone/corpus.py
@@ -81,6 +81,8 @@ def __init__(self, feat_type, label_type, tgt_dir, labels,
included in the corpus.
"""
+        if speakers:
+            raise NotImplementedError("Speakers not implemented")

logger.debug("Creating a new Corpus object with feature type %s, label type %s,"
"target directory %s, label set %s, max_samples %d, speakers %s",
@@ -151,29 +153,34 @@ def from_elan(cls: Type[CorpusT], org_dir: Path, tgt_dir: Path,
codeswitched utterances.
            label_segmenter: An object that has an attribute `segment_labels`,
                which creates new `Utterance` instances from old ones,
-               by segmenting the tokens in their `text attribute. Note,
+               by segmenting the tokens in their `text` attribute. Note,
                `LabelSegmenter` might be better as a function, the only issue
                is it needs to carry with it a list of labels. This could
                potentially be a function attribute.
-           speakers: A list of speakers to filter for. If None, utterances
+           speakers: A list of speakers to filter for. If `None`, utterances
                from all speakers are included.
            tier_prefixes: A collection of strings that prefix ELAN tiers to
-               filter for. For example, if this is ("xv", "rf"), then tiers
+               filter for. For example, if this is `("xv", "rf")`, then tiers
                named "xv", "xv@Mark", "rf@Rose" would be extracted if they
                existed.
"""
+        # This currently bails out if label_segmenter is not provided
+        if not label_segmenter:
+            raise ValueError("A label segmenter must be provided via label_segmenter")

# Read utterances from org_dir.
utterances = elan.utterances_from_dir(org_dir,
tier_prefixes=tier_prefixes)

# Filter utterances based on some criteria (such as codeswitching).
-        utterances = [utter for utter in utterances if utterance_filter(utter)]
+        if utterance_filter:
+            utterances = [utter for utter in utterances if utterance_filter(utter)]
utterances = utterance.remove_duplicates(utterances)

# Segment the labels in the utterances appropriately
-        utterances = [label_segmenter.segment_labels(utter) for utter in utterances]
+        if label_segmenter:
+            utterances = [label_segmenter.segment_labels(utter) for utter in utterances]

# Remove utterances without transcriptions.
utterances = utterance.remove_empty_text(utterances)
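Note: with this hunk, `label_segmenter` is now mandatory while `utterance_filter` and `speakers` are optional. A minimal sketch of the resulting caller contract follows; the segmenter class, the paths, the import location of `Utterance`, and the assumption that `Utterance` is a NamedTuple are all illustrative, not taken from this PR:

    from pathlib import Path

    from persephone.corpus import Corpus
    from persephone.utterance import Utterance  # assumed location of Utterance

    class CharSegmenter:
        """Hypothetical segmenter: splits transcriptions into characters."""
        labels = {"a", "b", "c"}  # the label inventory the segmenter carries

        @staticmethod
        def segment_labels(utter: Utterance) -> Utterance:
            # Assumes Utterance is a NamedTuple with a `text` field.
            return utter._replace(text=" ".join(utter.text))

    # label_segmenter is now required; omitting it raises ValueError.
    corpus = Corpus.from_elan(org_dir=Path("elan_recordings/"),  # hypothetical paths
                              tgt_dir=Path("preprocessed/"),
                              label_segmenter=CharSegmenter)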
4 changes: 3 additions & 1 deletion persephone/corpus_reader.py
@@ -24,7 +24,9 @@ class CorpusReader:
rand = True

def __init__(self, corpus, num_train=None, batch_size=None, max_samples=None, rand_seed=0):
""" corpus: The Corpus object that interfaces with a given corpus.
""" Construct a new `CorpusReader` instance.
corpus: The Corpus object that interfaces with a given corpus.
num_train: The number of training instances from the corpus used.
batch_size: The size of the batches to yield. If None, then it is
num_train / 32.0.
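Note: the fallback the docstring describes amounts to roughly the following (a sketch; the exact rounding and clamping live in corpus_reader.py):

    def default_batch_size(num_train: int) -> int:
        # The documented fallback: if batch_size is None, use num_train / 32.0,
        # kept at least 1 so tiny corpora still yield a batch.
        return max(1, int(num_train / 32.0))

    print(default_batch_size(2048))  # 64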
350 changes: 186 additions & 164 deletions persephone/model.py

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion persephone/results.py
@@ -18,6 +18,10 @@ def filter_labels(sent: Sequence[str], labels: Set[str] = None) -> List[str]:
def filtered_error_rate(hyps_path: Union[str, Path], refs_path: Union[str, Path], labels: Set[str]) -> float:
""" Returns the error rate of hypotheses in hyps_path against references in refs_path after filtering only for labels in labels.
"""
+    if isinstance(hyps_path, Path):
+        hyps_path = str(hyps_path)
+    if isinstance(refs_path, Path):
+        refs_path = str(refs_path)

with open(hyps_path) as hyps_f:
lines = hyps_f.readlines()
@@ -29,8 +33,9 @@ def filtered_error_rate(hyps_path: Union[str, Path], refs_path: Union[str, Path], labels: Set[str]) -> float:
# For the case where there are no tokens left after filtering.
only_empty = True
    for entry in hyps:
-        if entry != []:
+        if entry is not []:
             only_empty = False
+            break # found something so can move on immediately
if only_empty:
return -1

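Note: one thing worth flagging in the hunk above is that `entry is not []` tests object identity against a freshly built list, so it is True for every entry, empty or not; only `entry != []` (or a truthiness check) compares contents. A quick illustration:

    entry = []
    print(entry != [])      # False: contents are equal
    print(entry is not [])  # True: two distinct list objects
    print(bool(entry))      # False: the idiomatic emptiness test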
27 changes: 14 additions & 13 deletions persephone/rnn_ctc.py
@@ -15,16 +15,17 @@ def lstm_cell(hidden_size):
class Model(model.Model):
""" An acoustic model with a LSTM/CTC architecture. """

-    def write_desc(self):
+    def write_desc(self) -> None:
""" Writes a description of the model to the exp_dir. """

path = os.path.join(self.exp_dir, "model_description.txt")
with open(path, "w") as desc_f:
for key, val in self.__dict__.items():
print("%s=%s" % (key, val), file=desc_f)

-    def __init__(self, exp_dir, corpus_reader, num_layers=3,
-                 hidden_size=250, beam_width=100, decoding_merge_repeated=True):
+    def __init__(self, exp_dir, corpus_reader, num_layers: int = 3,
+                 hidden_size: int=250, beam_width: int = 100,
+                 decoding_merge_repeated: bool = True) -> None:
super().__init__(exp_dir, corpus_reader)

if not os.path.isdir(exp_dir):
@@ -55,7 +56,7 @@ def __init__(self, exp_dir, corpus_reader, num_layers=3,

for i in range(num_layers):

-            with tf.variable_scope("layer_%d" % i):
+            with tf.variable_scope("layer_%d" % i): #type: ignore

cell_fw = lstm_cell(self.hidden_size)
cell_bw = lstm_cell(self.hidden_size)
@@ -65,22 +66,22 @@ def __init__(self, exp_dir, corpus_reader, num_layers=3,
time_major=False)

# Self outputs now becomes [batch_num, time, hidden_size*2]
-                self.outputs_concat = tf.concat((self.out_fw, self.out_bw), 2)
+                self.outputs_concat = tf.concat((self.out_fw, self.out_bw), 2) #type: ignore

# For feeding into the next layer
layer_input = self.outputs_concat

-        self.outputs = tf.reshape(self.outputs_concat, [-1, self.hidden_size*2])
+        self.outputs = tf.reshape(self.outputs_concat, [-1, self.hidden_size*2]) #type: ignore

        # Single-variable names are appropriate for weights and biases.
# pylint: disable=invalid-name
        W = tf.Variable(tf.truncated_normal([hidden_size*2, vocab_size],
-                       stddev=np.sqrt(2.0 / (2*hidden_size))))
-       b = tf.Variable(tf.zeros([vocab_size]))
-       self.logits = tf.matmul(self.outputs, W) + b
-       self.logits = tf.reshape(self.logits, [batch_size, -1, vocab_size])
+                       stddev=np.sqrt(2.0 / (2*hidden_size)))) #type: ignore
+       b = tf.Variable(tf.zeros([vocab_size])) #type: ignore
+       self.logits = tf.matmul(self.outputs, W) + b #type: ignore
+       self.logits = tf.reshape(self.logits, [batch_size, -1, vocab_size]) #type: ignore
# igormq made it time major, because of an optimization in ctc_loss.
-       self.logits = tf.transpose(self.logits, (1, 0, 2), name="logits")
+       self.logits = tf.transpose(self.logits, (1, 0, 2), name="logits") #type: ignore

# For lattice construction
self.log_softmax = tf.nn.log_softmax(self.logits)
@@ -97,9 +98,9 @@ def __init__(self, exp_dir, corpus_reader, num_layers=3,
self.loss = tf.nn.ctc_loss(self.batch_y, self.logits, self.batch_x_lens,
preprocess_collapse_repeated=False, ctc_merge_repeated=True)
self.cost = tf.reduce_mean(self.loss)
-        self.optimizer = tf.train.AdamOptimizer().minimize(self.cost)
+        self.optimizer = tf.train.AdamOptimizer().minimize(self.cost) #type: ignore

self.ler = tf.reduce_mean(tf.edit_distance(
-            tf.cast(self.decoded[0], tf.int32), self.batch_y))
+            tf.cast(self.decoded[0], tf.int32), self.batch_y)) #type: ignore

self.write_desc()
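Note: for readers skimming the diff, the layer loop that the `#type: ignore` comments annotate builds a stack of bidirectional LSTMs roughly as below. This is a condensed sketch against the TF 1.x API pinned in setup.py; the cell constructor and the placeholder shapes are assumptions, not code from the PR:

    import tensorflow as tf

    num_layers, hidden_size = 3, 250
    batch_x = tf.placeholder(tf.float32, [None, None, 41])  # [batch, time, feats]
    batch_x_lens = tf.placeholder(tf.int32, [None])

    def lstm_cell(hidden_size):
        return tf.nn.rnn_cell.LSTMCell(hidden_size)  # assumed cell implementation

    layer_input = batch_x
    for i in range(num_layers):
        with tf.variable_scope("layer_%d" % i):
            (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                lstm_cell(hidden_size), lstm_cell(hidden_size), layer_input,
                sequence_length=batch_x_lens, dtype=tf.float32, time_major=False)
            # Outputs become [batch, time, hidden_size*2] and feed the next layer.
            layer_input = tf.concat((out_fw, out_bw), 2)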
2 changes: 0 additions & 2 deletions setup.py
@@ -27,8 +27,6 @@
'tensorflow==1.4.1',
'scikit-learn==0.19.1',
'pympi-ling==1.69',
-        'pylint==1.8.2',
-        'mypy==0.560',
'pydub==0.20.0',
'pint==0.8.1',
],
114 changes: 111 additions & 3 deletions stubs/tensorflow/__init__.pyi
@@ -1,6 +1,39 @@
-from typing import Any
+from typing import Any, Dict, Optional

from . import errors
from . import train
from . import nn

class dtype: ...


# Integer types
class number(): ...
class integer(number): ...
class signedinteger(integer): ...
class int8(signedinteger): ...
class int16(signedinteger): ...
class int32(signedinteger): ...
class int64(signedinteger): ...

class unsignedinteger(integer): ...
class uint8(unsignedinteger): ...
class uint16(unsignedinteger): ...
class uint32(unsignedinteger): ...
class uint64(unsignedinteger): ...

class floating(number): ...
class float16(floating): ...
class float32(floating): ...
class float64(floating): ...


class dtypes:
float32 = float32


class Tensor:
pass

class gpu_options:
def __init__(self):
@@ -14,10 +47,85 @@ class ConfigProto:
class Graph:
pass

-class Session:
-    def __init__(self, graph: Graph = None) -> None:
+class BaseSession:
+    # TODO: options is of type RunOptions, run_metadata is of type RunMetadata
+    # The return type is one of:
+    #   a single graph element, if fetches is a single graph element, OR
+    #   a list of graph elements, if fetches is a list of single graph elements, OR
+    #   a dictionary
+    # Leaving it as Any for now
+    def run(self, fetches: Any, feed_dict: Optional[Dict[Any, Any]] = None, run_options: Any = None, run_metadata: Any = None) -> Any: ...
+
+    def close(self) -> None: ...
+
+class Session(BaseSession):
+    def __init__(self, graph: Graph = None, config: ConfigProto = None) -> None:
         pass
     def __enter__(self):
         pass
     def __exit__(self, type, value, traceback):
         pass
+    def close(self) -> None: ...

# defined here https://github.com/tensorflow/tensorflow/blob/d8f9538ab48e3c677aaf532769d29bc29a05b76e/tensorflow/python/ops/variables.py#L40
class Variable:
def __init__(self,
initial_value: Any=None,
trainable: Optional[bool]=True,
collections: Optional[Any]=None,
validate_shape: Optional[bool]=True,
caching_device: Optional[Any]=None,
name: Optional[str]=None,
variable_def: Optional[Any]=None,
dtype: Optional[Any]=None,
expected_shape: Optional[Any]=None,
import_scope: Optional[str]=None,
constraint: Optional[Any]=None) -> None : ...


# Original function definition for edit_distance here:
# https://github.com/tensorflow/tensorflow/blob/faff6f2a60a01dba57cf3a3ab832279dbe174798/tensorflow/python/ops/array_ops.py#L2049
# return type is Tensor
def edit_distance(hypothesis: Any, truth: Any, normalize: Optional[bool]=True, name: Optional[str]="edit_distance") -> Any: ...

# Original function definition for global_variables_initializer here:
# https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/ops/variables.py#L1565
def global_variables_initializer() -> Any: ...

# Original function definition for reset_default_graph here:
# https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/framework/ops.py#L5531
def reset_default_graph() -> Graph: ...


# Original function definition for placeholder here:
# https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/ops/array_ops.py#L1693
# TODO: improve types
def placeholder(dtype: Any, shape: Any = None, name: Optional[str] = None) -> Any: ...

# Original function definition for sparse_placeholder here:
# https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/ops/array_ops.py#L1749
# TODO: improve types
def sparse_placeholder(dtype: Any, shape: Any = None, name: Optional[str] = None) -> Any: ...

# Original function definition for sparse_tensor_to_dense here:
# https://github.com/tensorflow/tensorflow/blob/d8f9538ab48e3c677aaf532769d29bc29a05b76e/tensorflow/python/ops/sparse_ops.py#L948
# sp_input is SparseTensor
# returns Tensor
def sparse_tensor_to_dense(sp_input: Any, default_value: Any=0, validate_indices: bool=True, name: Optional[str]=None) -> Any: ...

# Original function definition for shape here:
# https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/ops/array_ops.py#L197
# TODO: improve types. return type of None here is a hack
# input is `Tensor` or `SparseTensor`
# out_type is an optional integral data-type (`int32` or `int64`).
# returns a `Tensor` of type specified by `out_type`
def shape(input: Any, name: Optional[str] = None, out_type: Any = None) -> Any: ...

# Original function definition for truncated_normal here:
# https://github.com/tensorflow/tensorflow/blob/70cd9ed2d2ea37a6da6f813a99b32c03e90736a4/tensorflow/python/ops/random_ops.py#L139
def truncated_normal(shape: Any, mean: Any=0.0, stddev: Any=1.0, dtype: Any=dtypes.float32, seed: Any=None, name: Optional[str]=None) -> Any: ...

# Original function definition for reduce_mean here:
# https://github.com/tensorflow/tensorflow/blob/3f8febf04b075eef0950a18c7e122f0addeacfe9/tensorflow/python/ops/math_ops.py#L1384
# Returns Tensor
def reduce_mean(input_tensor: Any, axis: Any=None, keepdims: Any=None, name: Optional[str]=None, reduction_indices: Any=None, keep_dims: Any=None) -> Any: ...
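Note: taken together, these additions cover roughly the slice of TF 1.x graph-building code that `mypy persephone` now has signatures for. A small usage sketch (not code from the PR) exercising only the stubbed names:

    import tensorflow as tf

    tf.reset_default_graph()
    W = tf.Variable(tf.truncated_normal([4, 2], stddev=0.1))
    mean_w = tf.reduce_mean(W)
    init = tf.global_variables_initializer()

    with tf.Session(config=tf.ConfigProto()) as sess:  # `config` arg is newly stubbed
        sess.run(init)
        print(sess.run(mean_w))
        print(sess.run(tf.shape(W)))  # -> [4 2]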
7 changes: 7 additions & 0 deletions stubs/tensorflow/errors/__init__.pyi
@@ -0,0 +1,7 @@
# Base tensorflow exception class
# implemented here: https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/framework/errors_impl.py#L32
class OpError(Exception): ...


class ResourceExhaustedError(OpError): ...
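Note: these two classes are presumably stubbed so that model code can catch GPU out-of-memory failures under type checking. A hypothetical usage sketch (the helper and its messages are illustrative):

    import tensorflow as tf

    def run_step(sess: tf.Session, fetch, feed: dict):
        """Hypothetical helper: surfaces OOM failures with a clearer message."""
        try:
            return sess.run(fetch, feed_dict=feed)
        except tf.errors.ResourceExhaustedError:
            print("GPU memory exhausted; try a smaller batch_size or hidden_size.")
            raise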

29 changes: 29 additions & 0 deletions stubs/tensorflow/nn/__init__.pyi
@@ -0,0 +1,29 @@
from typing import Any, Optional, Tuple

# ctc_beam_search_decoder implemented here:
# https://github.com/tensorflow/tensorflow/blob/bb4e724f429ae5c9afad3a343dc1f483ecde1f74/tensorflow/python/ops/ctc_ops.py#L234
def ctc_beam_search_decoder(inputs : Any , sequence_length: Any, beam_width: int =100,
top_paths: int = 1, merge_repeated: bool = True) -> Tuple[Any, Any]: ...

# bidirectional_dynamic_rnn implemented here:
# https://github.com/tensorflow/tensorflow/blob/d8f9538ab48e3c677aaf532769d29bc29a05b76e/tensorflow/python/ops/rnn.py#L314
# TODO: types
# scope VariableScope
def bidirectional_dynamic_rnn(cell_fw: Any, cell_bw: Any, inputs: Any, sequence_length: Any = None,
initial_state_fw: Any = None, initial_state_bw: Any = None,
dtype: Any = None, parallel_iterations: Optional[int] = None,
swap_memory: Optional[bool]=False, time_major:Optional[bool]=False, scope: Any=None) -> Tuple[Any, Any]: ...

# ctc_loss implemented here:
# https://github.com/tensorflow/tensorflow/blob/bb4e724f429ae5c9afad3a343dc1f483ecde1f74/tensorflow/python/ops/ctc_ops.py#L32
# TODO: types
def ctc_loss(labels: Any, inputs: Any, sequence_length: Any,
preprocess_collapse_repeated: bool=False,
ctc_merge_repeated: bool=True, ignore_longer_outputs_than_inputs: bool=False,
time_major: bool=True) -> Any: ...

# log_softmax implemented here:
# https://github.com/tensorflow/tensorflow/blob/95c8f92947c6a420b70759d9d0d7825f2f5de368/tensorflow/python/ops/nn_ops.py#L1741
# TODO: types
# Returns Tensor
def log_softmax(logits: Any, axis: Optional[int] = None, name: Optional[str]=None, dim: Optional[int]=None) -> Any: ...
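Note: these signatures correspond to the CTC wiring in rnn_ctc.py; in isolation the calls look roughly like this (time-major logits as the diff's comment notes; the placeholder shapes and vocab size are assumptions):

    import tensorflow as tf

    vocab_size = 10
    batch_x_lens = tf.placeholder(tf.int32, [None])
    batch_y = tf.sparse_placeholder(tf.int32)                      # sparse labels
    logits = tf.placeholder(tf.float32, [None, None, vocab_size])  # [time, batch, vocab]

    loss = tf.nn.ctc_loss(batch_y, logits, batch_x_lens,
                          preprocess_collapse_repeated=False, ctc_merge_repeated=True)
    cost = tf.reduce_mean(loss)
    decoded, log_probs = tf.nn.ctc_beam_search_decoder(logits, batch_x_lens,
                                                       beam_width=100)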
11 changes: 10 additions & 1 deletion stubs/tensorflow/train/__init__.pyi
@@ -1,8 +1,17 @@
-from typing import Any
+from typing import Any, Optional

def import_meta_graph(path: str) -> Any:
pass

# Saver class defined here
# https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/training/saver.py#L1075
class Saver:
def restore(self, session: Any, path: str) -> None:
pass

# TODO: parameter types:
# sess is of type Session
# global_step is of type Tensor or integer
def save(self, sess: Any, save_path: str, global_step: Any = None, latest_filename: Optional[str]=None,
meta_graph_suffix: str = "meta", write_meta_graph: bool = True, write_state: bool = True,
strip_default_attrs: bool = False) -> Optional[str]: ...
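Note: the `save` signature mirrors TF 1.x checkpointing; a minimal round trip under these stubs might look like the following (the checkpoint path is hypothetical, and `Saver` needs at least one variable to exist):

    import tensorflow as tf

    v = tf.Variable(tf.zeros([1]))  # Saver refuses to build with no variables
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt_path = saver.save(sess, "./model_best.ckpt")  # hypothetical path
        saver.restore(sess, ckpt_path)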
6 changes: 0 additions & 6 deletions test-requirements.txt

This file was deleted.

4 changes: 4 additions & 0 deletions test_requirements.txt
@@ -0,0 +1,4 @@
tox
pylint>1.8
pytest
mypy>=0.6
