Commit cfe5096
Merge pull request #147 from customprogrammingsolutions/refactor-model
[MRG] Clean up Model
oadams committed May 28, 2018
2 parents aa69575 + 4da960c commit cfe5096
Showing 13 changed files with 383 additions and 196 deletions.
1 change: 1 addition & 0 deletions .travis.yml
@@ -6,6 +6,7 @@ install:
- pip install .
- pip install pytest-cov
- pip install python-coveralls
+  - pip install -r test_requirements.txt
script:
- pylint -E persephone
- mypy persephone
17 changes: 12 additions & 5 deletions persephone/corpus.py
@@ -81,6 +81,8 @@ def __init__(self, feat_type, label_type, tgt_dir, labels,
included in the corpus.
"""
+        if speakers:
+            raise NotImplementedError("Speakers not implemented")

logger.debug("Creating a new Corpus object with feature type %s, label type %s,"
"target directory %s, label set %s, max_samples %d, speakers %s",
@@ -151,29 +153,34 @@ def from_elan(cls: Type[CorpusT], org_dir: Path, tgt_dir: Path,
codeswitched utterances.
            label_segmenter: An object that has an attribute `segment_labels`,
                which creates new `Utterance` instances from old ones,
-               by segmenting the tokens in their `text attribute. Note,
+               by segmenting the tokens in their `text` attribute. Note,
                `LabelSegmenter` might be better as a function, the only issue
                is it needs to carry with it a list of labels. This could
                potentially be a function attribute.
-           speakers: A list of speakers to filter for. If None, utterances
+           speakers: A list of speakers to filter for. If `None`, utterances
                from all speakers are included.
            tier_prefixes: A collection of strings that prefix ELAN tiers to
-               filter for. For example, if this is ("xv", "rf"), then tiers
+               filter for. For example, if this is `("xv", "rf")`, then tiers
                named "xv", "xv@Mark", "rf@Rose" would be extracted if they
                existed.
"""
+        # This currently bails out if label_segmenter is not provided
+        if not label_segmenter:
+            raise ValueError("A label segmenter must be provided via label_segmenter")

# Read utterances from org_dir.
utterances = elan.utterances_from_dir(org_dir,
tier_prefixes=tier_prefixes)

# Filter utterances based on some criteria (such as codeswitching).
-        utterances = [utter for utter in utterances if utterance_filter(utter)]
+        if utterance_filter:
+            utterances = [utter for utter in utterances if utterance_filter(utter)]
utterances = utterance.remove_duplicates(utterances)

# Segment the labels in the utterances appropriately
-        utterances = [label_segmenter.segment_labels(utter) for utter in utterances]
+        if label_segmenter:
+            utterances = [label_segmenter.segment_labels(utter) for utter in utterances]

# Remove utterances without transcriptions.
utterances = utterance.remove_empty_text(utterances)
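Note: with this hunk, `label_segmenter` is now mandatory while `utterance_filter` and `speakers` are optional. A minimal sketch of the resulting caller contract follows; the segmenter class, the paths, the import location of `Utterance`, and the assumption that `Utterance` is a NamedTuple are all illustrative, not taken from this PR:

    from pathlib import Path

    from persephone.corpus import Corpus
    from persephone.utterance import Utterance  # assumed location of Utterance

    class CharSegmenter:
        """Hypothetical segmenter: splits transcriptions into characters."""
        labels = {"a", "b", "c"}  # the label inventory the segmenter carries

        @staticmethod
        def segment_labels(utter: Utterance) -> Utterance:
            # Assumes Utterance is a NamedTuple with a `text` field.
            return utter._replace(text=" ".join(utter.text))

    # label_segmenter is now required; omitting it raises ValueError.
    corpus = Corpus.from_elan(org_dir=Path("elan_recordings/"),  # hypothetical paths
                              tgt_dir=Path("preprocessed/"),
                              label_segmenter=CharSegmenter)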
4 changes: 3 additions & 1 deletion persephone/corpus_reader.py
@@ -24,7 +24,9 @@ class CorpusReader:
rand = True

def __init__(self, corpus, num_train=None, batch_size=None, max_samples=None, rand_seed=0):
""" corpus: The Corpus object that interfaces with a given corpus.
""" Construct a new `CorpusReader` instance.
corpus: The Corpus object that interfaces with a given corpus.
num_train: The number of training instances from the corpus used.
batch_size: The size of the batches to yield. If None, then it is
num_train / 32.0.
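Note: the fallback the docstring describes amounts to roughly the following (a sketch; the exact rounding and clamping live in corpus_reader.py):

    def default_batch_size(num_train: int) -> int:
        # The documented fallback: if batch_size is None, use num_train / 32.0,
        # kept at least 1 so tiny corpora still yield a batch.
        return max(1, int(num_train / 32.0))

    print(default_batch_size(2048))  # 64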
350 changes: 186 additions & 164 deletions persephone/model.py

Large diffs are not rendered by default.

7 changes: 6 additions & 1 deletion persephone/results.py
@@ -18,6 +18,10 @@ def filter_labels(sent: Sequence[str], labels: Set[str] = None) -> List[str]:
def filtered_error_rate(hyps_path: Union[str, Path], refs_path: Union[str, Path], labels: Set[str]) -> float:
""" Returns the error rate of hypotheses in hyps_path against references in refs_path after filtering only for labels in labels.
"""
+    if isinstance(hyps_path, Path):
+        hyps_path = str(hyps_path)
+    if isinstance(refs_path, Path):
+        refs_path = str(refs_path)

with open(hyps_path) as hyps_f:
lines = hyps_f.readlines()
@@ -29,8 +33,9 @@ def filtered_error_rate(hyps_path: Union[str, Path], refs_path: Union[str, Path], labels: Set[str]) -> float:
# For the case where there are no tokens left after filtering.
only_empty = True
    for entry in hyps:
-        if entry != []:
+        if entry is not []:
             only_empty = False
+            break # found something so can move on immediately
if only_empty:
return -1

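Note: one thing worth flagging in the hunk above is that `entry is not []` tests object identity against a freshly built list, so it is True for every entry, empty or not; only `entry != []` (or a truthiness check) compares contents. A quick illustration:

    entry = []
    print(entry != [])      # False: contents are equal
    print(entry is not [])  # True: two distinct list objects
    print(bool(entry))      # False: the idiomatic emptiness test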
27 changes: 14 additions & 13 deletions persephone/rnn_ctc.py
@@ -15,16 +15,17 @@ def lstm_cell(hidden_size):
class Model(model.Model):
""" An acoustic model with a LSTM/CTC architecture. """

-    def write_desc(self):
+    def write_desc(self) -> None:
""" Writes a description of the model to the exp_dir. """

path = os.path.join(self.exp_dir, "model_description.txt")
with open(path, "w") as desc_f:
for key, val in self.__dict__.items():
print("%s=%s" % (key, val), file=desc_f)

-    def __init__(self, exp_dir, corpus_reader, num_layers=3,
-                 hidden_size=250, beam_width=100, decoding_merge_repeated=True):
+    def __init__(self, exp_dir, corpus_reader, num_layers: int = 3,
+                 hidden_size: int=250, beam_width: int = 100,
+                 decoding_merge_repeated: bool = True) -> None:
super().__init__(exp_dir, corpus_reader)

if not os.path.isdir(exp_dir):
@@ -55,7 +56,7 @@ def __init__(self, exp_dir, corpus_reader, num_layers=3,

for i in range(num_layers):

-            with tf.variable_scope("layer_%d" % i):
+            with tf.variable_scope("layer_%d" % i): #type: ignore

cell_fw = lstm_cell(self.hidden_size)
cell_bw = lstm_cell(self.hidden_size)
@@ -65,22 +66,22 @@ def __init__(self, exp_dir, corpus_reader, num_layers=3,
time_major=False)

# Self outputs now becomes [batch_num, time, hidden_size*2]
-                self.outputs_concat = tf.concat((self.out_fw, self.out_bw), 2)
+                self.outputs_concat = tf.concat((self.out_fw, self.out_bw), 2) #type: ignore

# For feeding into the next layer
layer_input = self.outputs_concat

-        self.outputs = tf.reshape(self.outputs_concat, [-1, self.hidden_size*2])
+        self.outputs = tf.reshape(self.outputs_concat, [-1, self.hidden_size*2]) #type: ignore

        # Single-variable names are appropriate for weights and biases.
# pylint: disable=invalid-name
        W = tf.Variable(tf.truncated_normal([hidden_size*2, vocab_size],
-                       stddev=np.sqrt(2.0 / (2*hidden_size))))
-       b = tf.Variable(tf.zeros([vocab_size]))
-       self.logits = tf.matmul(self.outputs, W) + b
-       self.logits = tf.reshape(self.logits, [batch_size, -1, vocab_size])
+                       stddev=np.sqrt(2.0 / (2*hidden_size)))) #type: ignore
+       b = tf.Variable(tf.zeros([vocab_size])) #type: ignore
+       self.logits = tf.matmul(self.outputs, W) + b #type: ignore
+       self.logits = tf.reshape(self.logits, [batch_size, -1, vocab_size]) #type: ignore
# igormq made it time major, because of an optimization in ctc_loss.
-       self.logits = tf.transpose(self.logits, (1, 0, 2), name="logits")
+       self.logits = tf.transpose(self.logits, (1, 0, 2), name="logits") #type: ignore

# For lattice construction
self.log_softmax = tf.nn.log_softmax(self.logits)
@@ -97,9 +98,9 @@ def __init__(self, exp_dir, corpus_reader, num_layers=3,
self.loss = tf.nn.ctc_loss(self.batch_y, self.logits, self.batch_x_lens,
preprocess_collapse_repeated=False, ctc_merge_repeated=True)
self.cost = tf.reduce_mean(self.loss)
-        self.optimizer = tf.train.AdamOptimizer().minimize(self.cost)
+        self.optimizer = tf.train.AdamOptimizer().minimize(self.cost) #type: ignore

self.ler = tf.reduce_mean(tf.edit_distance(
-            tf.cast(self.decoded[0], tf.int32), self.batch_y))
+            tf.cast(self.decoded[0], tf.int32), self.batch_y)) #type: ignore

self.write_desc()
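Note: for readers skimming the diff, the layer loop that the `#type: ignore` comments annotate builds a stack of bidirectional LSTMs roughly as below. This is a condensed sketch against the TF 1.x API pinned in setup.py; the cell constructor and the placeholder shapes are assumptions, not code from the PR:

    import tensorflow as tf

    num_layers, hidden_size = 3, 250
    batch_x = tf.placeholder(tf.float32, [None, None, 41])  # [batch, time, feats]
    batch_x_lens = tf.placeholder(tf.int32, [None])

    def lstm_cell(hidden_size):
        return tf.nn.rnn_cell.LSTMCell(hidden_size)  # assumed cell implementation

    layer_input = batch_x
    for i in range(num_layers):
        with tf.variable_scope("layer_%d" % i):
            (out_fw, out_bw), _ = tf.nn.bidirectional_dynamic_rnn(
                lstm_cell(hidden_size), lstm_cell(hidden_size), layer_input,
                sequence_length=batch_x_lens, dtype=tf.float32, time_major=False)
            # Outputs become [batch, time, hidden_size*2] and feed the next layer.
            layer_input = tf.concat((out_fw, out_bw), 2)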
2 changes: 0 additions & 2 deletions setup.py
@@ -27,8 +27,6 @@
'tensorflow==1.4.1',
'scikit-learn==0.19.1',
'pympi-ling==1.69',
-        'pylint==1.8.2',
-        'mypy==0.560',
'pydub==0.20.0',
'pint==0.8.1',
],
114 changes: 111 additions & 3 deletions stubs/tensorflow/__init__.pyi
@@ -1,6 +1,39 @@
-from typing import Any
+from typing import Any, Dict, Optional

from . import errors
from . import train
from . import nn

class dtype: ...


# Integer types
class number(): ...
class integer(number): ...
class signedinteger(integer): ...
class int8(signedinteger): ...
class int16(signedinteger): ...
class int32(signedinteger): ...
class int64(signedinteger): ...

class unsignedinteger(integer): ...
class uint8(unsignedinteger): ...
class uint16(unsignedinteger): ...
class uint32(unsignedinteger): ...
class uint64(unsignedinteger): ...

class floating(number): ...
class float16(floating): ...
class float32(floating): ...
class float64(floating): ...


class dtypes:
float32 = float32


class Tensor:
pass

class gpu_options:
def __init__(self):
@@ -14,10 +47,85 @@ class ConfigProto:
class Graph:
pass

-class Session:
-    def __init__(self, graph: Graph = None) -> None:
+class BaseSession:
+    # TODO: options is of type RunOptions, run_metadata is of type RunMetadata
+    # The return type is one of:
+    #   a single graph element, if fetches is a single graph element, OR
+    #   a list of graph elements, if fetches is a list of single graph elements, OR
+    #   a dictionary
+    # Leaving it as Any for now
+    def run(self, fetches: Any, feed_dict: Optional[Dict[Any, Any]] = None, run_options: Any = None, run_metadata: Any = None) -> Any: ...
+
+    def close(self) -> None: ...
+
+class Session(BaseSession):
+    def __init__(self, graph: Graph = None, config: ConfigProto = None) -> None:
         pass
     def __enter__(self):
         pass
     def __exit__(self, type, value, traceback):
         pass
+    def close(self) -> None: ...

# defined here https://github.com/tensorflow/tensorflow/blob/d8f9538ab48e3c677aaf532769d29bc29a05b76e/tensorflow/python/ops/variables.py#L40
class Variable:
def __init__(self,
initial_value: Any=None,
trainable: Optional[bool]=True,
collections: Optional[Any]=None,
validate_shape: Optional[bool]=True,
caching_device: Optional[Any]=None,
name: Optional[str]=None,
variable_def: Optional[Any]=None,
dtype: Optional[Any]=None,
expected_shape: Optional[Any]=None,
import_scope: Optional[str]=None,
constraint: Optional[Any]=None) -> None : ...


# Original function definition for edit_distance here:
# https://github.com/tensorflow/tensorflow/blob/faff6f2a60a01dba57cf3a3ab832279dbe174798/tensorflow/python/ops/array_ops.py#L2049
# return type is Tensor
def edit_distance(hypothesis: Any, truth: Any, normalize: Optional[bool]=True, name: Optional[str]="edit_distance") -> Any: ...

# Original function definition for global_variables_initializer here:
# https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/ops/variables.py#L1565
def global_variables_initializer() -> Any: ...

# Original function definition for reset_default_graph here:
# https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/framework/ops.py#L5531
def reset_default_graph() -> Graph: ...


# Original function definition for placeholder here:
# https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/ops/array_ops.py#L1693
# TODO: improve types
def placeholder(dtype: Any, shape: Any = None, name: Optional[str] = None) -> Any: ...

# Original function definition for sparse_placeholder here:
# https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/ops/array_ops.py#L1749
# TODO: improve types
def sparse_placeholder(dtype: Any, shape: Any = None, name: Optional[str] = None) -> Any: ...

# Original function definition for sparse_tensor_to_dense here:
# https://github.com/tensorflow/tensorflow/blob/d8f9538ab48e3c677aaf532769d29bc29a05b76e/tensorflow/python/ops/sparse_ops.py#L948
# sp_input is SparseTensor
# returns Tensor
def sparse_tensor_to_dense(sp_input: Any, default_value: Any=0, validate_indices: bool=True, name: Optional[str]=None) -> Any: ...

# Original function definition for shape here:
# https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/ops/array_ops.py#L197
# TODO: improve types. return type of None here is a hack
# input is `Tensor` or `SparseTensor`
# out_type is an optional integral data-type (`int32` or `int64`).
# returns a `Tensor` of type specified by `out_type`
def shape(input: Any, name: Optional[str] = None, out_type: Any = None) -> Any: ...

# Original function definition for truncated_normal here:
# https://github.com/tensorflow/tensorflow/blob/70cd9ed2d2ea37a6da6f813a99b32c03e90736a4/tensorflow/python/ops/random_ops.py#L139
def truncated_normal(shape: Any, mean: Any=0.0, stddev: Any=1.0, dtype: Any=dtypes.float32, seed: Any=None, name: Optional[str]=None) -> Any: ...

# Original function definition for reduce_mean here:
# https://github.com/tensorflow/tensorflow/blob/3f8febf04b075eef0950a18c7e122f0addeacfe9/tensorflow/python/ops/math_ops.py#L1384
# Returns Tensor
def reduce_mean(input_tensor: Any, axis: Any=None, keepdims: Any=None, name: Optional[str]=None, reduction_indices: Any=None, keep_dims: Any=None) -> Any: ...
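Note: taken together, these additions cover roughly the slice of TF 1.x graph-building code that `mypy persephone` now has signatures for. A small usage sketch (not code from the PR) exercising only the stubbed names:

    import tensorflow as tf

    tf.reset_default_graph()
    W = tf.Variable(tf.truncated_normal([4, 2], stddev=0.1))
    mean_w = tf.reduce_mean(W)
    init = tf.global_variables_initializer()

    with tf.Session(config=tf.ConfigProto()) as sess:  # `config` arg is newly stubbed
        sess.run(init)
        print(sess.run(mean_w))
        print(sess.run(tf.shape(W)))  # -> [4 2]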
7 changes: 7 additions & 0 deletions stubs/tensorflow/errors/__init__.pyi
@@ -0,0 +1,7 @@
# Base tensorflow exception class
# implemented here: https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/framework/errors_impl.py#L32
class OpError(Exception): ...


class ResourceExhaustedError(OpError): ...
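Note: these two classes are presumably stubbed so that model code can catch GPU out-of-memory failures under type checking. A hypothetical usage sketch (the helper and its messages are illustrative):

    import tensorflow as tf

    def run_step(sess: tf.Session, fetch, feed: dict):
        """Hypothetical helper: surfaces OOM failures with a clearer message."""
        try:
            return sess.run(fetch, feed_dict=feed)
        except tf.errors.ResourceExhaustedError:
            print("GPU memory exhausted; try a smaller batch_size or hidden_size.")
            raise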

29 changes: 29 additions & 0 deletions stubs/tensorflow/nn/__init__.pyi
@@ -0,0 +1,29 @@
from typing import Any, Optional, Tuple

# ctc_beam_search_decoder implemented here:
# https://github.com/tensorflow/tensorflow/blob/bb4e724f429ae5c9afad3a343dc1f483ecde1f74/tensorflow/python/ops/ctc_ops.py#L234
def ctc_beam_search_decoder(inputs : Any , sequence_length: Any, beam_width: int =100,
top_paths: int = 1, merge_repeated: bool = True) -> Tuple[Any, Any]: ...

# bidirectional_dynamic_rnn implemented here:
# https://github.com/tensorflow/tensorflow/blob/d8f9538ab48e3c677aaf532769d29bc29a05b76e/tensorflow/python/ops/rnn.py#L314
# TODO: types
# scope VariableScope
def bidirectional_dynamic_rnn(cell_fw: Any, cell_bw: Any, inputs: Any, sequence_length: Any = None,
initial_state_fw: Any = None, initial_state_bw: Any = None,
dtype: Any = None, parallel_iterations: Optional[int] = None,
swap_memory: Optional[bool]=False, time_major:Optional[bool]=False, scope: Any=None) -> Tuple[Any, Any]: ...

# ctc_loss implemented here:
# https://github.com/tensorflow/tensorflow/blob/bb4e724f429ae5c9afad3a343dc1f483ecde1f74/tensorflow/python/ops/ctc_ops.py#L32
# TODO: types
def ctc_loss(labels: Any, inputs: Any, sequence_length: Any,
preprocess_collapse_repeated: bool=False,
ctc_merge_repeated: bool=True, ignore_longer_outputs_than_inputs: bool=False,
time_major: bool=True) -> Any: ...

# log_softmax implemented here:
# https://github.com/tensorflow/tensorflow/blob/95c8f92947c6a420b70759d9d0d7825f2f5de368/tensorflow/python/ops/nn_ops.py#L1741
# TODO: types
# Returns Tensor
def log_softmax(logits: Any, axis: Optional[int] = None, name: Optional[str]=None, dim: Optional[int]=None) -> Any: ...
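Note: these signatures correspond to the CTC wiring in rnn_ctc.py; in isolation the calls look roughly like this (time-major logits as the diff's comment notes; the placeholder shapes and vocab size are assumptions):

    import tensorflow as tf

    vocab_size = 10
    batch_x_lens = tf.placeholder(tf.int32, [None])
    batch_y = tf.sparse_placeholder(tf.int32)                      # sparse labels
    logits = tf.placeholder(tf.float32, [None, None, vocab_size])  # [time, batch, vocab]

    loss = tf.nn.ctc_loss(batch_y, logits, batch_x_lens,
                          preprocess_collapse_repeated=False, ctc_merge_repeated=True)
    cost = tf.reduce_mean(loss)
    decoded, log_probs = tf.nn.ctc_beam_search_decoder(logits, batch_x_lens,
                                                       beam_width=100)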
11 changes: 10 additions & 1 deletion stubs/tensorflow/train/__init__.pyi
@@ -1,8 +1,17 @@
-from typing import Any
+from typing import Any, Optional

def import_meta_graph(path: str) -> Any:
pass

# Saver class defined here
# https://github.com/tensorflow/tensorflow/blob/28340a4b12e286fe14bb7ac08aebe325c3e150b4/tensorflow/python/training/saver.py#L1075
class Saver:
def restore(self, session: Any, path: str) -> None:
pass

# TODO: parameter types:
# sess is of type Session
# global_step is of type Tensor or integer
def save(self, sess: Any, save_path: str, global_step: Any = None, latest_filename: Optional[str]=None,
meta_graph_suffix: str = "meta", write_meta_graph: bool = True, write_state: bool = True,
strip_default_attrs: bool = False) -> Optional[str]: ...
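Note: the `save` signature mirrors TF 1.x checkpointing; a minimal round trip under these stubs might look like the following (the checkpoint path is hypothetical, and `Saver` needs at least one variable to exist):

    import tensorflow as tf

    v = tf.Variable(tf.zeros([1]))  # Saver refuses to build with no variables
    saver = tf.train.Saver()

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        ckpt_path = saver.save(sess, "./model_best.ckpt")  # hypothetical path
        saver.restore(sess, ckpt_path)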
6 changes: 0 additions & 6 deletions test-requirements.txt

This file was deleted.

4 changes: 4 additions & 0 deletions test_requirements.txt
@@ -0,0 +1,4 @@
tox
pylint>1.8
pytest
mypy>=0.6
