translation/tensorflow/process_data.py (1 addition, 1 deletion)

@@ -70,7 +70,7 @@
 # Vocabulary constants
 _TARGET_VOCAB_SIZE = 32768  # Number of subtokens in the vocabulary list.
 _TARGET_THRESHOLD = 327  # Accept vocabulary if size is within this threshold
-_VOCAB_FILE = "vocab.ende.%d" % _TARGET_VOCAB_SIZE
+_VOCAB_FILE = "vocab.ende."
 
 # Strings to include in the generated files.
 _PREFIX = "wmt32k"
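Note: taken together with the tokenizer.py change below, the vocabulary filename is now derived from the actual number of generated subtokens rather than from the requested target size. A minimal sketch of the resulting naming behavior (the list contents here are illustrative stand-ins, not the real vocabulary):

    _VOCAB_FILE = "vocab.ende."  # prefix only after this change

    # Illustrative stand-in for the generated subtoken list (~33k entries in practice).
    subtoken_list = ["tok%d" % i for i in range(33945)]
    vocab_file = _VOCAB_FILE + str(len(subtoken_list))
    print(vocab_file)  # vocab.ende.33945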
translation/tensorflow/transformer/model/model_params.py (1 addition, 1 deletion)

@@ -23,7 +23,7 @@ class TransformerBaseParams(object):
 
   # Model params
   initializer_gain = 1.0  # Used in trainable variable initialization.
-  vocab_size = 33708  # Number of tokens defined in the vocabulary file.
+  vocab_size = 33945  # Number of tokens defined in the vocabulary file.
   hidden_size = 512  # Model dimension in the hidden layers.
   num_hidden_layers = 6  # Number of layers in the encoder and decoder stacks.
   num_heads = 8  # Number of heads to use in multi-headed attention.
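Note: vocab_size is a hard-coded hyperparameter, so it must agree with the size of the vocabulary that process_data.py actually generates; with the naming change above, that size is visible in the filename itself. A hypothetical consistency check (the path and helper are illustrative, not part of this PR):

    vocab_size = 33945  # TransformerBaseParams.vocab_size
    vocab_file = "vocab.ende." + str(vocab_size)

    def count_vocab_entries(path):
        # Illustrative assumption: the saved vocab file stores one subtoken per line.
        with open(path) as f:
            return sum(1 for _ in f)

    # Would hold for a correctly generated vocab file:
    # assert count_vocab_entries(vocab_file) == vocab_size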
translation/tensorflow/transformer/utils/tokenizer.py (7 additions, 6 deletions)

@@ -117,6 +117,7 @@ def init_from_files(
           reserved_tokens)
       tf.logging.info("Generated vocabulary with %d subtokens." %
                       len(subtoken_list))
+      vocab_file += str(len(subtoken_list))
       _save_vocab_file(vocab_file, subtoken_list)
     return Subtokenizer(vocab_file)
@@ -393,12 +394,6 @@ def _generate_subtokens_with_target_vocab_size(
   if reserved_tokens is None:
     reserved_tokens = RESERVED_TOKENS
 
-  if min_count is not None:
-    tf.logging.info("Using min_count=%d to generate vocab with target size %d" %
-                    (min_count, target_size))
-    return _generate_subtokens(
-        token_counts, alphabet, min_count, reserved_tokens=reserved_tokens)
-
   def bisect(min_val, max_val):
     """Recursive function to binary search for subtoken vocabulary."""
     cur_count = (min_val + max_val) // 2

@@ -425,6 +420,12 @@ def bisect(min_val, max_val):
       return other_subtoken_list
     return subtoken_list
 
+  if min_count is not None:
+    tf.logging.info("Using min_count=%d to generate vocab with target size %d" %
+                    (min_count, target_size))
+    # Perform binary search with enforced minimum value of min_count.
+    return bisect(min_count, _MAX_MIN_COUNT)
+
   tf.logging.info("Finding best min_count to get target size of %d" %
                   target_size)
   return bisect(_MIN_MIN_COUNT, _MAX_MIN_COUNT)
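Note: with this change a caller-supplied min_count no longer short-circuits into a single _generate_subtokens call; it instead becomes the lower bound of the same binary search used when min_count is absent, so the threshold around target_size is still honored. A self-contained sketch of that search strategy, with a hypothetical build_vocab standing in for _generate_subtokens and assumed bounds mirroring tokenizer.py:

    _MIN_MIN_COUNT = 1     # assumed search bounds, as in tokenizer.py
    _MAX_MIN_COUNT = 1000

    def build_vocab(min_count):
        # Hypothetical: raising min_count prunes rare tokens, shrinking the vocab.
        return ["tok"] * max(0, 40000 - 600 * min_count)

    def bisect(min_val, max_val, target_size, threshold):
        """Binary-search min_count for a vocab size within threshold of target_size."""
        cur_count = (min_val + max_val) // 2
        vocab = build_vocab(cur_count)
        within = abs(len(vocab) - target_size) < threshold
        if within or min_val >= max_val or cur_count < 2:
            return vocab
        if len(vocab) > target_size:
            # Vocab too large: a higher min_count prunes more tokens.
            other = bisect(cur_count + 1, max_val, target_size, threshold)
        else:
            other = bisect(min_val, cur_count - 1, target_size, threshold)
        # Keep whichever candidate lands closer to the target.
        if abs(len(other) - target_size) < abs(len(vocab) - target_size):
            return other
        return vocab

    # The patched path: bisect(min_count, _MAX_MIN_COUNT) instead of building once.
    print(len(bisect(12, _MAX_MIN_COUNT, target_size=32768, threshold=327)))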