From 2c87299fb4d7b83db212759a77aeff0fbcb2ebd9 Mon Sep 17 00:00:00 2001
From: tremblerz
Date: Mon, 11 Jun 2018 18:13:11 -0700
Subject: [PATCH] Fix vocab file name and change vocabsize in params

---
 translation/tensorflow/process_data.py            |  2 +-
 .../tensorflow/transformer/model/model_params.py  |  2 +-
 .../tensorflow/transformer/utils/tokenizer.py     | 13 +++++++------
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/translation/tensorflow/process_data.py b/translation/tensorflow/process_data.py
index 4781ef339..ac89be396 100644
--- a/translation/tensorflow/process_data.py
+++ b/translation/tensorflow/process_data.py
@@ -70,7 +70,7 @@
 # Vocabulary constants
 _TARGET_VOCAB_SIZE = 32768  # Number of subtokens in the vocabulary list.
 _TARGET_THRESHOLD = 327  # Accept vocabulary if size is within this threshold
-_VOCAB_FILE = "vocab.ende.%d" % _TARGET_VOCAB_SIZE
+_VOCAB_FILE = "vocab.ende."
 
 # Strings to inclue in the generated files.
 _PREFIX = "wmt32k"
diff --git a/translation/tensorflow/transformer/model/model_params.py b/translation/tensorflow/transformer/model/model_params.py
index aaa5bf26a..9b6cb53ba 100644
--- a/translation/tensorflow/transformer/model/model_params.py
+++ b/translation/tensorflow/transformer/model/model_params.py
@@ -23,7 +23,7 @@ class TransformerBaseParams(object):
 
   # Model params
   initializer_gain = 1.0  # Used in trainable variable initialization.
-  vocab_size = 33708  # Number of tokens defined in the vocabulary file.
+  vocab_size = 33945  # Number of tokens defined in the vocabulary file.
   hidden_size = 512  # Model dimension in the hidden layers.
   num_hidden_layers = 6  # Number of layers in the encoder and decoder stacks.
   num_heads = 8  # Number of heads to use in multi-headed attention.
diff --git a/translation/tensorflow/transformer/utils/tokenizer.py b/translation/tensorflow/transformer/utils/tokenizer.py
index 3e2140b72..69cf45379 100644
--- a/translation/tensorflow/transformer/utils/tokenizer.py
+++ b/translation/tensorflow/transformer/utils/tokenizer.py
@@ -117,6 +117,7 @@ def init_from_files(
           reserved_tokens)
       tf.logging.info("Generated vocabulary with %d subtokens." %
                       len(subtoken_list))
+      vocab_file += str(len(subtoken_list))
       _save_vocab_file(vocab_file, subtoken_list)
     return Subtokenizer(vocab_file)
 
@@ -393,12 +394,6 @@ def _generate_subtokens_with_target_vocab_size(
   if reserved_tokens is None:
     reserved_tokens = RESERVED_TOKENS
 
-  if min_count is not None:
-    tf.logging.info("Using min_count=%d to generate vocab with target size %d" %
-                    (min_count, target_size))
-    return _generate_subtokens(
-        token_counts, alphabet, min_count, reserved_tokens=reserved_tokens)
-
   def bisect(min_val, max_val):
     """Recursive function to binary search for subtoken vocabulary."""
     cur_count = (min_val + max_val) // 2
@@ -425,6 +420,12 @@ def bisect(min_val, max_val):
       return other_subtoken_list
     return subtoken_list
 
+  if min_count is not None:
+    tf.logging.info("Using min_count=%d to generate vocab with target size %d" %
+                    (min_count, target_size))
+    # Perform binary search with enforced minimum value of min_count
+    return bisect(min_count, _MAX_MIN_COUNT)
+
   tf.logging.info("Finding best min_count to get target size of %d" %
                   target_size)
   return bisect(_MIN_MIN_COUNT, _MAX_MIN_COUNT)
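
Note (not part of the patch): after this change the vocab file name is only a prefix, and init_from_files appends the actual number of generated subtokens before saving, so the file on disk becomes e.g. vocab.ende.33945 and must agree with vocab_size in model_params.py. A minimal sketch of that naming behaviour follows; the subtoken list and its length are fabricated for illustration and do not come from the real tokenizer, and RESERVED_TOKENS is assumed to hold the two reserved entries.

    # Sketch of the naming behaviour introduced above; plain Python, no TensorFlow needed.
    _VOCAB_FILE = "vocab.ende."  # prefix only; the target size is no longer baked into the name

    RESERVED_TOKENS = ["<pad>", "<EOS>"]                      # assumed reserved entries
    subtoken_list = RESERVED_TOKENS + ["tok_%d" % i for i in range(33943)]  # 33945 fake subtokens

    vocab_file = _VOCAB_FILE + str(len(subtoken_list))        # mirrors the added line in init_from_files
    print(vocab_file)                                         # vocab.ende.33945
    assert len(subtoken_list) == 33945                        # must match vocab_size in model_params.py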