translation/tensorflow/process_data.py (1 addition, 1 deletion)

@@ -70,7 +70,7 @@
 # Vocabulary constants
 _TARGET_VOCAB_SIZE = 32768  # Number of subtokens in the vocabulary list.
 _TARGET_THRESHOLD = 327  # Accept vocabulary if size is within this threshold
-_VOCAB_FILE = "vocab.ende.%d" % _TARGET_VOCAB_SIZE
+_VOCAB_FILE = "vocab.ende."
 
 # Strings to include in the generated files.
 _PREFIX = "wmt32k"
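Note: taken together with the tokenizer.py change below, the vocabulary filename is now derived from the actual number of generated subtokens rather than from the requested target size. A minimal sketch of the resulting naming behavior (the list contents here are illustrative stand-ins, not the real vocabulary):

    _VOCAB_FILE = "vocab.ende."  # prefix only after this change

    # Illustrative stand-in for the generated subtoken list (~33k entries in practice).
    subtoken_list = ["tok%d" % i for i in range(33945)]
    vocab_file = _VOCAB_FILE + str(len(subtoken_list))
    print(vocab_file)  # vocab.ende.33945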
translation/tensorflow/transformer/model/model_params.py (1 addition, 1 deletion)

@@ -23,7 +23,7 @@ class TransformerBaseParams(object):
 
   # Model params
   initializer_gain = 1.0  # Used in trainable variable initialization.
-  vocab_size = 33708  # Number of tokens defined in the vocabulary file.
+  vocab_size = 33945  # Number of tokens defined in the vocabulary file.
   hidden_size = 512  # Model dimension in the hidden layers.
   num_hidden_layers = 6  # Number of layers in the encoder and decoder stacks.
   num_heads = 8  # Number of heads to use in multi-headed attention.
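Note: vocab_size is a hard-coded hyperparameter, so it must agree with the size of the vocabulary that process_data.py actually generates; with the naming change above, that size is visible in the filename itself. A hypothetical consistency check (the path and helper are illustrative, not part of this PR):

    vocab_size = 33945  # TransformerBaseParams.vocab_size
    vocab_file = "vocab.ende." + str(vocab_size)

    def count_vocab_entries(path):
        # Illustrative assumption: the saved vocab file stores one subtoken per line.
        with open(path) as f:
            return sum(1 for _ in f)

    # Would hold for a correctly generated vocab file:
    # assert count_vocab_entries(vocab_file) == vocab_size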
translation/tensorflow/transformer/utils/tokenizer.py (7 additions, 6 deletions)

@@ -117,6 +117,7 @@ def init_from_files(
           reserved_tokens)
       tf.logging.info("Generated vocabulary with %d subtokens." %
                       len(subtoken_list))
+      vocab_file += str(len(subtoken_list))
       _save_vocab_file(vocab_file, subtoken_list)
     return Subtokenizer(vocab_file)
@@ -393,12 +394,6 @@ def _generate_subtokens_with_target_vocab_size(
   if reserved_tokens is None:
     reserved_tokens = RESERVED_TOKENS
 
-  if min_count is not None:
-    tf.logging.info("Using min_count=%d to generate vocab with target size %d" %
-                    (min_count, target_size))
-    return _generate_subtokens(
-        token_counts, alphabet, min_count, reserved_tokens=reserved_tokens)
-
   def bisect(min_val, max_val):
     """Recursive function to binary search for subtoken vocabulary."""
     cur_count = (min_val + max_val) // 2

@@ -425,6 +420,12 @@ def bisect(min_val, max_val):
       return other_subtoken_list
     return subtoken_list
 
+  if min_count is not None:
+    tf.logging.info("Using min_count=%d to generate vocab with target size %d" %
+                    (min_count, target_size))
+    # Perform binary search with enforced minimum value of min_count.
+    return bisect(min_count, _MAX_MIN_COUNT)
+
   tf.logging.info("Finding best min_count to get target size of %d" %
                   target_size)
   return bisect(_MIN_MIN_COUNT, _MAX_MIN_COUNT)
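Note: with this change a caller-supplied min_count no longer short-circuits into a single _generate_subtokens call; it instead becomes the lower bound of the same binary search used when min_count is absent, so the threshold around target_size is still honored. A self-contained sketch of that search strategy, with a hypothetical build_vocab standing in for _generate_subtokens and assumed bounds mirroring tokenizer.py:

    _MIN_MIN_COUNT = 1     # assumed search bounds, as in tokenizer.py
    _MAX_MIN_COUNT = 1000

    def build_vocab(min_count):
        # Hypothetical: raising min_count prunes rare tokens, shrinking the vocab.
        return ["tok"] * max(0, 40000 - 600 * min_count)

    def bisect(min_val, max_val, target_size, threshold):
        """Binary-search min_count for a vocab size within threshold of target_size."""
        cur_count = (min_val + max_val) // 2
        vocab = build_vocab(cur_count)
        within = abs(len(vocab) - target_size) < threshold
        if within or min_val >= max_val or cur_count < 2:
            return vocab
        if len(vocab) > target_size:
            # Vocab too large: a higher min_count prunes more tokens.
            other = bisect(cur_count + 1, max_val, target_size, threshold)
        else:
            other = bisect(min_val, cur_count - 1, target_size, threshold)
        # Keep whichever candidate lands closer to the target.
        if abs(len(other) - target_size) < abs(len(vocab) - target_size):
            return other
        return vocab

    # The patched path: bisect(min_count, _MAX_MIN_COUNT) instead of building once.
    print(len(bisect(12, _MAX_MIN_COUNT, target_size=32768, threshold=327)))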