From 2c87299fb4d7b83db212759a77aeff0fbcb2ebd9 Mon Sep 17 00:00:00 2001
From: tremblerz
Date: Mon, 11 Jun 2018 18:13:11 -0700
Subject: [PATCH] Fix vocab file name and change vocabsize in params

---
 translation/tensorflow/process_data.py            |  2 +-
 .../tensorflow/transformer/model/model_params.py  |  2 +-
 .../tensorflow/transformer/utils/tokenizer.py     | 13 +++++++------
 3 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/translation/tensorflow/process_data.py b/translation/tensorflow/process_data.py
index 4781ef339..ac89be396 100644
--- a/translation/tensorflow/process_data.py
+++ b/translation/tensorflow/process_data.py
@@ -70,7 +70,7 @@
 # Vocabulary constants
 _TARGET_VOCAB_SIZE = 32768  # Number of subtokens in the vocabulary list.
 _TARGET_THRESHOLD = 327  # Accept vocabulary if size is within this threshold
-_VOCAB_FILE = "vocab.ende.%d" % _TARGET_VOCAB_SIZE
+_VOCAB_FILE = "vocab.ende."
 
 # Strings to inclue in the generated files.
 _PREFIX = "wmt32k"
diff --git a/translation/tensorflow/transformer/model/model_params.py b/translation/tensorflow/transformer/model/model_params.py
index aaa5bf26a..9b6cb53ba 100644
--- a/translation/tensorflow/transformer/model/model_params.py
+++ b/translation/tensorflow/transformer/model/model_params.py
@@ -23,7 +23,7 @@ class TransformerBaseParams(object):
 
   # Model params
   initializer_gain = 1.0  # Used in trainable variable initialization.
-  vocab_size = 33708  # Number of tokens defined in the vocabulary file.
+  vocab_size = 33945  # Number of tokens defined in the vocabulary file.
   hidden_size = 512  # Model dimension in the hidden layers.
   num_hidden_layers = 6  # Number of layers in the encoder and decoder stacks.
   num_heads = 8  # Number of heads to use in multi-headed attention.
diff --git a/translation/tensorflow/transformer/utils/tokenizer.py b/translation/tensorflow/transformer/utils/tokenizer.py
index 3e2140b72..69cf45379 100644
--- a/translation/tensorflow/transformer/utils/tokenizer.py
+++ b/translation/tensorflow/transformer/utils/tokenizer.py
@@ -117,6 +117,7 @@ def init_from_files(
           reserved_tokens)
       tf.logging.info("Generated vocabulary with %d subtokens." %
                       len(subtoken_list))
+      vocab_file += str(len(subtoken_list))
       _save_vocab_file(vocab_file, subtoken_list)
     return Subtokenizer(vocab_file)
 
@@ -393,12 +394,6 @@ def _generate_subtokens_with_target_vocab_size(
   if reserved_tokens is None:
     reserved_tokens = RESERVED_TOKENS
 
-  if min_count is not None:
-    tf.logging.info("Using min_count=%d to generate vocab with target size %d" %
-                    (min_count, target_size))
-    return _generate_subtokens(
-        token_counts, alphabet, min_count, reserved_tokens=reserved_tokens)
-
   def bisect(min_val, max_val):
     """Recursive function to binary search for subtoken vocabulary."""
     cur_count = (min_val + max_val) // 2
@@ -425,6 +420,12 @@ def bisect(min_val, max_val):
       return other_subtoken_list
     return subtoken_list
 
+  if min_count is not None:
+    tf.logging.info("Using min_count=%d to generate vocab with target size %d" %
+                    (min_count, target_size))
+    # Perform binary search with enforced minimum value of min_count
+    return bisect(min_count, _MAX_MIN_COUNT)
+
   tf.logging.info("Finding best min_count to get target size of %d" %
                   target_size)
   return bisect(_MIN_MIN_COUNT, _MAX_MIN_COUNT)
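
Note (not part of the patch): after this change the vocab file name is only a prefix, and init_from_files appends the actual number of generated subtokens before saving, so the file on disk becomes e.g. vocab.ende.33945 and must agree with vocab_size in model_params.py. A minimal sketch of that naming behaviour follows; the subtoken list and its length are fabricated for illustration and do not come from the real tokenizer, and RESERVED_TOKENS is assumed to hold the two reserved entries.

    # Sketch of the naming behaviour introduced above; plain Python, no TensorFlow needed.
    _VOCAB_FILE = "vocab.ende."  # prefix only; the target size is no longer baked into the name

    RESERVED_TOKENS = ["<pad>", "<EOS>"]                      # assumed reserved entries
    subtoken_list = RESERVED_TOKENS + ["tok_%d" % i for i in range(33943)]  # 33945 fake subtokens

    vocab_file = _VOCAB_FILE + str(len(subtoken_list))        # mirrors the added line in init_from_files
    print(vocab_file)                                         # vocab.ende.33945
    assert len(subtoken_list) == 33945                        # must match vocab_size in model_params.py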