From 875afb748c097a777c61c3f3e92007c9cfdbaca3 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 16 May 2019 18:29:16 +0000 Subject: [PATCH 01/77] fhfile --- TensorFlow/LanguageModeling/BERT/run_pretraining.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/run_pretraining.py b/TensorFlow/LanguageModeling/BERT/run_pretraining.py index 3da2c5506..f4742901b 100644 --- a/TensorFlow/LanguageModeling/BERT/run_pretraining.py +++ b/TensorFlow/LanguageModeling/BERT/run_pretraining.py @@ -491,8 +491,12 @@ def main(_): tf.gfile.MakeDirs(FLAGS.output_dir) input_files = [] - for input_pattern in FLAGS.input_file.split(","): - input_files.extend(tf.gfile.Glob(input_pattern)) + from gcloud.gcs import fhfile + if fhfile.IsDirectory(FLAGS.input_file): + input_files = list(fhfile.walk_path(FLAGS.input_file)) + else: + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info("*** Input Files ***") for input_file in input_files: From ad614a383224e1d9c036dad0d3ee3bc8af42fb16 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 16 May 2019 19:47:18 +0000 Subject: [PATCH 02/77] import walk path --- .../LanguageModeling/BERT/run_pretraining.py | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/run_pretraining.py b/TensorFlow/LanguageModeling/BERT/run_pretraining.py index f4742901b..bae0d7ce9 100644 --- a/TensorFlow/LanguageModeling/BERT/run_pretraining.py +++ b/TensorFlow/LanguageModeling/BERT/run_pretraining.py @@ -490,10 +490,40 @@ def main(_): tf.gfile.MakeDirs(FLAGS.output_dir) + def walk_path(location: str, + only_dir: bool = False, + depth: int = None, + extension: str = None): + """Walks through specified remote or local directory. + + Args: + location: local or remote directory to start walk. + only_dir: if True, only directories are yielded, + else only files. + depth: number of subdirectories to recursively walk through. + if unspecified, walk through all subdirectories. + extension: if specified, only files the end with this + extension are returned. + Yields: + local or remote path. 
+ + """ + for level, (root, dirs, file_names) in enumerate( + tf.gfile.Walk(top=location)): + if only_dir: + for dir_name in dirs: + yield os.path.join(root, dir_name) + else: + for file_name in file_names: + if extension and not file_name.endswith(extension): + continue + yield os.path.join(root, file_name) + if depth is not None and depth == level: + return + input_files = [] - from gcloud.gcs import fhfile - if fhfile.IsDirectory(FLAGS.input_file): - input_files = list(fhfile.walk_path(FLAGS.input_file)) + if tf.gfile.Exists(FLAGS.input_file): + input_files = list(walk_path(FLAGS.input_file)) else: for input_pattern in FLAGS.input_file.split(","): input_files.extend(tf.gfile.Glob(input_pattern)) From 1b61541f14b455b9eacddfbaefc381035042edaf Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 23 May 2019 18:47:24 +0000 Subject: [PATCH 03/77] turn on amp if fp16 --- TensorFlow/LanguageModeling/BERT/run_squad.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index ff0f7b940..2f56fceeb 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -158,6 +158,10 @@ flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.") flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") +if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') + # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): def __init__(self, global_batch_size, hvd_rank=-1): From bef3ff6d6a779223c4758879e2e476344981205f Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 01:31:26 +0000 Subject: [PATCH 04/77] use wrapper instead of flag --- TensorFlow/LanguageModeling/BERT/optimization.py | 8 +++++--- TensorFlow/LanguageModeling/BERT/run_squad.py | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py index a1b912a8e..728e72378 100644 --- a/TensorFlow/LanguageModeling/BERT/optimization.py +++ b/TensorFlow/LanguageModeling/BERT/optimization.py @@ -75,9 +75,11 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, if hvd is not None: from horovod.tensorflow.compression import Compression optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none) - if use_fp16 or amp: - loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) - optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + #if use_fp16 or amp: + if use_fp16: + #loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) + #optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') tvars = tf.trainable_variables() grads_and_vars = optimizer.compute_gradients(loss, tvars) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index 2f56fceeb..a996b3b63 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ 
b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -158,9 +158,9 @@ flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.") flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") -if FLAGS.use_fp16: - os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - print('Turning on AMP') +#if FLAGS.use_fp16: + #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + #print('Turning on AMP') # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): From fe58436b9a865280ac8339d4818f6888129c646c Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 02:46:38 +0000 Subject: [PATCH 05/77] switch back to nvidia --- TensorFlow/LanguageModeling/BERT/optimization.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py index 728e72378..8b4864aa5 100644 --- a/TensorFlow/LanguageModeling/BERT/optimization.py +++ b/TensorFlow/LanguageModeling/BERT/optimization.py @@ -75,11 +75,11 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, if hvd is not None: from horovod.tensorflow.compression import Compression optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none) - #if use_fp16 or amp: - if use_fp16: - #loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) - #optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) - optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') + if use_fp16 or amp: + #if use_fp16: + loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) + optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + #optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') tvars = tf.trainable_variables() grads_and_vars = optimizer.compute_gradients(loss, tvars) From 0e148185dac98a7805535f959ceb5eed275a3209 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 02:47:04 +0000 Subject: [PATCH 06/77] switch back to nvidia amp --- TensorFlow/LanguageModeling/BERT/run_squad.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index a996b3b63..2f56fceeb 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -158,9 +158,9 @@ flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.") flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") -#if FLAGS.use_fp16: - #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - #print('Turning on AMP') +if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): From ecb43379b42f542e5b039ad9658a1682e75f427a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 18:42:37 +0000 Subject: [PATCH 07/77] tf amp --- 
TensorFlow/LanguageModeling/BERT/optimization.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py index 8b4864aa5..728e72378 100644 --- a/TensorFlow/LanguageModeling/BERT/optimization.py +++ b/TensorFlow/LanguageModeling/BERT/optimization.py @@ -75,11 +75,11 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, if hvd is not None: from horovod.tensorflow.compression import Compression optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none) - if use_fp16 or amp: - #if use_fp16: - loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) - optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) - #optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') + #if use_fp16 or amp: + if use_fp16: + #loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) + #optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') tvars = tf.trainable_variables() grads_and_vars = optimizer.compute_gradients(loss, tvars) From 83fc718479fc5975b3279ff82c6dc895f78db28e Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 18:53:21 +0000 Subject: [PATCH 08/77] turn amp off --- TensorFlow/LanguageModeling/BERT/run_squad.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index 2f56fceeb..a996b3b63 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -158,9 +158,9 @@ flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.") flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") -if FLAGS.use_fp16: - os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - print('Turning on AMP') +#if FLAGS.use_fp16: + #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + #print('Turning on AMP') # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): From 94820a16fa6a3ee0c58d33472a8e3e382358463b Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 19:29:16 +0000 Subject: [PATCH 09/77] print --- TensorFlow/LanguageModeling/BERT/optimization.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py index 728e72378..e212c8560 100644 --- a/TensorFlow/LanguageModeling/BERT/optimization.py +++ b/TensorFlow/LanguageModeling/BERT/optimization.py @@ -79,6 +79,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, if use_fp16: #loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) #optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + print('wrapping with enable mixed precision graph rewrite') 
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') tvars = tf.trainable_variables() From 9ccaed7c91f3dd4d517628570894a40865387ec8 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 19:45:12 +0000 Subject: [PATCH 10/77] or amp --- TensorFlow/LanguageModeling/BERT/optimization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py index e212c8560..6371f5ca8 100644 --- a/TensorFlow/LanguageModeling/BERT/optimization.py +++ b/TensorFlow/LanguageModeling/BERT/optimization.py @@ -75,8 +75,8 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, if hvd is not None: from horovod.tensorflow.compression import Compression optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none) - #if use_fp16 or amp: - if use_fp16: + if use_fp16 or amp: + #if use_fp16: #loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) #optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) print('wrapping with enable mixed precision graph rewrite') From 651f03a7e40ddcef18c1ea0e37a3245b952da15a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 23:55:03 +0000 Subject: [PATCH 11/77] turn on TF_XLA_FLAGS=--tf_xla_cpu_global_jit --- TensorFlow/LanguageModeling/BERT/run_squad.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index a996b3b63..ebef9a57a 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -162,6 +162,10 @@ #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" #print('Turning on AMP') +if FLAGS.horovod: + print('Turning on TF_XLA_FLAGS=--tf_xla_cpu_global_jit') + os.environ["TF_XLA_FLAGS"] = "--tf_xla_cpu_global_jit" + # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): def __init__(self, global_batch_size, hvd_rank=-1): From 77055481683b9d5344ae55cf3473bc62ba31dd1b Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 30 May 2019 01:39:31 +0000 Subject: [PATCH 12/77] use built in amp --- TensorFlow/LanguageModeling/BERT/optimization.py | 8 ++++---- TensorFlow/LanguageModeling/BERT/run_squad.py | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py index 6371f5ca8..bfa7b0dd0 100644 --- a/TensorFlow/LanguageModeling/BERT/optimization.py +++ b/TensorFlow/LanguageModeling/BERT/optimization.py @@ -77,10 +77,10 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none) if use_fp16 or amp: #if use_fp16: - #loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) - #optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) - print('wrapping with enable mixed precision graph rewrite') - optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') + 
loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) + optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + #print('wrapping with enable mixed precision graph rewrite') + #optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') tvars = tf.trainable_variables() grads_and_vars = optimizer.compute_gradients(loss, tvars) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index ebef9a57a..ba45d923f 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -158,13 +158,15 @@ flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.") flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") -#if FLAGS.use_fp16: - #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - #print('Turning on AMP') +if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') +''' if FLAGS.horovod: print('Turning on TF_XLA_FLAGS=--tf_xla_cpu_global_jit') os.environ["TF_XLA_FLAGS"] = "--tf_xla_cpu_global_jit" +''' # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): From 8b36ab87cb3a70ec458dbbd4d1698dab8497a30b Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Tue, 4 Jun 2019 03:04:49 +0000 Subject: [PATCH 13/77] fine tune run file --- .../LanguageModeling/BERT/finetune_BERT.py | 1014 +++++++++++++++++ 1 file changed, 1014 insertions(+) create mode 100644 TensorFlow/LanguageModeling/BERT/finetune_BERT.py diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py new file mode 100644 index 000000000..0714d6a9b --- /dev/null +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -0,0 +1,1014 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import csv +import os +import modeling +import optimization +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. 
" + "This specifies the model architecture.") + +flags.DEFINE_string("task_name", None, "The name of the task to train.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool( + "do_predict", False, + "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.") + +flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. 
This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class PaddingInputExample(object): + """Fake example so the num input examples is a multiple of the batch size. + + When running eval/predict on the TPU, we need to pad the number of examples + to be a multiple of the batch size, because the TPU requires a fixed batch + size. The alternative is to drop the last batch, which is bad because it means + the entire output data won't be generated. + + We use this class instead of `None` because treating `None` as padding + battches could cause silent errors. + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + is_real_example=True): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.is_real_example = is_real_example + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + +class XnliProcessor(DataProcessor): + """Processor for the XNLI data set.""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv( + os.path.join(data_dir, "multinli", + "multinli.train.%s.tsv" % self.language)) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "train-%d" % (i) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[2]) + if label == tokenization.convert_to_unicode("contradictory"): + label = tokenization.convert_to_unicode("contradiction") + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_dev_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % (i) + language = tokenization.convert_to_unicode(line[0]) + if language != tokenization.convert_to_unicode(self.language): + continue + text_a = tokenization.convert_to_unicode(line[6]) + text_b = tokenization.convert_to_unicode(line[7]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE 
version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) + text_a = tokenization.convert_to_unicode(line[8]) + text_b = tokenization.convert_to_unicode(line[9]) + if set_type == "test": + label = "contradiction" + else: + label = tokenization.convert_to_unicode(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = tokenization.convert_to_unicode(line[4]) + if set_type == "test": + label = "0" + else: + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # Only the test set has a header + if set_type == "test" and i == 0: + continue + guid = "%s-%s" % (set_type, i) + if set_type == "test": + text_a = tokenization.convert_to_unicode(line[1]) + label = "0" + else: + text_a = tokenization.convert_to_unicode(line[3]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, 
text_a=text_a, text_b=None, label=label)) + return examples + + +class DummyProcessor(DataProcessor): + + def get_train_examples(self): + return self._create_examples(10, 'train') + + def get_dev_examples(self): + return self._create_examples(2, 'dev') + + def get_test_examples(self): + return self._create_examples(2, 'test') + + def get_labels(self): + return ["0", "1"] + + def _create_examples(self, num_lines, set_type): + examples = [] + for i in range(num_lines): + guid = "%s-%d" % (set_type, i) + examples.append(InputExample(guid=guid, text_a='dummy dummy, dummy', label='0')) + return examples + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False) + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([], tf.int64), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. 
+ d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings, + compute_type=tf.float16 if FLAGS.use_fp16 else tf.float32) + + last_layer = model.get_sequence_output() + extended_batch_size = tf.shape(last_layer)[0] + chunk_size = tf.shape(last_layer)[1] + depth = tf.shape(last_layer)[2] + batch_size = extended_batch_size / chunk_size + + body_outputs = tf.reshape(last_layer, [batch_size, extended_batch_batch_size, depth]) + + return target_modality.top(body_outputs) + ''' + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use model.get_sequence_output() + # instead. 
+ output_layer = model.get_pooled_output() + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + #logits = tf.matmul(output_layer, output_weights, transpose_b=True) + #logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + ''' + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + #(total_loss, per_example_loss, logits, probabilities) = create_model( + #bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + #num_labels, use_one_hot_embeddings) + (total_loss, logits) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu, + None, FLAGS.use_fp16) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + ''' + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, weights=is_real_example) + 
loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, + [per_example_loss, label_ids, logits, is_real_example]) + ''' + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + #eval_metrics=eval_metrics, + eval_metrics=problem.eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + predictions={"probabilities": probabilities}, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. + d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d + + return input_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. 
+def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer): + """Convert a set of `InputExample`s to a list of `InputFeatures`.""" + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + features.append(feature) + return features + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mrpc": MrpcProcessor, + "xnli": XnliProcessor, + "dummy": DummyProcessor + } + + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: + raise ValueError( + "At least one of `do_train`, `do_eval` or `do_predict' must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + #task_name = FLAGS.task_name.lower() + + task_name = 'dummy' + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + config = tf.ConfigProto() + if FLAGS.use_xla: + config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + session_config=config, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + #train_examples = processor.get_train_examples(FLAGS.data_dir) + train_examples = processor.get_train_examples() + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
+ from tensor2tensor.utils.trainer_lib import create_hparams, add_problem_hparams + import fathomt2t + from fathomt2t.common_flags import setup_dataset_flag + import fathomt2t.problems.fprecord_text_problem + print('FLAGS', FLAGS) + print('code mapping file', FLAGS.code_mapping_file) + problem_name = 'icd10_diagnosis_hcpcs_coding_problem_with_hints' + hparams_set = 'fh_transformer_tiny_multi_hints_4_layers_flat_lr_min_length' + setup_dataset_flag() + FLAGS.dataset_split = 'train' + hparams = create_hparams(hparams_set=hparams_set, problem_name=problem_name) + #problem = registry.problem(problem_name) + add_problem_hparams(hparams, problem_name) + target_modality = hparams.target_modality + problem = hparams.problem + + #from tensor2tensor.bin.t2t_trainer import create_run_config + #run_config = create_run_config(hparams) + estimator = tf.contrib.tpu.TPUEstimator( + #estimator = tf.estimator.Estimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) + ''' + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + ''' + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + #eval_examples = processor.get_dev_examples(FLAGS.data_dir) + eval_examples = processor.get_dev_examples() + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(eval_examples), num_actual_eval_examples, + len(eval_examples) - num_actual_eval_examples) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. 
+ if FLAGS.use_tpu: + assert len(eval_examples) % FLAGS.eval_batch_size == 0 + eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) + + eval_drop_remainder = True if FLAGS.use_tpu else False + + eval_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.EVAL, hparams) + ''' + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + ''' + + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +if __name__ == "__main__": + #flags.mark_flag_as_required("data_dir") + #flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() From 4750f0c146bb8b5f81dcab94db13ae4c835f3653 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Wed, 5 Jun 2019 00:05:05 +0000 Subject: [PATCH 14/77] clean up bert finetune --- .../LanguageModeling/BERT/finetune_BERT.py | 306 +++--------------- 1 file changed, 42 insertions(+), 264 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 0714d6a9b..2ffd48098 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -1,17 +1,3 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
"""BERT finetuning runner.""" from __future__ import absolute_import @@ -127,7 +113,6 @@ flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") - class InputExample(object): """A single training/test example for simple sequence classification.""" @@ -208,176 +193,6 @@ def _read_tsv(cls, input_file, quotechar=None): return lines -class XnliProcessor(DataProcessor): - """Processor for the XNLI data set.""" - - def __init__(self): - self.language = "zh" - - def get_train_examples(self, data_dir): - """See base class.""" - lines = self._read_tsv( - os.path.join(data_dir, "multinli", - "multinli.train.%s.tsv" % self.language)) - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "train-%d" % (i) - text_a = tokenization.convert_to_unicode(line[0]) - text_b = tokenization.convert_to_unicode(line[1]) - label = tokenization.convert_to_unicode(line[2]) - if label == tokenization.convert_to_unicode("contradictory"): - label = tokenization.convert_to_unicode("contradiction") - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_dev_examples(self, data_dir): - """See base class.""" - lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "dev-%d" % (i) - language = tokenization.convert_to_unicode(line[0]) - if language != tokenization.convert_to_unicode(self.language): - continue - text_a = tokenization.convert_to_unicode(line[6]) - text_b = tokenization.convert_to_unicode(line[7]) - label = tokenization.convert_to_unicode(line[1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) - text_a = tokenization.convert_to_unicode(line[8]) - text_b = tokenization.convert_to_unicode(line[9]) - if set_type == "test": - label = "contradiction" - else: - label = tokenization.convert_to_unicode(line[-1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, 
"dev.tsv")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = tokenization.convert_to_unicode(line[3]) - text_b = tokenization.convert_to_unicode(line[4]) - if set_type == "test": - label = "0" - else: - label = tokenization.convert_to_unicode(line[0]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - # Only the test set has a header - if set_type == "test" and i == 0: - continue - guid = "%s-%s" % (set_type, i) - if set_type == "test": - text_a = tokenization.convert_to_unicode(line[1]) - label = "0" - else: - text_a = tokenization.convert_to_unicode(line[3]) - label = tokenization.convert_to_unicode(line[1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - class DummyProcessor(DataProcessor): def get_train_examples(self): @@ -598,8 +413,11 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, - labels, num_labels, use_one_hot_embeddings): + labels, num_labels, use_one_hot_embeddings, hparams): """Creates a classification model.""" + target_modality = hparams.problem_hparams.target_modality + input_modality = hparams.problem_hparams.input_modality + model = modeling.BertModel( config=bert_config, is_training=is_training, @@ -609,54 +427,31 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float16 if FLAGS.use_fp16 else tf.float32) - last_layer = model.get_sequence_output() - extended_batch_size = tf.shape(last_layer)[0] - chunk_size = tf.shape(last_layer)[1] - depth = tf.shape(last_layer)[2] - batch_size = extended_batch_size / chunk_size - - body_outputs = tf.reshape(last_layer, [batch_size, extended_batch_batch_size, depth]) - - return target_modality.top(body_outputs) - ''' - # In the demo, we are doing a simple classification task on the entire - # segment. - # - # If you want to use the token-level output, use model.get_sequence_output() - # instead. 
- output_layer = model.get_pooled_output() - - hidden_size = output_layer.shape[-1].value + # [B, 384, D] + body_outputs = model.get_sequence_output() + #extended_batch_size = tf.shape(body_outputs)[0] + #chunk_size = tf.shape(body_outputs)[1] + #depth = tf.shape(body_outputs)[2] + #batch_size = extended_batch_size / chunk_size - output_weights = tf.get_variable( - "output_weights", [num_labels, hidden_size], - initializer=tf.truncated_normal_initializer(stddev=0.02)) + #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) - output_bias = tf.get_variable( - "output_bias", [num_labels], initializer=tf.zeros_initializer()) - - with tf.variable_scope("loss"): - if is_training: - # I.e., 0.1 dropout - output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) - - #logits = tf.matmul(output_layer, output_weights, transpose_b=True) - #logits = tf.nn.bias_add(logits, output_bias) - probabilities = tf.nn.softmax(logits, axis=-1) - log_probs = tf.nn.log_softmax(logits, axis=-1) - - one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) - - per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) - loss = tf.reduce_mean(per_example_loss) + body_outputs = tf.expand_dims(body_outputs, axis=-2) + features = { + 'targets': labels + } + labels = tf.reshape(labels, [tf.shape(labels)[0], 1, 1, 1]) + top_out = target_modality.top(body_outputs, features) + num, den = target_modality.loss(top_out, labels) + print('num, den', num, den) + loss = num / den - return (loss, per_example_loss, logits, probabilities) - ''' + return loss, top_out['logits'] def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, - use_one_hot_embeddings): + use_one_hot_embeddings, hparams): """Returns `model_fn` closure for TPUEstimator.""" def model_fn(features, labels, mode, params): # pylint: disable=unused-argument @@ -678,12 +473,9 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument is_training = (mode == tf.estimator.ModeKeys.TRAIN) - #(total_loss, per_example_loss, logits, probabilities) = create_model( - #bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, - #num_labels, use_one_hot_embeddings) (total_loss, logits) = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, - num_labels, use_one_hot_embeddings) + num_labels, use_one_hot_embeddings, hparams) tvars = tf.trainable_variables() initialized_variable_names = {} @@ -722,24 +514,9 @@ def tpu_scaffold(): train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: - ''' - def metric_fn(per_example_loss, label_ids, logits, is_real_example): - predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) - accuracy = tf.metrics.accuracy( - labels=label_ids, predictions=predictions, weights=is_real_example) - loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) - return { - "eval_accuracy": accuracy, - "eval_loss": loss, - } - - eval_metrics = (metric_fn, - [per_example_loss, label_ids, logits, is_real_example]) - ''' output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, - #eval_metrics=eval_metrics, eval_metrics=problem.eval_metrics, scaffold_fn=scaffold_fn) else: @@ -828,10 +605,6 @@ def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mrpc": MrpcProcessor, - "xnli": XnliProcessor, "dummy": DummyProcessor } @@ -896,16 +669,6 @@ def 
main(_): len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) - model_fn = model_fn_builder( - bert_config=bert_config, - num_labels=len(label_list), - init_checkpoint=FLAGS.init_checkpoint, - learning_rate=FLAGS.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps, - use_tpu=FLAGS.use_tpu, - use_one_hot_embeddings=FLAGS.use_tpu) - # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. from tensor2tensor.utils.trainer_lib import create_hparams, add_problem_hparams @@ -915,7 +678,7 @@ def main(_): print('FLAGS', FLAGS) print('code mapping file', FLAGS.code_mapping_file) problem_name = 'icd10_diagnosis_hcpcs_coding_problem_with_hints' - hparams_set = 'fh_transformer_tiny_multi_hints_4_layers_flat_lr_min_length' + hparams_set = 'finetune_bert' setup_dataset_flag() FLAGS.dataset_split = 'train' hparams = create_hparams(hparams_set=hparams_set, problem_name=problem_name) @@ -924,6 +687,21 @@ def main(_): target_modality = hparams.target_modality problem = hparams.problem + if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu, + hparams=hparams) + #from tensor2tensor.bin.t2t_trainer import create_run_config #run_config = create_run_config(hparams) estimator = tf.contrib.tpu.TPUEstimator( @@ -943,14 +721,14 @@ def main(_): tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) - train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) - ''' + #train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) + #''' train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) - ''' + #''' estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: From f770029e872fd54fec9cce3dfde3778e77f189be Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 01:58:58 +0000 Subject: [PATCH 15/77] clean up and hvd fp16 --- .../LanguageModeling/BERT/finetune_BERT.py | 414 ++++-------------- 1 file changed, 87 insertions(+), 327 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 2ffd48098..f6a82a735 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -9,8 +9,9 @@ import os import modeling import optimization -import tokenization import tensorflow as tf +import horovod.tensorflow as hvd +import time flags = tf.flags @@ -36,6 +37,8 @@ "output_dir", None, "The output directory where the model checkpoints will be written.") +flags.DEFINE_string("tmp_dir", None, '') + ## Other parameters flags.DEFINE_string( @@ -68,6 +71,7 @@ flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") +flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs") 
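# NOTE: the _LogEvalRunHook/_LogTrainRunHook classes added further down in this
# patch accumulate per-step wall time (total_time, count) but never report it.
# The snippet below is only an illustrative sketch of how such a hook could
# surface throughput at the end of a run; the end() method and the log message
# are editorial assumptions, not code from this patch series.
import time
import tensorflow as tf

class _ThroughputReportHook(tf.train.SessionRunHook):
  """Hypothetical hook: logs average samples/sec when the session ends."""

  def __init__(self, global_batch_size):
    self.global_batch_size = global_batch_size
    self.total_time = 0.0
    self.count = 0

  def before_run(self, run_context):
    self.t0 = time.time()

  def after_run(self, run_context, run_values):
    self.total_time += time.time() - self.t0
    self.count += 1

  def end(self, session):
    # Average over all recorded steps; skip if nothing was timed.
    if self.total_time > 0 and self.count > 0:
      tf.logging.info("throughput: %.1f samples/sec",
                      self.count * self.global_batch_size / self.total_time)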
flags.DEFINE_float("num_train_epochs", 3.0, "Total number of training epochs to perform.") @@ -113,6 +117,39 @@ flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") +# report samples/sec, total loss and learning rate during training +class _LogEvalRunHook(tf.train.SessionRunHook): + def __init__(self, global_batch_size, hvd_rank=-1): + self.global_batch_size = global_batch_size + self.hvd_rank = hvd_rank + self.total_time = 0.0 + self.count = 0 + + def before_run(self, run_context): + self.t0 = time.time() + + def after_run(self, run_context, run_values): + elapsed_secs = time.time() - self.t0 + self.total_time += elapsed_secs + self.count += 1 + +# report samples/sec, total loss and learning rate during training +class _LogTrainRunHook(tf.train.SessionRunHook): + def __init__(self, global_batch_size, hvd_rank=-1): + self.global_batch_size = global_batch_size + self.hvd_rank = hvd_rank + self.total_time = 0.0 + self.count = 0 + + def before_run(self, run_context): + self.t0 = time.time() + return tf.train.SessionRunArgs( + fetches=['step_update:0']) + def after_run(self, run_context, run_values): + elapsed_secs = time.time() - self.t0 + self.total_time += elapsed_secs + self.count += 1 + class InputExample(object): """A single training/test example for simple sequence classification.""" @@ -215,186 +252,6 @@ def _create_examples(self, num_lines, set_type): return examples -def convert_single_example(ex_index, example, label_list, max_seq_length, - tokenizer): - """Converts a single `InputExample` into a single `InputFeatures`.""" - - if isinstance(example, PaddingInputExample): - return InputFeatures( - input_ids=[0] * max_seq_length, - input_mask=[0] * max_seq_length, - segment_ids=[0] * max_seq_length, - label_id=0, - is_real_example=False) - - label_map = {} - for (i, label) in enumerate(label_list): - label_map[label] = i - - tokens_a = tokenizer.tokenize(example.text_a) - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[0:(max_seq_length - 2)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. 
- tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - - if tokens_b: - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - label_id = label_map[example.label] - if ex_index < 5: - tf.logging.info("*** Example ***") - tf.logging.info("guid: %s" % (example.guid)) - tf.logging.info("tokens: %s" % " ".join( - [tokenization.printable_text(x) for x in tokens])) - tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) - - feature = InputFeatures( - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id, - is_real_example=True) - return feature - - -def file_based_convert_examples_to_features( - examples, label_list, max_seq_length, tokenizer, output_file): - """Convert a set of `InputExample`s to a TFRecord file.""" - - writer = tf.python_io.TFRecordWriter(output_file) - - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - feature = convert_single_example(ex_index, example, label_list, - max_seq_length, tokenizer) - - def create_int_feature(values): - f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) - return f - - features = collections.OrderedDict() - features["input_ids"] = create_int_feature(feature.input_ids) - features["input_mask"] = create_int_feature(feature.input_mask) - features["segment_ids"] = create_int_feature(feature.segment_ids) - features["label_ids"] = create_int_feature([feature.label_id]) - features["is_real_example"] = create_int_feature( - [int(feature.is_real_example)]) - - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - writer.write(tf_example.SerializeToString()) - writer.close() - - -def file_based_input_fn_builder(input_file, seq_length, is_training, - drop_remainder): - """Creates an `input_fn` closure to be passed to TPUEstimator.""" - - name_to_features = { - "input_ids": tf.FixedLenFeature([seq_length], tf.int64), - "input_mask": tf.FixedLenFeature([seq_length], tf.int64), - "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), - "label_ids": tf.FixedLenFeature([], tf.int64), - "is_real_example": tf.FixedLenFeature([], tf.int64), - } - - def _decode_record(record, name_to_features): - """Decodes a record to a TensorFlow example.""" - example = tf.parse_single_example(record, name_to_features) - - # tf.Example only supports tf.int64, but the TPU only supports tf.int32. - # So cast all int64 to int32. 
- for name in list(example.keys()): - t = example[name] - if t.dtype == tf.int64: - t = tf.to_int32(t) - example[name] = t - - return example - - def input_fn(params): - """The actual input function.""" - batch_size = params["batch_size"] - - # For training, we want a lot of parallel reading and shuffling. - # For eval, we want no shuffling and parallel reading doesn't matter. - d = tf.data.TFRecordDataset(input_file) - if is_training: - d = d.repeat() - d = d.shuffle(buffer_size=100) - - d = d.apply( - tf.contrib.data.map_and_batch( - lambda record: _decode_record(record, name_to_features), - batch_size=batch_size, - drop_remainder=drop_remainder)) - - return d - - return input_fn - - def _truncate_seq_pair(tokens_a, tokens_b, max_length): """Truncates a sequence pair in place to the maximum length.""" @@ -441,7 +298,9 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 'targets': labels } labels = tf.reshape(labels, [tf.shape(labels)[0], 1, 1, 1]) + top_out = target_modality.top(body_outputs, features) + num, den = target_modality.loss(top_out, labels) print('num, den', num, den) loss = num / den @@ -451,7 +310,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, - use_one_hot_embeddings, hparams): + use_one_hot_embeddings, hparams, hvd=None, use_fp16=False): """Returns `model_fn` closure for TPUEstimator.""" def model_fn(features, labels, mode, params): # pylint: disable=unused-argument @@ -464,12 +323,7 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] - label_ids = features["label_ids"] - is_real_example = None - if "is_real_example" in features: - is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) - else: - is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + label_ids = features["targets"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) @@ -480,7 +334,7 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None - if init_checkpoint: + if init_checkpoint and (hvd is None or hvd.rank() == 0): (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: @@ -498,15 +352,14 @@ def tpu_scaffold(): init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, - init_string) + tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu, - None, FLAGS.use_fp16) + hvd, amp=use_fp16) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, @@ -529,87 +382,18 @@ def tpu_scaffold(): return model_fn -# This function is not used by this file but is still used by the Colab and -# people who depend on it. 
-def input_fn_builder(features, seq_length, is_training, drop_remainder): - """Creates an `input_fn` closure to be passed to TPUEstimator.""" - - all_input_ids = [] - all_input_mask = [] - all_segment_ids = [] - all_label_ids = [] - - for feature in features: - all_input_ids.append(feature.input_ids) - all_input_mask.append(feature.input_mask) - all_segment_ids.append(feature.segment_ids) - all_label_ids.append(feature.label_id) - - def input_fn(params): - """The actual input function.""" - batch_size = params["batch_size"] - - num_examples = len(features) - - # This is for demo purposes and does NOT scale to large data sets. We do - # not use Dataset.from_generator() because that uses tf.py_func which is - # not TPU compatible. The right way to load data is with TFRecordReader. - d = tf.data.Dataset.from_tensor_slices({ - "input_ids": - tf.constant( - all_input_ids, shape=[num_examples, seq_length], - dtype=tf.int32), - "input_mask": - tf.constant( - all_input_mask, - shape=[num_examples, seq_length], - dtype=tf.int32), - "segment_ids": - tf.constant( - all_segment_ids, - shape=[num_examples, seq_length], - dtype=tf.int32), - "label_ids": - tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), - }) - - if is_training: - d = d.repeat() - d = d.shuffle(buffer_size=100) - - d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) - return d - - return input_fn - - -# This function is not used by this file but is still used by the Colab and -# people who depend on it. -def convert_examples_to_features(examples, label_list, max_seq_length, - tokenizer): - """Convert a set of `InputExample`s to a list of `InputFeatures`.""" - - features = [] - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - feature = convert_single_example(ex_index, example, label_list, - max_seq_length, tokenizer) - - features.append(feature) - return features - - def main(_): tf.logging.set_verbosity(tf.logging.INFO) + if FLAGS.horovod: + hvd.init() + processors = { "dummy": DummyProcessor } - tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, - FLAGS.init_checkpoint) + #tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + #FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( @@ -636,14 +420,27 @@ def main(_): label_list = processor.get_labels() - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + training_hooks = [] + global_batch_size = FLAGS.train_batch_size + hvd_rank = 0 tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + if FLAGS.horovod: + tf.logging.info("Multi-GPU training with TF Horovod") + tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank()) + global_batch_size = FLAGS.train_batch_size * hvd.size() + learning_rate = learning_rate * hvd.size() + master_process = (hvd.rank() == 0) + hvd_rank = hvd.rank() + config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = str(hvd.local_rank()) + if hvd.size() > 1: + training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) + config = tf.ConfigProto() if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 @@ -654,6 +451,7 @@ def main(_): model_dir=FLAGS.output_dir, 
session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps, + log_step_count_steps=1, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, @@ -663,11 +461,8 @@ def main(_): num_train_steps = None num_warmup_steps = None if FLAGS.do_train: - #train_examples = processor.get_train_examples(FLAGS.data_dir) - train_examples = processor.get_train_examples() - num_train_steps = int( - len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) - num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + num_train_steps = 1000 + num_warmup_steps = 1 # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. @@ -677,19 +472,24 @@ def main(_): import fathomt2t.problems.fprecord_text_problem print('FLAGS', FLAGS) print('code mapping file', FLAGS.code_mapping_file) - problem_name = 'icd10_diagnosis_hcpcs_coding_problem_with_hints' + #problem_name = 'icd10_diagnosis_hcpcs_coding_problem_with_hints' + problem_name = 'bert_problem' hparams_set = 'finetune_bert' setup_dataset_flag() FLAGS.dataset_split = 'train' + if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') + hparams = create_hparams(hparams_set=hparams_set, problem_name=problem_name) #problem = registry.problem(problem_name) add_problem_hparams(hparams, problem_name) target_modality = hparams.target_modality problem = hparams.problem - if FLAGS.use_fp16: - os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - print('Turning on AMP') + hparams.data_dir = '/usr/src/bert/scratch_data_dir' + ## INGEST + problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir) model_fn = model_fn_builder( bert_config=bert_config, @@ -700,7 +500,9 @@ def main(_): num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, - hparams=hparams) + hparams=hparams, + hvd=None if not FLAGS.horovod else hvd, + use_fp16=FLAGS.use_fp16) #from tensor2tensor.bin.t2t_trainer import create_run_config #run_config = create_run_config(hparams) @@ -714,65 +516,23 @@ def main(_): predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: - train_file = os.path.join(FLAGS.output_dir, "train.tf_record") - file_based_convert_examples_to_features( - train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") - tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) - #train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) - #''' - train_input_fn = file_based_input_fn_builder( - input_file=train_file, - seq_length=FLAGS.max_seq_length, - is_training=True, - drop_remainder=True) - #''' - estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) + training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) + estimator.train( + input_fn=train_input_fn, + hooks=training_hooks, + max_steps=num_train_steps) if FLAGS.do_eval: - #eval_examples = processor.get_dev_examples(FLAGS.data_dir) - eval_examples = processor.get_dev_examples() - num_actual_eval_examples = len(eval_examples) - if FLAGS.use_tpu: - # TPU requires a fixed batch size for all batches, therefore the number - # of examples must be a multiple of the batch size, or else examples - # will get 
dropped. So we pad with fake examples which are ignored - # later on. These do NOT count towards the metric (all tf.metrics - # support a per-instance weight, and these get a weight of 0.0). - while len(eval_examples) % FLAGS.eval_batch_size != 0: - eval_examples.append(PaddingInputExample()) - - eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") - file_based_convert_examples_to_features( - eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) - tf.logging.info("***** Running evaluation *****") - tf.logging.info(" Num examples = %d (%d actual, %d padding)", - len(eval_examples), num_actual_eval_examples, - len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) - # This tells the estimator to run through the entire set. - eval_steps = None - # However, if running eval on the TPU, you will need to specify the - # number of steps. - if FLAGS.use_tpu: - assert len(eval_examples) % FLAGS.eval_batch_size == 0 - eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) - - eval_drop_remainder = True if FLAGS.use_tpu else False - eval_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.EVAL, hparams) - ''' - eval_input_fn = file_based_input_fn_builder( - input_file=eval_file, - seq_length=FLAGS.max_seq_length, - is_training=False, - drop_remainder=eval_drop_remainder) - ''' + eval_steps = 1000 result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") From 7f14f2cd4bb2a9af57650d1c9b99b4b9e669b7f3 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 18:34:13 +0000 Subject: [PATCH 16/77] hparams data dir --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index f6a82a735..ce11522bf 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -487,7 +487,7 @@ def main(_): target_modality = hparams.target_modality problem = hparams.problem - hparams.data_dir = '/usr/src/bert/scratch_data_dir' + hparams.data_dir = 'gs://fathom-dev-210618-workspace-h5bnpfec/FINETUNE_BERT/2019-06-06T07-57-40.391508+00-00/usr.src.bert.finetune_BERT.py/data_dir' ## INGEST problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir) From 09f7c025091bf0452150d1f98620cf453abf84c7 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 18:48:54 +0000 Subject: [PATCH 17/77] clean up and move amp --- .../LanguageModeling/BERT/finetune_BERT.py | 125 +----------------- 1 file changed, 4 insertions(+), 121 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index ce11522bf..d1f721086 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -117,6 +117,10 @@ flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") +if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') + # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): def __init__(self, global_batch_size, hvd_rank=-1): @@ -150,124 +154,6 @@ def after_run(self, run_context, run_values): self.total_time += elapsed_secs self.count += 1 -class InputExample(object): - """A single training/test 
example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class PaddingInputExample(object): - """Fake example so the num input examples is a multiple of the batch size. - - When running eval/predict on the TPU, we need to pad the number of examples - to be a multiple of the batch size, because the TPU requires a fixed batch - size. The alternative is to drop the last batch, which is bad because it means - the entire output data won't be generated. - - We use this class instead of `None` because treating `None` as padding - battches could cause silent errors. - """ - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - input_ids, - input_mask, - segment_ids, - label_id, - is_real_example=True): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - self.is_real_example = is_real_example - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_test_examples(self, data_dir): - """Gets a collection of `InputExample`s for prediction.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with tf.gfile.Open(input_file, "r") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - lines.append(line) - return lines - - -class DummyProcessor(DataProcessor): - - def get_train_examples(self): - return self._create_examples(10, 'train') - - def get_dev_examples(self): - return self._create_examples(2, 'dev') - - def get_test_examples(self): - return self._create_examples(2, 'test') - - def get_labels(self): - return ["0", "1"] - - def _create_examples(self, num_lines, set_type): - examples = [] - for i in range(num_lines): - guid = "%s-%d" % (set_type, i) - examples.append(InputExample(guid=guid, text_a='dummy dummy, dummy', label='0')) - return examples - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. 
- while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings, hparams): @@ -477,9 +363,6 @@ def main(_): hparams_set = 'finetune_bert' setup_dataset_flag() FLAGS.dataset_split = 'train' - if FLAGS.use_fp16: - os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - print('Turning on AMP') hparams = create_hparams(hparams_set=hparams_set, problem_name=problem_name) #problem = registry.problem(problem_name) From ac331abadea67eb491a2b1a562fe610cb5cf24ff Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 18:55:06 +0000 Subject: [PATCH 18/77] clean --- .../LanguageModeling/BERT/finetune_BERT.py | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index d1f721086..6856b939f 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -117,9 +117,6 @@ flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") -if FLAGS.use_fp16: - os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - print('Turning on AMP') # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): @@ -271,16 +268,13 @@ def tpu_scaffold(): def main(_): tf.logging.set_verbosity(tf.logging.INFO) + if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') + if FLAGS.horovod: hvd.init() - processors = { - "dummy": DummyProcessor - } - - #tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, - #FLAGS.init_checkpoint) - if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True.") @@ -295,17 +289,6 @@ def main(_): tf.gfile.MakeDirs(FLAGS.output_dir) - #task_name = FLAGS.task_name.lower() - - task_name = 'dummy' - - if task_name not in processors: - raise ValueError("Task not found: %s" % (task_name)) - - processor = processors[task_name]() - - label_list = processor.get_labels() - training_hooks = [] global_batch_size = FLAGS.train_batch_size hvd_rank = 0 @@ -376,7 +359,7 @@ def main(_): model_fn = model_fn_builder( bert_config=bert_config, - num_labels=len(label_list), + num_labels=10, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, From a29bf2b6f71ee509a6b501ae3ae34b444b0e90ce Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 20:28:47 +0000 Subject: [PATCH 19/77] use classifier last layer --- .../LanguageModeling/BERT/finetune_BERT.py | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 6856b939f..d794f9629 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -175,7 +175,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, #batch_size = extended_batch_size / chunk_size #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) - + ''' body_outputs = tf.expand_dims(body_outputs, axis=-2) features 
= { 'targets': labels @@ -185,10 +185,33 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, top_out = target_modality.top(body_outputs, features) num, den = target_modality.loss(top_out, labels) - print('num, den', num, den) loss = num / den return loss, top_out['logits'] + ''' + hidden_size = body_outputs.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + body_outputs = tf.nn.dropout(body_outputs, keep_prob=0.9) + + logits = tf.matmul(body_outputs, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return loss, logits def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, From adcda89694eaa9d7fed9ed606e15f0321ba0fea4 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 20:35:27 +0000 Subject: [PATCH 20/77] fix --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index d794f9629..f0131c124 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -189,6 +189,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, return loss, top_out['logits'] ''' + body_outputs = model.get_pooled_output() hidden_size = body_outputs.shape[-1].value output_weights = tf.get_variable( From a7e761e069ed6a48651d174703657825694a04ef Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 20:51:28 +0000 Subject: [PATCH 21/77] our top --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index f0131c124..14e819f29 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -175,7 +175,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, #batch_size = extended_batch_size / chunk_size #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) - ''' + #''' body_outputs = tf.expand_dims(body_outputs, axis=-2) features = { 'targets': labels @@ -213,6 +213,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) loss = tf.reduce_mean(per_example_loss) return loss, logits + ''' def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, From 07c500cec7d37add95d4b350be396991c4864b9a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 21:26:02 +0000 Subject: [PATCH 22/77] lr --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 14e819f29..fca7122c8 100644 --- 
a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -323,6 +323,7 @@ def main(_): tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + learning_rate = FLAGS.learning_rate if FLAGS.horovod: tf.logging.info("Multi-GPU training with TF Horovod") tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank()) @@ -386,7 +387,7 @@ def main(_): bert_config=bert_config, num_labels=10, init_checkpoint=FLAGS.init_checkpoint, - learning_rate=FLAGS.learning_rate, + learning_rate=learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, From 0215da099a4f62c8d47bed750ee24952ef6e434e Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 21:29:07 +0000 Subject: [PATCH 23/77] init config earlier --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index fca7122c8..8957bd96f 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -323,6 +323,7 @@ def main(_): tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + config = tf.ConfigProto() learning_rate = FLAGS.learning_rate if FLAGS.horovod: tf.logging.info("Multi-GPU training with TF Horovod") @@ -336,7 +337,6 @@ def main(_): if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) - config = tf.ConfigProto() if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 From 65443d0981a7849c8b79c9d2ec26121d9391cd57 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 22:37:41 +0000 Subject: [PATCH 24/77] skip generate tfproto --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 8957bd96f..1f8e5b15b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -381,7 +381,7 @@ def main(_): hparams.data_dir = 'gs://fathom-dev-210618-workspace-h5bnpfec/FINETUNE_BERT/2019-06-06T07-57-40.391508+00-00/usr.src.bert.finetune_BERT.py/data_dir' ## INGEST - problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir) + #problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir) model_fn = model_fn_builder( bert_config=bert_config, From 246b465d3415fe926923c64deb79a976dcb5caf7 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 23:57:10 +0000 Subject: [PATCH 25/77] squad --- .../LanguageModeling/BERT/finetune_BERT.py | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 1f8e5b15b..351aeaae6 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -175,7 +175,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, #batch_size = extended_batch_size / chunk_size #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) - #''' + ''' 
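# NOTE: the block below (toggled on and off across these patches via the '''
# markers) feeds BERT's sequence output through the tensor2tensor "top" and
# "loss" hooks of the target modality. Assuming the standard tensor2tensor
# Modality interface, body outputs are expected with shape
# [batch, length, 1, hidden] (hence the expand_dims adding a width-1 axis) and
# targets with shape [batch, 1, 1, 1]; top() projects to logits and loss()
# returns a (numerator, denominator) pair whose ratio is the mean loss, which
# is why the code computes loss = num / den.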
body_outputs = tf.expand_dims(body_outputs, axis=-2) features = { 'targets': labels @@ -189,6 +189,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, return loss, top_out['logits'] ''' + # classifier body_outputs = model.get_pooled_output() hidden_size = body_outputs.shape[-1].value @@ -214,7 +215,52 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, loss = tf.reduce_mean(per_example_loss) return loss, logits ''' + #''' + # squad + final_hidden = model.get_sequence_output() + + final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) + batch_size = final_hidden_shape[0] + seq_length = final_hidden_shape[1] + hidden_size = final_hidden_shape[2] + + output_weights = tf.get_variable( + "cls/squad/output_weights", [2, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) + + final_hidden_matrix = tf.reshape(final_hidden, + [batch_size * seq_length, hidden_size]) + logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + logits = tf.reshape(logits, [batch_size, seq_length, 2]) + logits = tf.transpose(logits, [2, 0, 1]) + + unstacked_logits = tf.unstack(logits, axis=0) + + (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) + + def compute_loss(logits, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + loss = -tf.reduce_mean( + tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) + return loss + + start_positions = [label_ids] + end_positions = [label_ids] + + start_loss = compute_loss(start_logits, start_positions) + end_loss = compute_loss(end_lotis, end_positions) + + total_loss = (start_loss + end_loss) / 2.0 + + return total_loss, start_logits + #''' def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, From 62fa7e846ed3d03c114e1ae15057843bee41267c Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 02:07:29 +0000 Subject: [PATCH 26/77] seq length and comment --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 351aeaae6..c1940b853 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -188,7 +188,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, loss = num / den return loss, top_out['logits'] - ''' + # classifier body_outputs = model.get_pooled_output() hidden_size = body_outputs.shape[-1].value @@ -243,6 +243,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) + seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): one_hot_positions = tf.one_hot( positions, depth=seq_length, dtype=tf.float32) From e774acc02cb030dd1ea8d27d05d267d8d9a768ab Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 02:19:36 +0000 Subject: [PATCH 27/77] typo --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py 
b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index c1940b853..eca542000 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -256,7 +256,7 @@ def compute_loss(logits, positions): end_positions = [label_ids] start_loss = compute_loss(start_logits, start_positions) - end_loss = compute_loss(end_lotis, end_positions) + end_loss = compute_loss(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2.0 From a905407de4aa9d91b7b2d6c9313dd83d57111ffa Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 02:22:26 +0000 Subject: [PATCH 28/77] typo --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index eca542000..531c790d9 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -252,8 +252,8 @@ def compute_loss(logits, positions): tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss - start_positions = [label_ids] - end_positions = [label_ids] + start_positions = [labels] + end_positions = [labels] start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) From 1cc0d47bcde9952578eb39430d8c67b7d1930b11 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 02:35:04 +0000 Subject: [PATCH 29/77] compute type 32 --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 531c790d9..8218ad031 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -165,7 +165,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, - compute_type=tf.float16 if FLAGS.use_fp16 else tf.float32) + compute_type=tf.float32) # [B, 384, D] body_outputs = model.get_sequence_output() From 77f789a2bca2968ee7bb4366f2924cd16201b251 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 03:00:39 +0000 Subject: [PATCH 30/77] move loss outside --- .../LanguageModeling/BERT/finetune_BERT.py | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 8218ad031..4e32c632b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -166,6 +166,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float32) + ''' # [B, 384, D] body_outputs = model.get_sequence_output() @@ -175,7 +176,6 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, #batch_size = extended_batch_size / chunk_size #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) - ''' body_outputs = tf.expand_dims(body_outputs, axis=-2) features = { 'targets': labels @@ -242,25 +242,8 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, unstacked_logits = tf.unstack(logits, axis=0) (start_logits, end_logits) = 
(unstacked_logits[0], unstacked_logits[1]) + return start_logits, end_logits - seq_length = modeling.get_shape_list(input_ids)[1] - def compute_loss(logits, positions): - one_hot_positions = tf.one_hot( - positions, depth=seq_length, dtype=tf.float32) - log_probs = tf.nn.log_softmax(logits, axis=-1) - loss = -tf.reduce_mean( - tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) - return loss - - start_positions = [labels] - end_positions = [labels] - - start_loss = compute_loss(start_logits, start_positions) - end_loss = compute_loss(end_logits, end_positions) - - total_loss = (start_loss + end_loss) / 2.0 - - return total_loss, start_logits #''' def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, @@ -282,7 +265,8 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument is_training = (mode == tf.estimator.ModeKeys.TRAIN) - (total_loss, logits) = create_model( + #(total_loss, logits) = create_model( + (start_logits, end_logits) = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, use_one_hot_embeddings, hparams) @@ -311,7 +295,25 @@ def tpu_scaffold(): output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: + #### + seq_length = modeling.get_shape_list(input_ids)[1] + def compute_loss(logits, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + loss = -tf.reduce_mean( + tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) + return loss + + start_positions = [labels] + end_positions = [labels] + + start_loss = compute_loss(start_logits, start_positions) + end_loss = compute_loss(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2.0 + + ### train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu, hvd, amp=use_fp16) @@ -384,8 +386,9 @@ def main(_): if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) - if FLAGS.use_xla: + if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, From 2841acc222b1130d81bd38fa0126e9f693c01f36 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 03:13:15 +0000 Subject: [PATCH 31/77] barrier --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 4e32c632b..8044aba0e 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -463,6 +463,11 @@ def main(_): tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) + if FLAGS.horovod: + barrier.hvd.allreduce(tf.constant(0)) + with tf.Session(config=config) as sess: + sess.run(barrier) + estimator.train( input_fn=train_input_fn, hooks=training_hooks, From ee9cbd2e757714f8281f0fee820ccfedd5ba235d Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 03:17:12 +0000 Subject: [PATCH 32/77] typo --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py 
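# NOTE: horovod_input_fn_builder is an internal fathomt2t helper whose
# implementation is not part of this patch series. The sketch below is only an
# editorial illustration of what a Horovod-aware input_fn usually does (shard
# the dataset so each rank reads a disjoint slice); the function name,
# file_pattern argument, and feature spec here are assumptions, not the
# project's actual code.
import os
import tensorflow as tf
import horovod.tensorflow as hvd

def sharded_input_fn_builder(file_pattern, seq_length, use_hvd):
  """Hypothetical Horovod-aware input_fn: each rank reads a disjoint shard."""
  name_to_features = {
      "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
      "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
      "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
      "targets": tf.FixedLenFeature([], tf.int64),
  }

  def input_fn(params):
    batch_size = params["batch_size"]
    d = tf.data.TFRecordDataset(tf.gfile.Glob(file_pattern))
    if use_hvd:
      # Disjoint slice per worker so ranks do not train on duplicate examples.
      d = d.shard(hvd.size(), hvd.rank())
    d = d.repeat()
    d = d.shuffle(buffer_size=100)
    d = d.map(lambda record: tf.parse_single_example(record, name_to_features))
    return d.batch(batch_size, drop_remainder=True)

  return input_fn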
b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 8044aba0e..8a568866e 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -464,7 +464,7 @@ def main(_): train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.horovod: - barrier.hvd.allreduce(tf.constant(0)) + barrier = hvd.allreduce(tf.constant(0)) with tf.Session(config=config) as sess: sess.run(barrier) From c49424699279e8ddea5aff8971cf22e7d2f12ec0 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 18:40:05 +0000 Subject: [PATCH 33/77] hvd input fn builder --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 8a568866e..ceab7ae7f 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -461,7 +461,9 @@ def main(_): tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) - train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) + #train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) + train_input_fn = problem.horovod_input_fn_builder( + is_training=True, hvd=None if not FLAGS.horovode else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) From 644583301de45a8ebe542fc5ffb0e224257b1fde Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 18:58:41 +0000 Subject: [PATCH 34/77] typo --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index ceab7ae7f..cefa8c4d1 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -463,7 +463,7 @@ def main(_): tf.logging.info(" Num steps = %d", num_train_steps) #train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) train_input_fn = problem.horovod_input_fn_builder( - is_training=True, hvd=None if not FLAGS.horovode else hvd) + is_training=True, hvd=None if not FLAGS.horovod else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) From 420a0af778a50baf417636b3cfd27b8f194877f0 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 20:27:39 +0000 Subject: [PATCH 35/77] try mock start positions --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index cefa8c4d1..dccb51eaa 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -305,8 +305,7 @@ def compute_loss(logits, positions): tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss - start_positions = [labels] - end_positions = [labels] + start_positions = end_positions = tf.ones_like([tf.shape(start_logits)[0], 1]) start_loss = compute_loss(start_logits, 
start_positions) end_loss = compute_loss(end_logits, end_positions) From a82b153854822260c11ed6f7beb534cddbed4bc1 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 20:37:20 +0000 Subject: [PATCH 36/77] our top --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index dccb51eaa..cd872acbe 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -166,7 +166,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float32) - ''' + #''' # [B, 384, D] body_outputs = model.get_sequence_output() @@ -188,7 +188,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, loss = num / den return loss, top_out['logits'] - + ''' # classifier body_outputs = model.get_pooled_output() hidden_size = body_outputs.shape[-1].value @@ -214,8 +214,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) loss = tf.reduce_mean(per_example_loss) return loss, logits - ''' - #''' + # squad final_hidden = model.get_sequence_output() @@ -305,7 +304,8 @@ def compute_loss(logits, positions): tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss - start_positions = end_positions = tf.ones_like([tf.shape(start_logits)[0], 1]) + start_positions = [labels] + end_positions = [labels] start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) From f75aafadb4ac00e9aec89fd0d348e78c848cf759 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 20:41:10 +0000 Subject: [PATCH 37/77] clean up --- .../LanguageModeling/BERT/finetune_BERT.py | 74 ------------------- 1 file changed, 74 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index cd872acbe..1a8d842dc 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -188,62 +188,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, loss = num / den return loss, top_out['logits'] - ''' - # classifier - body_outputs = model.get_pooled_output() - hidden_size = body_outputs.shape[-1].value - output_weights = tf.get_variable( - "output_weights", [num_labels, hidden_size], - initializer=tf.truncated_normal_initializer(stddev=0.02)) - - output_bias = tf.get_variable( - "output_bias", [num_labels], initializer=tf.zeros_initializer()) - - with tf.variable_scope("loss"): - if is_training: - # I.e., 0.1 dropout - body_outputs = tf.nn.dropout(body_outputs, keep_prob=0.9) - - logits = tf.matmul(body_outputs, output_weights, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - log_probs = tf.nn.log_softmax(logits, axis=-1) - - one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) - - per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) - loss = tf.reduce_mean(per_example_loss) - return loss, logits - - # squad - final_hidden = model.get_sequence_output() - - final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) - batch_size = final_hidden_shape[0] - seq_length = final_hidden_shape[1] - 
hidden_size = final_hidden_shape[2] - - output_weights = tf.get_variable( - "cls/squad/output_weights", [2, hidden_size], - initializer=tf.truncated_normal_initializer(stddev=0.02)) - - output_bias = tf.get_variable( - "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) - - final_hidden_matrix = tf.reshape(final_hidden, - [batch_size * seq_length, hidden_size]) - logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - - logits = tf.reshape(logits, [batch_size, seq_length, 2]) - logits = tf.transpose(logits, [2, 0, 1]) - - unstacked_logits = tf.unstack(logits, axis=0) - - (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) - return start_logits, end_logits - - #''' def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, @@ -294,25 +239,6 @@ def tpu_scaffold(): output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: - #### - seq_length = modeling.get_shape_list(input_ids)[1] - def compute_loss(logits, positions): - one_hot_positions = tf.one_hot( - positions, depth=seq_length, dtype=tf.float32) - log_probs = tf.nn.log_softmax(logits, axis=-1) - loss = -tf.reduce_mean( - tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) - return loss - - start_positions = [labels] - end_positions = [labels] - - start_loss = compute_loss(start_logits, start_positions) - end_loss = compute_loss(end_logits, end_positions) - - total_loss = (start_loss + end_loss) / 2.0 - - ### train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu, hvd, amp=use_fp16) From c5385b1ffb025794d287a819ecbae4bc67e1ccb7 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 21:01:19 +0000 Subject: [PATCH 38/77] loss --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 1a8d842dc..0c35f4ef5 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -209,8 +209,7 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument is_training = (mode == tf.estimator.ModeKeys.TRAIN) - #(total_loss, logits) = create_model( - (start_logits, end_logits) = create_model( + (total_loss, logits) = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, use_one_hot_embeddings, hparams) From c4f8e0a17fab568209ffcc0426a1fd28f7b82aae Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 21:27:51 +0000 Subject: [PATCH 39/77] hparams --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 0c35f4ef5..18a802502 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -387,7 +387,8 @@ def main(_): tf.logging.info(" Num steps = %d", num_train_steps) #train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) train_input_fn = problem.horovod_input_fn_builder( - is_training=True, hvd=None if not FLAGS.horovod else hvd) + mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, + hvd=None if not FLAGS.horovod else hvd) 
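The horovod_input_fn_builder called here comes from the project's own problem class and its body is not part of this series. A common way for such a builder to keep Horovod workers from training on identical examples is to shard the tf.data file list by rank; the sketch below is only an illustration of that pattern under those assumptions (TFRecord inputs, hvd being the horovod.tensorflow module when Horovod is enabled), not the actual implementation.

    import tensorflow as tf

    def sharded_input_fn_builder(file_pattern, batch_size, hvd=None):
        """Illustrative sketch: each rank reads a disjoint slice of the files."""
        def input_fn(params=None):
            files = tf.data.Dataset.list_files(file_pattern, shuffle=False)
            if hvd is not None:
                # hvd is assumed to be horovod.tensorflow after hvd.init().
                files = files.shard(hvd.size(), hvd.rank())
            dataset = files.interleave(tf.data.TFRecordDataset, cycle_length=4)
            dataset = dataset.shuffle(1000).repeat()
            # Parsing of the serialized examples is omitted because the
            # problem's feature spec is not shown in this patch series.
            return dataset.batch(batch_size, drop_remainder=True)
        return input_fn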
training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) From 1ed1c76e6da257a71d8e37ecebb08644fa6ab41a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 21:36:26 +0000 Subject: [PATCH 40/77] hvd for partitioning --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 18a802502..d5f5a7058 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -385,10 +385,11 @@ def main(_): tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) - #train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) - train_input_fn = problem.horovod_input_fn_builder( - mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, - hvd=None if not FLAGS.horovod else hvd) + train_input_fn = problem.make_estimator_input_fn( + tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) + #train_input_fn = problem.horovod_input_fn_builder( + #mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, + #hvd=None if not FLAGS.horovod else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) From 0f95ae655bf11cd38cfee639c38ca6edfd80a27a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 22:29:59 +0000 Subject: [PATCH 41/77] flags data dir --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index d5f5a7058..30613dde7 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -353,7 +353,7 @@ def main(_): target_modality = hparams.target_modality problem = hparams.problem - hparams.data_dir = 'gs://fathom-dev-210618-workspace-h5bnpfec/FINETUNE_BERT/2019-06-06T07-57-40.391508+00-00/usr.src.bert.finetune_BERT.py/data_dir' + hparams.data_dir = FLAGS.data_dir ## INGEST #problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir) From 0e28db1e36b6d7e956f0d1f70b77339b302633b0 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 8 Jun 2019 04:13:24 +0000 Subject: [PATCH 42/77] print out ini --- TensorFlow/LanguageModeling/BERT/run_squad.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index ba45d923f..f5a2c56d5 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -679,14 +679,14 @@ def tpu_scaffold(): else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - if FLAGS.verbose_logging: - tf.logging.info("**** Trainable Variables ****") - for var in tvars: - init_string = "" - if var.name in initialized_variable_names: - init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, - init_string) + #if FLAGS.verbose_logging: + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", 
*INIT_FROM_CKPT*" + tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, + init_string) output_spec = None From fe8b8253a05798f02d15977724919eefc92396cb Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 8 Jun 2019 04:17:17 +0000 Subject: [PATCH 43/77] oom hook --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 30613dde7..2f9db1150 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -391,6 +391,14 @@ def main(_): #mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, #hvd=None if not FLAGS.horovod else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) + + class OomReportingHook(tf.train.SessionRunHook): + def before_run(self, run_context): + return tf.train.SessionRunArgs(fetches=[], # no extra fetches + options=tf.RunOptions( + report_tensor_allocations_upon_oom=True)) + + training_hooks.append(OomReportingHook) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) with tf.Session(config=config) as sess: From df3c976e6caf64661428edac32e1ae2ead772846 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 8 Jun 2019 06:18:51 +0000 Subject: [PATCH 44/77] move out --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 2f9db1150..c966d7e6b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -152,6 +152,13 @@ def after_run(self, run_context, run_values): self.count += 1 +class _OomReportingHook(tf.train.SessionRunHook): + def before_run(self, run_context): + return tf.train.SessionRunArgs(fetches=[], # no extra fetches + options=tf.RunOptions( + report_tensor_allocations_upon_oom=True)) + + def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings, hparams): """Creates a classification model.""" @@ -392,13 +399,7 @@ def main(_): #hvd=None if not FLAGS.horovod else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) - class OomReportingHook(tf.train.SessionRunHook): - def before_run(self, run_context): - return tf.train.SessionRunArgs(fetches=[], # no extra fetches - options=tf.RunOptions( - report_tensor_allocations_upon_oom=True)) - - training_hooks.append(OomReportingHook) + training_hooks.append(_OomReportingHook) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) with tf.Session(config=config) as sess: From d50367d2f7ee7aeaff295f92d76c596dcfd75e64 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 8 Jun 2019 07:20:33 +0000 Subject: [PATCH 45/77] instantiate --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index c966d7e6b..891298eb9 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -399,7 +399,7 @@ def main(_): #hvd=None if not FLAGS.horovod else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) - training_hooks.append(_OomReportingHook) + 
training_hooks.append(_OomReportingHook()) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) with tf.Session(config=config) as sess: From 8501f1487f67cf058024d08dba5b5eafd7cc23aa Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 8 Jun 2019 08:02:33 +0000 Subject: [PATCH 46/77] else --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 891298eb9..7432620d1 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -276,6 +276,8 @@ def main(_): if FLAGS.use_fp16: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" print('Turning on AMP') + else: + print('NOT Turning on AMP') if FLAGS.horovod: hvd.init() From 09fc5fd9edf10f235774d719ffd3236f8e2bfe92 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Wed, 12 Jun 2019 21:04:34 +0000 Subject: [PATCH 47/77] fix --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 7432620d1..f07923262 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -173,7 +173,6 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float32) - #''' # [B, 384, D] body_outputs = model.get_sequence_output() @@ -184,12 +183,10 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) body_outputs = tf.expand_dims(body_outputs, axis=-2) - features = { - 'targets': labels - } - labels = tf.reshape(labels, [tf.shape(labels)[0], 1, 1, 1]) - top_out = target_modality.top(body_outputs, features) + top_out = target_modality.top(body_outputs, None) + + #labels = tf.expand_dims(tf.expand_dims(labels, axis=-1), axis=-1) num, den = target_modality.loss(top_out, labels) loss = num / den From 3aa196e0c9d00a5b97c6704630ab4f779db02c1a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 13 Jun 2019 02:32:35 +0000 Subject: [PATCH 48/77] eval train loop --- .../LanguageModeling/BERT/finetune_BERT.py | 106 +++++++++--------- 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index f07923262..7a24c44d3 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -56,10 +56,6 @@ "Sequences longer than this will be truncated, and sequences shorter " "than this will be padded.") -flags.DEFINE_bool("do_train", False, "Whether to run training.") - -flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") - flags.DEFINE_bool( "do_predict", False, "Whether to run the model in inference mode on the test set.") @@ -186,8 +182,6 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, top_out = target_modality.top(body_outputs, None) - #labels = tf.expand_dims(tf.expand_dims(labels, axis=-1), axis=-1) - num, den = target_modality.loss(top_out, labels) loss = num / den @@ -196,7 +190,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, def 
model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, - use_one_hot_embeddings, hparams, hvd=None, use_fp16=False): + use_one_hot_embeddings, hparams, problem, hvd=None, use_fp16=False): """Returns `model_fn` closure for TPUEstimator.""" def model_fn(features, labels, mode, params): # pylint: disable=unused-argument @@ -252,10 +246,15 @@ def tpu_scaffold(): train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: + #logits.update({'labels': labels}) + eval_metrics = lambda logits, labels: { + name: call(logits, labels) + for name, call in problem.all_metrics_fns.items() + if name in problem.eval_metrics()} output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, - eval_metrics=problem.eval_metrics, + eval_metrics=(eval_metrics, [logits, labels]), scaffold_fn=scaffold_fn) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( @@ -279,10 +278,6 @@ def main(_): if FLAGS.horovod: hvd.init() - if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: - raise ValueError( - "At least one of `do_train`, `do_eval` or `do_predict' must be True.") - bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: @@ -333,11 +328,12 @@ def main(_): per_host_input_for_training=is_per_host)) train_examples = None - num_train_steps = None - num_warmup_steps = None - if FLAGS.do_train: - num_train_steps = 1000 - num_warmup_steps = 1 + num_train_steps = 2000 + num_warmup_steps = 1 + eval_frequency_steps = 100 + assert num_train_steps % eval_frequency_steps == 0 + train_eval_iterations = num_train_steps // eval_frequency_steps + eval_steps = 100 # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
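For the eval branch introduced above, tf.contrib.tpu.TPUEstimatorSpec takes eval_metrics as a (metric_fn, tensor list) pair, and the metric_fn must return a dict mapping metric names to (value, update_op) tuples such as those produced by tf.metrics.*. Whether the entries of problem.all_metrics_fns already return tuples in that form depends on the problem class and is not visible here. A minimal self-contained metric_fn, assuming integer class-id labels, would be:

    import tensorflow as tf

    def metric_fn(logits, labels):
        # tf.metrics.accuracy returns the (value, update_op) pair that
        # TPUEstimatorSpec expects for each dict entry.
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        return {"accuracy": tf.metrics.accuracy(labels=labels,
                                                predictions=predictions)}

    # used as: eval_metrics=(metric_fn, [logits, labels])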
@@ -365,7 +361,7 @@ def main(_): model_fn = model_fn_builder( bert_config=bert_config, - num_labels=10, + num_labels=problem.label_manager, init_checkpoint=FLAGS.init_checkpoint, learning_rate=learning_rate, num_train_steps=num_train_steps, @@ -373,6 +369,7 @@ def main(_): use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, hparams=hparams, + problem=problem, hvd=None if not FLAGS.horovod else hvd, use_fp16=FLAGS.use_fp16) @@ -387,43 +384,42 @@ def main(_): eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) - if FLAGS.do_train: - tf.logging.info("***** Running training *****") - tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) - tf.logging.info(" Num steps = %d", num_train_steps) - train_input_fn = problem.make_estimator_input_fn( - tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) - #train_input_fn = problem.horovod_input_fn_builder( - #mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, - #hvd=None if not FLAGS.horovod else hvd) - training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) - - training_hooks.append(_OomReportingHook()) - if FLAGS.horovod: - barrier = hvd.allreduce(tf.constant(0)) - with tf.Session(config=config) as sess: - sess.run(barrier) - - estimator.train( - input_fn=train_input_fn, - hooks=training_hooks, - max_steps=num_train_steps) - - if FLAGS.do_eval: - tf.logging.info("***** Running evaluation *****") - tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) - - eval_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.EVAL, hparams) - - eval_steps = 1000 - result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) - - output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") - with tf.gfile.GFile(output_eval_file, "w") as writer: - tf.logging.info("***** Eval results *****") - for key in sorted(result.keys()): - tf.logging.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) + tf.logging.info("***** Running training *****") + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = problem.make_estimator_input_fn( + tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) + #train_input_fn = problem.horovod_input_fn_builder( + #mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, + #hvd=None if not FLAGS.horovod else hvd) + training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) + + #training_hooks.append(_OomReportingHook()) + + eval_input_fn = problem.make_estimator_input_fn( + tf.estimator.ModeKeys.EVAL, + hparams, + None if not FLAGS.horovod else hvd) + + if FLAGS.horovod: + barrier = hvd.allreduce(tf.constant(0)) + with tf.Session(config=config) as sess: + sess.run(barrier) + + # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 + for n in range(train_eval_iterations): + if not FLAGS.horovod or hvd.rank() != 0: + estimator.train( + input_fn=train_input_fn, + hooks=training_hooks, + # TODO: LR dependent on train steps, are we resetting this every time then? 
+ steps=num_train_steps) + + if not FLAGS.horovod or hvd.rank() == 0: + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) if __name__ == "__main__": From 081a1a720c0c35e313dc923ee82c981ef8457567 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 13 Jun 2019 02:33:51 +0000 Subject: [PATCH 49/77] do not need num labels --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 7a24c44d3..ca8d17e6b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -156,7 +156,7 @@ def before_run(self, run_context): def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, - labels, num_labels, use_one_hot_embeddings, hparams): + labels, use_one_hot_embeddings, hparams): """Creates a classification model.""" target_modality = hparams.problem_hparams.target_modality input_modality = hparams.problem_hparams.input_modality @@ -188,7 +188,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, return loss, top_out['logits'] -def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, +def model_fn_builder(bert_config, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, use_one_hot_embeddings, hparams, problem, hvd=None, use_fp16=False): """Returns `model_fn` closure for TPUEstimator.""" @@ -209,7 +209,7 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument (total_loss, logits) = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, - num_labels, use_one_hot_embeddings, hparams) + use_one_hot_embeddings, hparams) tvars = tf.trainable_variables() initialized_variable_names = {} @@ -361,7 +361,6 @@ def main(_): model_fn = model_fn_builder( bert_config=bert_config, - num_labels=problem.label_manager, init_checkpoint=FLAGS.init_checkpoint, learning_rate=learning_rate, num_train_steps=num_train_steps, From 9ce757ded4bec88623bed32919787f41c3bdbc91 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 13 Jun 2019 19:27:33 +0000 Subject: [PATCH 50/77] barrier between train and eval --- .../LanguageModeling/BERT/finetune_BERT.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index ca8d17e6b..e8b0d1335 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -383,9 +383,10 @@ def main(_): eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) - tf.logging.info("***** Running training *****") - tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) - tf.logging.info(" Num steps = %d", num_train_steps) + tf.logging.info("***** Running training *****", + hvd.rank() if FLAGS.horovod else 'no hvd', ) + #tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + #tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) #train_input_fn = problem.horovod_input_fn_builder( @@ -400,22 +401,24 @@ def main(_): hparams, None if 
not FLAGS.horovod else hvd) - if FLAGS.horovod: - barrier = hvd.allreduce(tf.constant(0)) - with tf.Session(config=config) as sess: - sess.run(barrier) - # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 for n in range(train_eval_iterations): - if not FLAGS.horovod or hvd.rank() != 0: - estimator.train( - input_fn=train_input_fn, - hooks=training_hooks, - # TODO: LR dependent on train steps, are we resetting this every time then? - steps=num_train_steps) + estimator.train( + input_fn=train_input_fn, + hooks=training_hooks, + # TODO: LR dependent on train steps, are we resetting this every time then? + steps=num_train_steps) + + if FLAGS.horovod: + barrier = hvd.allreduce(tf.constant(0)) + with tf.Session(config=config) as sess: + sess.run(barrier) if not FLAGS.horovod or hvd.rank() == 0: - result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + result = estimator.evaluate( + input_fn=eval_input_fn, + steps=eval_steps, + hooks=[_LogEvalRunHook() tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) From 7bac20c0ed891d771006e8a49a1225de601ac29b Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 13 Jun 2019 22:43:53 +0000 Subject: [PATCH 51/77] fix --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index e8b0d1335..0c0d4c63d 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -418,7 +418,7 @@ def main(_): result = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, - hooks=[_LogEvalRunHook() + hooks=[_LogEvalRunHook(FLAGS.eval_batch_size)]) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) From 7050721b2d33504b1961e98d45764fc3c498fde3 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 13 Jun 2019 23:06:41 +0000 Subject: [PATCH 52/77] use master process --- .../LanguageModeling/BERT/finetune_BERT.py | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 0c0d4c63d..380d12b47 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -288,6 +288,7 @@ def main(_): tf.gfile.MakeDirs(FLAGS.output_dir) + master_process = True training_hooks = [] global_batch_size = FLAGS.train_batch_size hvd_rank = 0 @@ -383,15 +384,8 @@ def main(_): eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) - tf.logging.info("***** Running training *****", - hvd.rank() if FLAGS.horovod else 'no hvd', ) - #tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) - #tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) - #train_input_fn = problem.horovod_input_fn_builder( - #mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, - #hvd=None if not FLAGS.horovod else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) #training_hooks.append(_OomReportingHook()) @@ -403,18 +397,18 @@ def main(_): # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 for n in range(train_eval_iterations): + if 
master_process: + tf.logging.info("***** Running training *****", + hvd.rank() if FLAGS.horovod else 'no hvd') + # TODO: verify we are not reloading bert every time estimator.train( input_fn=train_input_fn, hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? steps=num_train_steps) - if FLAGS.horovod: - barrier = hvd.allreduce(tf.constant(0)) - with tf.Session(config=config) as sess: - sess.run(barrier) - - if not FLAGS.horovod or hvd.rank() == 0: + if master_process: + tf.logging.info("***** Running eval *****") result = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, From 7f6fa4af12afb4581e95635477ab22bcbcc27bb7 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 14 Jun 2019 00:07:56 +0000 Subject: [PATCH 53/77] clean up print --- .../LanguageModeling/BERT/finetune_BERT.py | 111 +++++++++++------- 1 file changed, 66 insertions(+), 45 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 380d12b47..c0879528d 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -45,11 +45,6 @@ "init_checkpoint", None, "Initial checkpoint (usually from a pre-trained BERT model).") -flags.DEFINE_bool( - "do_lower_case", True, - "Whether to lower case the input text. Should be True for uncased " - "models and False for cased models.") - flags.DEFINE_integer( "max_seq_length", 128, "The maximum total input sequence length after WordPiece tokenization. " @@ -73,9 +68,8 @@ "Total number of training epochs to perform.") flags.DEFINE_float( - "warmup_proportion", 0.1, - "Proportion of training to perform linear learning rate warmup for. " - "E.g., 0.1 = 10% of training.") + "warmup_steps", 10, + "Number of training steps to perform linear learning rate warmup for. ") flags.DEFINE_integer("save_checkpoints_steps", 1000, "How often to save the model checkpoint.") @@ -115,37 +109,57 @@ # report samples/sec, total loss and learning rate during training -class _LogEvalRunHook(tf.train.SessionRunHook): - def __init__(self, global_batch_size, hvd_rank=-1): +class _LogSessionRunHook(tf.train.SessionRunHook): + def __init__(self, global_batch_size, display_every=10, hvd_rank=-1): self.global_batch_size = global_batch_size + self.display_every = display_every self.hvd_rank = hvd_rank - self.total_time = 0.0 - self.count = 0 - - def before_run(self, run_context): - self.t0 = time.time() - - def after_run(self, run_context, run_values): - elapsed_secs = time.time() - self.t0 - self.total_time += elapsed_secs - self.count += 1 - -# report samples/sec, total loss and learning rate during training -class _LogTrainRunHook(tf.train.SessionRunHook): - def __init__(self, global_batch_size, hvd_rank=-1): - self.global_batch_size = global_batch_size - self.hvd_rank = hvd_rank - self.total_time = 0.0 + def after_create_session(self, session, coord): + if FLAGS.use_fp16: + print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate Loss-scaler') + else: + print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate') + self.elapsed_secs = 0. 
self.count = 0 - def before_run(self, run_context): self.t0 = time.time() - return tf.train.SessionRunArgs( - fetches=['step_update:0']) + if FLAGS.use_fp16: + return tf.train.SessionRunArgs( + fetches=['step_update:0', 'total_loss:0', + 'learning_rate:0', 'nsp_loss:0', + 'mlm_loss:0', 'loss_scale:0']) + else: + return tf.train.SessionRunArgs( + fetches=['step_update:0', 'total_loss:0', + 'learning_rate:0', 'nsp_loss:0', + 'mlm_loss:0']) def after_run(self, run_context, run_values): - elapsed_secs = time.time() - self.t0 - self.total_time += elapsed_secs + self.elapsed_secs += time.time() - self.t0 self.count += 1 + if FLAGS.use_fp16: + global_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler = run_values.results + else: + global_step, total_loss, lr, nsp_loss, mlm_loss = run_values.results + print_step = global_step + 1 # One-based index for printing. + if print_step == 1 or print_step % self.display_every == 0: + dt = self.elapsed_secs / self.count + img_per_sec = self.global_batch_size / dt + if self.hvd_rank >= 0: + if FLAGS.use_fp16: + print('%2d :: %6i %11.1f %10.4e %10.4e %6.3f %6.4e %6.4e' % + (self.hvd_rank, print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr, loss_scaler)) + else: + print('%2d :: %6i %11.1f %10.4e %10.4e %6.3f %6.4e' % + (self.hvd_rank, print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr)) + else: + if FLAGS.use_fp16: + print('%6i %11.1f %10.4e %10.4e %6.3f %6.4e %6.4e' % + (print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr, loss_scaler)) + else: + print('%6i %11.1f %10.4e %10.4e %6.3f %6.4e' % + (print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr)) + self.elapsed_secs = 0. + self.count = 0 class _OomReportingHook(tf.train.SessionRunHook): @@ -286,11 +300,23 @@ def main(_): "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + # train config + global_batch_size = FLAGS.train_batch_size + # max train steps + num_train_steps = 1e7 + num_warmup_steps = FLAGS.warmup_steps + eval_steps = 100 + eval_frequency_steps = 100 + + if FLAGS.horovod: + num_train_steps //= hvd.size() + num_warmup_steps //= hvd.size() + tf.gfile.MakeDirs(FLAGS.output_dir) master_process = True training_hooks = [] - global_batch_size = FLAGS.train_batch_size + hvd_rank = 0 tpu_cluster_resolver = None @@ -312,6 +338,9 @@ def main(_): if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) + num_train_steps //= hvd.size() + num_warmup_steps //= hvd.size() + if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 @@ -328,14 +357,6 @@ def main(_): num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) - train_examples = None - num_train_steps = 2000 - num_warmup_steps = 1 - eval_frequency_steps = 100 - assert num_train_steps % eval_frequency_steps == 0 - train_eval_iterations = num_train_steps // eval_frequency_steps - eval_steps = 100 - # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
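The _LogSessionRunHook above asks the session for tensors by graph name ('total_loss:0', 'learning_rate:0', and so on), so those names must actually exist in the graph; giving a value a stable name is typically arranged with tf.identity(..., name=...) inside model_fn. A stripped-down sketch of the same fetch-by-name pattern, without the pretraining-specific fields, is:

    import tensorflow as tf

    class LossLoggingHook(tf.train.SessionRunHook):
        """Minimal sketch: report a named loss tensor every N steps."""

        def __init__(self, every_n_steps=100):
            self._every_n_steps = every_n_steps
            self._step = 0

        def before_run(self, run_context):
            # 'total_loss:0' only resolves if model_fn created a tensor with
            # that name, e.g. tf.identity(total_loss, name='total_loss').
            return tf.train.SessionRunArgs(fetches=['total_loss:0'])

        def after_run(self, run_context, run_values):
            self._step += 1
            if self._step % self._every_n_steps == 0:
                print('step %d  loss %.4f' % (self._step, run_values.results[0]))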
from tensor2tensor.utils.trainer_lib import create_hparams, add_problem_hparams @@ -386,7 +407,7 @@ def main(_): train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) - training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) + training_hooks.append(_LogSessionRunHook(global_batch_size, 10, -1 if not FLAGS.horovod else hvd_rank)) #training_hooks.append(_OomReportingHook()) @@ -396,7 +417,8 @@ def main(_): None if not FLAGS.horovod else hvd) # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 - for n in range(train_eval_iterations): + # TODO: replace with ValidationMonitor and EarlyStoppingHook + for i in range(2): if master_process: tf.logging.info("***** Running training *****", hvd.rank() if FLAGS.horovod else 'no hvd') @@ -411,8 +433,7 @@ def main(_): tf.logging.info("***** Running eval *****") result = estimator.evaluate( input_fn=eval_input_fn, - steps=eval_steps, - hooks=[_LogEvalRunHook(FLAGS.eval_batch_size)]) + steps=eval_steps) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) From 5c97155f7d6fabaa7f5e0d8b6a7e2b7716fb59f5 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 14 Jun 2019 01:41:00 +0000 Subject: [PATCH 54/77] fix hooks --- .../LanguageModeling/BERT/finetune_BERT.py | 63 +++++++++---------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index c0879528d..b8f1c5876 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -64,10 +64,7 @@ flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs") -flags.DEFINE_float("num_train_epochs", 3.0, - "Total number of training epochs to perform.") - -flags.DEFINE_float( +flags.DEFINE_integer( "warmup_steps", 10, "Number of training steps to perform linear learning rate warmup for. ") @@ -114,50 +111,51 @@ def __init__(self, global_batch_size, display_every=10, hvd_rank=-1): self.global_batch_size = global_batch_size self.display_every = display_every self.hvd_rank = hvd_rank + def after_create_session(self, session, coord): - if FLAGS.use_fp16: - print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate Loss-scaler') - else: - print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate') + if self.hvd_rank <= 0: + if FLAGS.use_fp16: + print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate Loss-scaler') + else: + print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate') self.elapsed_secs = 0. 
self.count = 0 + def before_run(self, run_context): self.t0 = time.time() if FLAGS.use_fp16: return tf.train.SessionRunArgs( fetches=['step_update:0', 'total_loss:0', - 'learning_rate:0', 'nsp_loss:0', - 'mlm_loss:0', 'loss_scale:0']) + 'learning_rate:0', 'loss_scale:0']) else: return tf.train.SessionRunArgs( - fetches=['step_update:0', 'total_loss:0', - 'learning_rate:0', 'nsp_loss:0', - 'mlm_loss:0']) + fetches=['step_update:0', 'total_loss:0', 'learning_rate:0']) + def after_run(self, run_context, run_values): self.elapsed_secs += time.time() - self.t0 self.count += 1 if FLAGS.use_fp16: - global_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler = run_values.results + global_step, total_loss, lr, loss_scaler = run_values.results else: - global_step, total_loss, lr, nsp_loss, mlm_loss = run_values.results + global_step, total_loss, lr = run_values.results print_step = global_step + 1 # One-based index for printing. if print_step == 1 or print_step % self.display_every == 0: dt = self.elapsed_secs / self.count img_per_sec = self.global_batch_size / dt if self.hvd_rank >= 0: if FLAGS.use_fp16: - print('%2d :: %6i %11.1f %10.4e %10.4e %6.3f %6.4e %6.4e' % - (self.hvd_rank, print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr, loss_scaler)) + print('%2d :: %6i %11.1f %6.3f %6.4e %6.4e' % + (self.hvd_rank, print_step, img_per_sec, total_loss, lr, loss_scaler)) else: - print('%2d :: %6i %11.1f %10.4e %10.4e %6.3f %6.4e' % - (self.hvd_rank, print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr)) + print('%2d :: %6i %11.1f %10.4e %6.3f %6.4e' % + (self.hvd_rank, print_step, img_per_sec, mlm_loss, total_loss, lr)) else: if FLAGS.use_fp16: - print('%6i %11.1f %10.4e %10.4e %6.3f %6.4e %6.4e' % - (print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr, loss_scaler)) + print('%6i %11.1f %6.3f %6.4e %6.4e' % + (print_step, img_per_sec, total_loss, lr, loss_scaler)) else: - print('%6i %11.1f %10.4e %10.4e %6.3f %6.4e' % - (print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr)) + print('%6i %11.1f %6.3f %6.4e' % + (print_step, img_per_sec, total_loss, lr)) self.elapsed_secs = 0. 
self.count = 0 @@ -224,6 +222,8 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument (total_loss, logits) = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, use_one_hot_embeddings, hparams) + # for logging hook to pick up + total_loss = tf.identity(total_loss, name='total_loss') tvars = tf.trainable_variables() initialized_variable_names = {} @@ -308,16 +308,12 @@ def main(_): eval_steps = 100 eval_frequency_steps = 100 - if FLAGS.horovod: - num_train_steps //= hvd.size() - num_warmup_steps //= hvd.size() - tf.gfile.MakeDirs(FLAGS.output_dir) master_process = True training_hooks = [] - hvd_rank = 0 + hvd_rank = -1 tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: @@ -331,8 +327,8 @@ def main(_): tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank()) global_batch_size = FLAGS.train_batch_size * hvd.size() learning_rate = learning_rate * hvd.size() - master_process = (hvd.rank() == 0) hvd_rank = hvd.rank() + master_process = (hvd_rank == 0) config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) if hvd.size() > 1: @@ -350,7 +346,7 @@ def main(_): master=FLAGS.master, model_dir=FLAGS.output_dir, session_config=config, - save_checkpoints_steps=FLAGS.save_checkpoints_steps, + save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, log_step_count_steps=1, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, @@ -407,7 +403,7 @@ def main(_): train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) - training_hooks.append(_LogSessionRunHook(global_batch_size, 10, -1 if not FLAGS.horovod else hvd_rank)) + training_hooks.append(_LogSessionRunHook(global_batch_size, 10, hvd_rank)) #training_hooks.append(_OomReportingHook()) @@ -420,14 +416,13 @@ def main(_): # TODO: replace with ValidationMonitor and EarlyStoppingHook for i in range(2): if master_process: - tf.logging.info("***** Running training *****", - hvd.rank() if FLAGS.horovod else 'no hvd') + tf.logging.info("***** Running training ***** " + str(hvd_rank)) # TODO: verify we are not reloading bert every time estimator.train( input_fn=train_input_fn, hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? - steps=num_train_steps) + steps=eval_frequency_steps) if master_process: tf.logging.info("***** Running eval *****") From 5a5ee89720ff06c153f6dbace1b8a175a9919b56 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 14 Jun 2019 02:00:19 +0000 Subject: [PATCH 55/77] clean --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index b8f1c5876..a8433c643 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -115,9 +115,9 @@ def __init__(self, global_batch_size, display_every=10, hvd_rank=-1): def after_create_session(self, session, coord): if self.hvd_rank <= 0: if FLAGS.use_fp16: - print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate Loss-scaler') + print(' Step samples/sec Loss Learning-rate Loss-scaler') else: - print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate') + print(' Step samples/sec Loss Learning-rate') self.elapsed_secs = 0. 
self.count = 0 @@ -147,8 +147,8 @@ def after_run(self, run_context, run_values): print('%2d :: %6i %11.1f %6.3f %6.4e %6.4e' % (self.hvd_rank, print_step, img_per_sec, total_loss, lr, loss_scaler)) else: - print('%2d :: %6i %11.1f %10.4e %6.3f %6.4e' % - (self.hvd_rank, print_step, img_per_sec, mlm_loss, total_loss, lr)) + print('%2d :: %6i %11.1f %6.3f %6.4e' % + (self.hvd_rank, print_step, img_per_sec, total_loss, lr)) else: if FLAGS.use_fp16: print('%6i %11.1f %6.3f %6.4e %6.4e' % From b028fc7d0f0a4ee28ed3d1741fc112ceb32ce3ce Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 14 Jun 2019 02:25:21 +0000 Subject: [PATCH 56/77] no eval steps --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index a8433c643..b38d7576f 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -305,7 +305,6 @@ def main(_): # max train steps num_train_steps = 1e7 num_warmup_steps = FLAGS.warmup_steps - eval_steps = 100 eval_frequency_steps = 100 tf.gfile.MakeDirs(FLAGS.output_dir) @@ -426,9 +425,7 @@ def main(_): if master_process: tf.logging.info("***** Running eval *****") - result = estimator.evaluate( - input_fn=eval_input_fn, - steps=eval_steps) + result = estimator.evaluate(input_fn=eval_input_fn) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) From dd63f694be78a57924576ae8a12cf22c656f1cae Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 15 Jun 2019 00:25:05 +0000 Subject: [PATCH 57/77] fix logging --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index b38d7576f..90bb0868e 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -64,6 +64,10 @@ flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs") +flags.DEFINE_integer( + "eval_frequency_steps", 10, + "Number of training steps per gpu between evals.") + flags.DEFINE_integer( "warmup_steps", 10, "Number of training steps to perform linear learning rate warmup for. 
") @@ -305,7 +309,7 @@ def main(_): # max train steps num_train_steps = 1e7 num_warmup_steps = FLAGS.warmup_steps - eval_frequency_steps = 100 + eval_frequency_steps = FLAGS.eval_frequency_steps tf.gfile.MakeDirs(FLAGS.output_dir) @@ -346,7 +350,7 @@ def main(_): model_dir=FLAGS.output_dir, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, - log_step_count_steps=1, + log_step_count_steps=100000000000, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, @@ -402,7 +406,7 @@ def main(_): train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) - training_hooks.append(_LogSessionRunHook(global_batch_size, 10, hvd_rank)) + training_hooks.append(_LogSessionRunHook(global_batch_size, 100, hvd_rank)) #training_hooks.append(_OomReportingHook()) From 6d9fd1b3750038601f2042723e95a04d788b319a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 15 Jun 2019 00:52:16 +0000 Subject: [PATCH 58/77] jsut eval --- .../LanguageModeling/BERT/finetune_BERT.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 90bb0868e..ca2a49e7b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -265,14 +265,16 @@ def tpu_scaffold(): scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: #logits.update({'labels': labels}) - eval_metrics = lambda logits, labels: { - name: call(logits, labels) - for name, call in problem.all_metrics_fns.items() - if name in problem.eval_metrics()} + def metric_fn(logits, labels): + return { + name: call(logits, labels) + for name, call in problem.all_metrics_fns.items() + if name in problem.eval_metrics()} + output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, - eval_metrics=(eval_metrics, [logits, labels]), + eval_metrics=(metric_fn, [logits, labels]), scaffold_fn=scaffold_fn) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( @@ -419,13 +421,15 @@ def main(_): # TODO: replace with ValidationMonitor and EarlyStoppingHook for i in range(2): if master_process: - tf.logging.info("***** Running training ***** " + str(hvd_rank)) + tf.logging.info("***** Running training *****") # TODO: verify we are not reloading bert every time + ''' estimator.train( input_fn=train_input_fn, hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? 
steps=eval_frequency_steps) + ''' if master_process: tf.logging.info("***** Running eval *****") From a973d8d51ccb2fa0d9895b0b2169eecb3bd108bd Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 15 Jun 2019 01:03:24 +0000 Subject: [PATCH 59/77] steps = None --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index ca2a49e7b..84e430e0b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -433,7 +433,7 @@ def main(_): if master_process: tf.logging.info("***** Running eval *****") - result = estimator.evaluate(input_fn=eval_input_fn) + result = estimator.evaluate(input_fn=eval_input_fn, steps=None) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) From 834adecb8c4a92622d2ce2ace04bec1ce89bbdc3 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sun, 16 Jun 2019 07:52:57 +0000 Subject: [PATCH 60/77] update_op as second --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 84e430e0b..981bc441f 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -266,8 +266,13 @@ def tpu_scaffold(): elif mode == tf.estimator.ModeKeys.EVAL: #logits.update({'labels': labels}) def metric_fn(logits, labels): + + def get_update_op(_metric_fn, logits, labels): + update_op, _ = _metric_fn(logits, labels) + return tf.constant(0.0), update_op + return { - name: call(logits, labels) + name: get_update_op(call, logits, labels) for name, call in problem.all_metrics_fns.items() if name in problem.eval_metrics()} From 63b00459706c1fd75a3162a527469601a701c404 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sun, 16 Jun 2019 08:55:34 +0000 Subject: [PATCH 61/77] put train back --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 981bc441f..ed3362f5c 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -424,17 +424,15 @@ def main(_): # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 # TODO: replace with ValidationMonitor and EarlyStoppingHook - for i in range(2): + for i in range(10): if master_process: tf.logging.info("***** Running training *****") # TODO: verify we are not reloading bert every time - ''' estimator.train( input_fn=train_input_fn, hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? 
steps=eval_frequency_steps) - ''' if master_process: tf.logging.info("***** Running eval *****") From 927ec70e7c43f6a15647c41dc6db8dfac5a9194e Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sun, 16 Jun 2019 10:30:01 +0000 Subject: [PATCH 62/77] only init on first loop --- .../LanguageModeling/BERT/finetune_BERT.py | 67 +++++++++++-------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index ed3362f5c..41cb851ba 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -171,6 +171,31 @@ def before_run(self, run_context): report_tensor_allocations_upon_oom=True)) +class InitBertHook(tf.train.SessionRunHook): + def __init__(self, initialize_bert, init_checkpoint, hvd = None): + self._initialize_bert = initialize_bert + self._init_checkpoint = init_checkpoint + self._hvd = hvd + + def begin(self): + if not self._initialize_bert: + return + + tvars = tf.trainable_variables() + initialized_variable_names = {} + if self._init_checkpoint and (self._hvd is None or self._hvd.rank() == 0): + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, self._init_checkpoint) + tf.train.init_from_checkpoint(self._init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) + + def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, use_one_hot_embeddings, hparams): """Creates a classification model.""" @@ -204,7 +229,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, return loss, top_out['logits'] -def model_fn_builder(bert_config, init_checkpoint, learning_rate, +def model_fn_builder(bert_config, learning_rate, num_train_steps, num_warmup_steps, use_tpu, use_one_hot_embeddings, hparams, problem, hvd=None, use_fp16=False): """Returns `model_fn` closure for TPUEstimator.""" @@ -229,29 +254,6 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument # for logging hook to pick up total_loss = tf.identity(total_loss, name='total_loss') - tvars = tf.trainable_variables() - initialized_variable_names = {} - scaffold_fn = None - if init_checkpoint and (hvd is None or hvd.rank() == 0): - (assignment_map, initialized_variable_names - ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) - if use_tpu: - - def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() - - scaffold_fn = tpu_scaffold - else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - - tf.logging.info("**** Trainable Variables ****") - for var in tvars: - init_string = "" - if var.name in initialized_variable_names: - init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) - output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: train_op = optimization.create_optimizer( @@ -262,7 +264,7 @@ def tpu_scaffold(): mode=mode, loss=total_loss, train_op=train_op, - scaffold_fn=scaffold_fn) + scaffold_fn=None) elif mode == tf.estimator.ModeKeys.EVAL: #logits.update({'labels': labels}) def 
metric_fn(logits, labels): @@ -280,12 +282,12 @@ def get_update_op(_metric_fn, logits, labels): mode=mode, loss=total_loss, eval_metrics=(metric_fn, [logits, labels]), - scaffold_fn=scaffold_fn) + scaffold_fn=None) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions={"probabilities": probabilities}, - scaffold_fn=scaffold_fn) + scaffold_fn=None) return output_spec return model_fn @@ -357,6 +359,7 @@ def main(_): model_dir=FLAGS.output_dir, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, + # so we only use our hook log_step_count_steps=100000000000, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, @@ -389,7 +392,6 @@ def main(_): model_fn = model_fn_builder( bert_config=bert_config, - init_checkpoint=FLAGS.init_checkpoint, learning_rate=learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, @@ -425,9 +427,16 @@ def main(_): # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 # TODO: replace with ValidationMonitor and EarlyStoppingHook for i in range(10): + init_bert_hook = InitBertHook( + initialize_bert=(i == 0), + init_checkpoint=FLAGS.init_checkpoint, + hvd=hvd) + if master_process: tf.logging.info("***** Running training *****") - # TODO: verify we are not reloading bert every time + # TODO: move init from checkpoint to a InitHook + # should restore parts of the graph on the begin call but only + # on first loop estimator.train( input_fn=train_input_fn, hooks=training_hooks, From 1e3fed12dded1652c3e7b291ccf1f475cd145234 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sun, 16 Jun 2019 10:30:50 +0000 Subject: [PATCH 63/77] only init on first loop --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 41cb851ba..553bddfb5 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -427,6 +427,7 @@ def main(_): # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 # TODO: replace with ValidationMonitor and EarlyStoppingHook for i in range(10): + # TODO: we should use a check on model_dir to decide if we initialize_bert init_bert_hook = InitBertHook( initialize_bert=(i == 0), init_checkpoint=FLAGS.init_checkpoint, From ea2434c51f68cda7709028c0a01cc199b62a94b4 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Tue, 18 Jun 2019 02:22:37 +0000 Subject: [PATCH 64/77] just eval --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 553bddfb5..51468a9d5 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -266,7 +266,8 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument train_op=train_op, scaffold_fn=None) elif mode == tf.estimator.ModeKeys.EVAL: - #logits.update({'labels': labels}) + + print('logits', logits, labels) def metric_fn(logits, labels): def get_update_op(_metric_fn, logits, labels): @@ -426,7 +427,9 @@ def main(_): # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 # TODO: replace with ValidationMonitor and EarlyStoppingHook - for i in range(10): + #for i in range(10): + for i in [0]: + ''' # TODO: 
we should use a check on model_dir to decide if we initialize_bert init_bert_hook = InitBertHook( initialize_bert=(i == 0), @@ -443,7 +446,7 @@ def main(_): hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? steps=eval_frequency_steps) - + ''' if master_process: tf.logging.info("***** Running eval *****") result = estimator.evaluate(input_fn=eval_input_fn, steps=None) From 26fdb5b629abf36a7fdcbca71974bfde15235c88 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Wed, 19 Jun 2019 19:01:08 +0000 Subject: [PATCH 65/77] clean up --- .../LanguageModeling/BERT/finetune_BERT.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 51468a9d5..406d19d3a 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -13,6 +13,9 @@ import horovod.tensorflow as hvd import time +from fathomtf.utils.tfutils import debug_tfprint + + flags = tf.flags FLAGS = flags.FLAGS @@ -267,22 +270,23 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument scaffold_fn=None) elif mode == tf.estimator.ModeKeys.EVAL: - print('logits', logits, labels) - def metric_fn(logits, labels): + #logits = debug_tfprint('logits', logits) + #labels = debug_tfprint('label_ids', label_ids) + def metric_fn(_logits, _labels): - def get_update_op(_metric_fn, logits, labels): - update_op, _ = _metric_fn(logits, labels) + def get_update_op(_metric_fn, _logits, _labels): + update_op, _ = _metric_fn(_logits, _labels) return tf.constant(0.0), update_op return { - name: get_update_op(call, logits, labels) + name: get_update_op(call, _logits, _labels) for name, call in problem.all_metrics_fns.items() if name in problem.eval_metrics()} output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, - eval_metrics=(metric_fn, [logits, labels]), + eval_metrics=(metric_fn, [logits, label_ids]), scaffold_fn=None) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( @@ -412,6 +416,7 @@ def main(_): config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, + #eval_batch_size=1, predict_batch_size=FLAGS.predict_batch_size) train_input_fn = problem.make_estimator_input_fn( @@ -427,9 +432,8 @@ def main(_): # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 # TODO: replace with ValidationMonitor and EarlyStoppingHook - #for i in range(10): - for i in [0]: - ''' + for i in range(10): + #for i in [0]: # TODO: we should use a check on model_dir to decide if we initialize_bert init_bert_hook = InitBertHook( initialize_bert=(i == 0), @@ -446,10 +450,10 @@ def main(_): hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? 
steps=eval_frequency_steps) - ''' if master_process: tf.logging.info("***** Running eval *****") result = estimator.evaluate(input_fn=eval_input_fn, steps=None) + #result = estimator.evaluate(input_fn=eval_input_fn, steps=1) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) From 3710f7731c177cae739972b1f030ee3fd6e6fd90 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 20 Jun 2019 00:47:46 +0000 Subject: [PATCH 66/77] check previous checkpoints --- .../LanguageModeling/BERT/finetune_BERT.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 406d19d3a..fa647ea67 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -271,15 +271,11 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument elif mode == tf.estimator.ModeKeys.EVAL: #logits = debug_tfprint('logits', logits) - #labels = debug_tfprint('label_ids', label_ids) + #label_ids = debug_tfprint('label_ids', label_ids) def metric_fn(_logits, _labels): - def get_update_op(_metric_fn, _logits, _labels): - update_op, _ = _metric_fn(_logits, _labels) - return tf.constant(0.0), update_op - return { - name: get_update_op(call, _logits, _labels) + name: call(_logits, _labels) for name, call in problem.all_metrics_fns.items() if name in problem.eval_metrics()} @@ -434,9 +430,21 @@ def main(_): # TODO: replace with ValidationMonitor and EarlyStoppingHook for i in range(10): #for i in [0]: + from gcloud.gcs import fhfile + END_EXT = '.meta' + candidates = list(filter( + lambda path: path.startswith('model.ckpt'), + (os.path.basename(f) for f in fhfile.walk_path( + location=FLAGS.output_dir, + depth=1, + extension=END_EXT)))) + if candidates: + print('checkpoints exist', candidates) + print('do not initialize bert') + # TODO: we should use a check on model_dir to decide if we initialize_bert init_bert_hook = InitBertHook( - initialize_bert=(i == 0), + initialize_bert=not candidates, init_checkpoint=FLAGS.init_checkpoint, hvd=hvd) @@ -450,6 +458,7 @@ def main(_): hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? 
steps=eval_frequency_steps) + if master_process: tf.logging.info("***** Running eval *****") result = estimator.evaluate(input_fn=eval_input_fn, steps=None) From f63ca3d155132d13c499215209aa7343872af3ae Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 20 Jun 2019 01:37:36 +0000 Subject: [PATCH 67/77] sci --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index fa647ea67..e255fc59b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -151,17 +151,17 @@ def after_run(self, run_context, run_values): img_per_sec = self.global_batch_size / dt if self.hvd_rank >= 0: if FLAGS.use_fp16: - print('%2d :: %6i %11.1f %6.3f %6.4e %6.4e' % + print('%2d :: %6i %11.1f %6.4e %6.4e %6.4e' % (self.hvd_rank, print_step, img_per_sec, total_loss, lr, loss_scaler)) else: - print('%2d :: %6i %11.1f %6.3f %6.4e' % + print('%2d :: %6i %11.1f %6.4f %6.4e' % (self.hvd_rank, print_step, img_per_sec, total_loss, lr)) else: if FLAGS.use_fp16: - print('%6i %11.1f %6.3f %6.4e %6.4e' % + print('%6i %11.1f %6.4f %6.4e %6.4e' % (print_step, img_per_sec, total_loss, lr, loss_scaler)) else: - print('%6i %11.1f %6.3f %6.4e' % + print('%6i %11.1f %6.4f %6.4e' % (print_step, img_per_sec, total_loss, lr)) self.elapsed_secs = 0. self.count = 0 From 95bfcc206bb2d3bb2e5f0f5813d8a81c2d5f879f Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Mon, 24 Jun 2019 21:12:43 +0000 Subject: [PATCH 68/77] prints --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index e255fc59b..f2346d07d 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -191,12 +191,12 @@ def begin(self): ) = modeling.get_assignment_map_from_checkpoint(tvars, self._init_checkpoint) tf.train.init_from_checkpoint(self._init_checkpoint, assignment_map) - tf.logging.info("**** Trainable Variables ****") + print("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) + print(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, @@ -441,6 +441,8 @@ def main(_): if candidates: print('checkpoints exist', candidates) print('do not initialize bert') + else: + print('initialize bert') # TODO: we should use a check on model_dir to decide if we initialize_bert init_bert_hook = InitBertHook( From 2b7f19a6e6ea64144a89ad2ba9ae1c23c957f526 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Mon, 24 Jun 2019 21:22:20 +0000 Subject: [PATCH 69/77] scale LR --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index f2346d07d..edd1f4301 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -393,7 +393,7 @@ def main(_): model_fn = model_fn_builder( 
bert_config=bert_config, - learning_rate=learning_rate, + learning_rate=learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, From b66741c3826842927a14b41cf8e5d783f61253de Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Mon, 24 Jun 2019 21:32:01 +0000 Subject: [PATCH 70/77] append init bert hook each time --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index edd1f4301..27ea93422 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -452,12 +452,13 @@ def main(_): if master_process: tf.logging.info("***** Running training *****") + # TODO: move init from checkpoint to a InitHook # should restore parts of the graph on the begin call but only # on first loop estimator.train( input_fn=train_input_fn, - hooks=training_hooks, + hooks=training_hooks + [init_bert_hook], # TODO: LR dependent on train steps, are we resetting this every time then? steps=eval_frequency_steps) From d008553e6510ece7f46266f2d08a5815c7168da2 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 28 Jun 2019 18:23:35 +0000 Subject: [PATCH 71/77] batching --- TensorFlow/LanguageModeling/BERT/modeling.py | 48 +++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/modeling.py b/TensorFlow/LanguageModeling/BERT/modeling.py index 828a93872..f56b2bd7c 100644 --- a/TensorFlow/LanguageModeling/BERT/modeling.py +++ b/TensorFlow/LanguageModeling/BERT/modeling.py @@ -166,6 +166,9 @@ def __init__(self, batch_size = input_shape[0] seq_length = input_shape[1] + #from fathomtf.utils.tfutils import debug_tfprint + #input_ids = debug_tfprint('input ids before embedding', input_ids, tf.shape) + if input_mask is None: input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) @@ -197,12 +200,40 @@ def __init__(self, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob) + # start chunk + chunk_size = 64 + + # [B, T, D] + #self.embedding_output = debug_tfprint('embedding output', self.embedding_output, tf.shape) + depth = config.hidden_size + + batch_multiplier = seq_length // chunk_size + new_batch_size = batch_size * batch_multiplier + + # [B * T/chunk_size, chunk_size, D] + self.embedding_output = tf.reshape(self.embedding_output, [new_batch_size, chunk_size, depth]) + #self.embedding_output = debug_tfprint('transformed embedding output', self.embedding_output, tf.shape) + + # [B, T] + #input_mask = debug_tfprint('input mask before', input_mask, tf.shape) + #token_type_ids = debug_tfprint('token type ids before', token_type_ids, tf.shape) + # [B * T/chunk_size, chunk_size] + input_mask = tf.reshape(input_mask, [new_batch_size, chunk_size]) + token_type_ids = tf.reshape(token_type_ids, [new_batch_size, chunk_size]) + + #input_mask = debug_tfprint('input mask after', input_mask, tf.shape) + #token_type_ids = debug_tfprint('token type ids after', token_type_ids, tf.shape) + # end chunk + with tf.variable_scope("encoder"): # This converts a 2D mask of shape [batch_size, seq_length] to a 3D # mask of shape [batch_size, seq_length, seq_length] which is used # for the attention scores. 
+ # [B * T/chunk_size, chunk_size, D], [B * T/chunk_size, chunk_size] + # [B * T/chunk_size, chunk_size, chunk_size] attention_mask = create_attention_mask_from_input_mask( - input_ids, input_mask) + #input_ids, input_mask) + self.embedding_output, input_mask) # Run the stacked transformer. # `sequence_output` shape = [batch_size, seq_length, hidden_size]. @@ -220,6 +251,18 @@ def __init__(self, do_return_all_layers=True) self.sequence_output = tf.cast(self.all_encoder_layers[-1], tf.float32) + + # start chunk + # [B * T/chunk_size, chunk_size, D] + #self.sequence_output = debug_tfprint('sequence output', self.sequence_output, tf.shape) + # [B, T/chunk_size, chunk_size, D] + self.sequence_output = tf.reshape( + self.sequence_output, [batch_size, batch_multiplier, chunk_size, depth]) + # [B, T/chunk_size, D] + #self.sequence_output = self.sequence_output[:, :, 0, :] + #self.sequence_output = debug_tfprint('sequence output final', self.sequence_output, tf.shape) + # end chunk + # The "pooler" converts the encoded sequence tensor of shape # [batch_size, seq_length, hidden_size] to a tensor of shape # [batch_size, hidden_size]. This is necessary for segment-level @@ -542,6 +585,9 @@ def create_attention_mask_from_input_mask(from_tensor, to_mask): to_shape = get_shape_list(to_mask, expected_rank=2) to_seq_length = to_shape[1] + #from fathomtf.utils.tfutils import debug_tfprint + #to_mask = debug_tfprint('to_mask', to_mask, tf.shape) + to_mask = tf.cast( tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) From 2a4f869c5986084119ec2172ff1818832a7f1820 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 28 Jun 2019 18:44:17 +0000 Subject: [PATCH 72/77] typo --- TensorFlow/LanguageModeling/BERT/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/modeling.py b/TensorFlow/LanguageModeling/BERT/modeling.py index f56b2bd7c..15a10d707 100644 --- a/TensorFlow/LanguageModeling/BERT/modeling.py +++ b/TensorFlow/LanguageModeling/BERT/modeling.py @@ -259,7 +259,7 @@ def __init__(self, self.sequence_output = tf.reshape( self.sequence_output, [batch_size, batch_multiplier, chunk_size, depth]) # [B, T/chunk_size, D] - #self.sequence_output = self.sequence_output[:, :, 0, :] + self.sequence_output = self.sequence_output[:, :, 0, :] #self.sequence_output = debug_tfprint('sequence output final', self.sequence_output, tf.shape) # end chunk From 78822ce8c354033b9ec28bc45e635b38e79456e3 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 28 Jun 2019 19:03:04 +0000 Subject: [PATCH 73/77] clean --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 27ea93422..9a44aad8d 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -216,12 +216,6 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, # [B, 384, D] body_outputs = model.get_sequence_output() - #extended_batch_size = tf.shape(body_outputs)[0] - #chunk_size = tf.shape(body_outputs)[1] - #depth = tf.shape(body_outputs)[2] - #batch_size = extended_batch_size / chunk_size - - #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) body_outputs = tf.expand_dims(body_outputs, axis=-2) top_out = target_modality.top(body_outputs, None) From da084d9e2e94e6a10818bfa4ea442bee08d96e38 Mon Sep 17 00:00:00 2001 From: 
rllin-fathom Date: Fri, 28 Jun 2019 21:53:54 +0000 Subject: [PATCH 74/77] chunk before embeddings --- TensorFlow/LanguageModeling/BERT/modeling.py | 44 ++++++++------------ 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/modeling.py b/TensorFlow/LanguageModeling/BERT/modeling.py index 15a10d707..d580b7e44 100644 --- a/TensorFlow/LanguageModeling/BERT/modeling.py +++ b/TensorFlow/LanguageModeling/BERT/modeling.py @@ -166,8 +166,23 @@ def __init__(self, batch_size = input_shape[0] seq_length = input_shape[1] - #from fathomtf.utils.tfutils import debug_tfprint - #input_ids = debug_tfprint('input ids before embedding', input_ids, tf.shape) + from fathomtf.utils.tfutils import debug_tfprint + # [B, T] + input_ids = debug_tfprint('input ids before embedding', input_ids, tf.shape) + + # start chunk + chunk_size = 64 + + depth = config.hidden_size + batch_multiplier = seq_length // chunk_size + new_batch_size = batch_size * batch_multiplier + + # [B * T/chunk_size, chunk_size] + input_ids = tf.reshape(input_ids, [new_batch_size, chunk_size]) + input_mask = tf.reshape(input_mask, [new_batch_size, chunk_size]) + token_type_ids = tf.reshape(token_type_ids, [new_batch_size, chunk_size]) + + # end chunk if input_mask is None: input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) @@ -200,31 +215,6 @@ def __init__(self, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob) - # start chunk - chunk_size = 64 - - # [B, T, D] - #self.embedding_output = debug_tfprint('embedding output', self.embedding_output, tf.shape) - depth = config.hidden_size - - batch_multiplier = seq_length // chunk_size - new_batch_size = batch_size * batch_multiplier - - # [B * T/chunk_size, chunk_size, D] - self.embedding_output = tf.reshape(self.embedding_output, [new_batch_size, chunk_size, depth]) - #self.embedding_output = debug_tfprint('transformed embedding output', self.embedding_output, tf.shape) - - # [B, T] - #input_mask = debug_tfprint('input mask before', input_mask, tf.shape) - #token_type_ids = debug_tfprint('token type ids before', token_type_ids, tf.shape) - # [B * T/chunk_size, chunk_size] - input_mask = tf.reshape(input_mask, [new_batch_size, chunk_size]) - token_type_ids = tf.reshape(token_type_ids, [new_batch_size, chunk_size]) - - #input_mask = debug_tfprint('input mask after', input_mask, tf.shape) - #token_type_ids = debug_tfprint('token type ids after', token_type_ids, tf.shape) - # end chunk - with tf.variable_scope("encoder"): # This converts a 2D mask of shape [batch_size, seq_length] to a 3D # mask of shape [batch_size, seq_length, seq_length] which is used From 2fee2eaa892960d1fc6e95f1f9335b3bfffabbe4 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 28 Jun 2019 21:57:00 +0000 Subject: [PATCH 75/77] no print --- TensorFlow/LanguageModeling/BERT/modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/modeling.py b/TensorFlow/LanguageModeling/BERT/modeling.py index d580b7e44..7345aaee6 100644 --- a/TensorFlow/LanguageModeling/BERT/modeling.py +++ b/TensorFlow/LanguageModeling/BERT/modeling.py @@ -166,9 +166,9 @@ def __init__(self, batch_size = input_shape[0] seq_length = input_shape[1] - from fathomtf.utils.tfutils import debug_tfprint + #from fathomtf.utils.tfutils import debug_tfprint # [B, T] - input_ids = debug_tfprint('input ids before embedding', input_ids, tf.shape) + #input_ids = debug_tfprint('input ids before embedding', 
input_ids, tf.shape) # start chunk chunk_size = 64 From de23b8adc2d36118cc9359965c101991347e1411 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 28 Jun 2019 22:47:01 +0000 Subject: [PATCH 76/77] do not expand --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 9a44aad8d..4373a444a 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -214,9 +214,8 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float32) - # [B, 384, D] - body_outputs = model.get_sequence_output() - body_outputs = tf.expand_dims(body_outputs, axis=-2) + # [B, T/chunk_size, D] + body_output = model.get_sequence_output() top_out = target_modality.top(body_outputs, None) From a4ae57c73c75e3c435521a9d8b2a2169ad531aff Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 28 Jun 2019 22:48:19 +0000 Subject: [PATCH 77/77] typo --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 4373a444a..8b2991016 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -217,7 +217,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, # [B, T/chunk_size, D] body_output = model.get_sequence_output() - top_out = target_modality.top(body_outputs, None) + top_out = target_modality.top(body_output, None) num, den = target_modality.loss(top_out, labels) loss = num / den
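
Note on the chunking scheme introduced in PATCH 71 and reworked in PATCH 74-77: rather than running self-attention over the full sequence of length T, each example is split into T/chunk_size chunks that are folded into the batch dimension before the embedding lookup, and the encoder output is folded back afterwards, keeping only the first position of every chunk. The sketch below is a minimal illustration of that reshape bookkeeping only, using assumed toy sizes and numpy arrays in place of the TF tensors; it is not the encoder itself.

    import numpy as np

    # Toy sizes for illustration; the patches set chunk_size = 64 and
    # depth = config.hidden_size, with a 384-token sequence in finetune_BERT.py.
    batch_size, seq_length, chunk_size, depth = 2, 384, 64, 8
    batch_multiplier = seq_length // chunk_size      # T / chunk_size
    new_batch_size = batch_size * batch_multiplier   # B * T/chunk_size

    # [B, T] token ids -> [B * T/chunk_size, chunk_size] before the embedding
    # lookup, as in PATCH 74 (input_mask and token_type_ids are reshaped the same way).
    input_ids = np.zeros((batch_size, seq_length), dtype=np.int32)
    chunked_ids = input_ids.reshape(new_batch_size, chunk_size)

    # The encoder treats each chunk as an independent row and emits
    # [B * T/chunk_size, chunk_size, D]; a zero array stands in for that output here.
    sequence_output = np.zeros((new_batch_size, chunk_size, depth), dtype=np.float32)

    # Fold back to [B, T/chunk_size, chunk_size, D] and keep position 0 of each
    # chunk, as in PATCH 71/72, giving one vector per chunk: [B, T/chunk_size, D].
    folded = sequence_output.reshape(batch_size, batch_multiplier, chunk_size, depth)
    per_chunk = folded[:, :, 0, :]

    print(chunked_ids.shape, per_chunk.shape)        # (12, 64) (2, 6, 8)

Because the chunks live in the batch dimension, attention is computed only within each chunk_size-token window, and the downstream target_modality.top in finetune_BERT.py receives one vector per chunk rather than one per token, which lines up with PATCH 76 dropping the expand_dims on the encoder output.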