From 875afb748c097a777c61c3f3e92007c9cfdbaca3 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 16 May 2019 18:29:16 +0000 Subject: [PATCH 01/77] fhfile --- TensorFlow/LanguageModeling/BERT/run_pretraining.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/run_pretraining.py b/TensorFlow/LanguageModeling/BERT/run_pretraining.py index 3da2c5506..f4742901b 100644 --- a/TensorFlow/LanguageModeling/BERT/run_pretraining.py +++ b/TensorFlow/LanguageModeling/BERT/run_pretraining.py @@ -491,8 +491,12 @@ def main(_): tf.gfile.MakeDirs(FLAGS.output_dir) input_files = [] - for input_pattern in FLAGS.input_file.split(","): - input_files.extend(tf.gfile.Glob(input_pattern)) + from gcloud.gcs import fhfile + if fhfile.IsDirectory(FLAGS.input_file): + input_files = list(fhfile.walk_path(FLAGS.input_file)) + else: + for input_pattern in FLAGS.input_file.split(","): + input_files.extend(tf.gfile.Glob(input_pattern)) tf.logging.info("*** Input Files ***") for input_file in input_files: From ad614a383224e1d9c036dad0d3ee3bc8af42fb16 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 16 May 2019 19:47:18 +0000 Subject: [PATCH 02/77] import walk path --- .../LanguageModeling/BERT/run_pretraining.py | 36 +++++++++++++++++-- 1 file changed, 33 insertions(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/run_pretraining.py b/TensorFlow/LanguageModeling/BERT/run_pretraining.py index f4742901b..bae0d7ce9 100644 --- a/TensorFlow/LanguageModeling/BERT/run_pretraining.py +++ b/TensorFlow/LanguageModeling/BERT/run_pretraining.py @@ -490,10 +490,40 @@ def main(_): tf.gfile.MakeDirs(FLAGS.output_dir) + def walk_path(location: str, + only_dir: bool = False, + depth: int = None, + extension: str = None): + """Walks through specified remote or local directory. + + Args: + location: local or remote directory to start walk. + only_dir: if True, only directories are yielded, + else only files. + depth: number of subdirectories to recursively walk through. + if unspecified, walk through all subdirectories. + extension: if specified, only files the end with this + extension are returned. + Yields: + local or remote path. 
+ + """ + for level, (root, dirs, file_names) in enumerate( + tf.gfile.Walk(top=location)): + if only_dir: + for dir_name in dirs: + yield os.path.join(root, dir_name) + else: + for file_name in file_names: + if extension and not file_name.endswith(extension): + continue + yield os.path.join(root, file_name) + if depth is not None and depth == level: + return + input_files = [] - from gcloud.gcs import fhfile - if fhfile.IsDirectory(FLAGS.input_file): - input_files = list(fhfile.walk_path(FLAGS.input_file)) + if tf.gfile.Exists(FLAGS.input_file): + input_files = list(walk_path(FLAGS.input_file)) else: for input_pattern in FLAGS.input_file.split(","): input_files.extend(tf.gfile.Glob(input_pattern)) From 1b61541f14b455b9eacddfbaefc381035042edaf Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 23 May 2019 18:47:24 +0000 Subject: [PATCH 03/77] turn on amp if fp16 --- TensorFlow/LanguageModeling/BERT/run_squad.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index ff0f7b940..2f56fceeb 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -158,6 +158,10 @@ flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.") flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") +if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') + # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): def __init__(self, global_batch_size, hvd_rank=-1): From bef3ff6d6a779223c4758879e2e476344981205f Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 01:31:26 +0000 Subject: [PATCH 04/77] use wrapper instead of flag --- TensorFlow/LanguageModeling/BERT/optimization.py | 8 +++++--- TensorFlow/LanguageModeling/BERT/run_squad.py | 6 +++--- 2 files changed, 8 insertions(+), 6 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py index a1b912a8e..728e72378 100644 --- a/TensorFlow/LanguageModeling/BERT/optimization.py +++ b/TensorFlow/LanguageModeling/BERT/optimization.py @@ -75,9 +75,11 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, if hvd is not None: from horovod.tensorflow.compression import Compression optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none) - if use_fp16 or amp: - loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) - optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + #if use_fp16 or amp: + if use_fp16: + #loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) + #optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') tvars = tf.trainable_variables() grads_and_vars = optimizer.compute_gradients(loss, tvars) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index 2f56fceeb..a996b3b63 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ 
b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -158,9 +158,9 @@ flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.") flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") -if FLAGS.use_fp16: - os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - print('Turning on AMP') +#if FLAGS.use_fp16: + #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + #print('Turning on AMP') # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): From fe58436b9a865280ac8339d4818f6888129c646c Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 02:46:38 +0000 Subject: [PATCH 05/77] switch back to nvidia --- TensorFlow/LanguageModeling/BERT/optimization.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py index 728e72378..8b4864aa5 100644 --- a/TensorFlow/LanguageModeling/BERT/optimization.py +++ b/TensorFlow/LanguageModeling/BERT/optimization.py @@ -75,11 +75,11 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, if hvd is not None: from horovod.tensorflow.compression import Compression optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none) - #if use_fp16 or amp: - if use_fp16: - #loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) - #optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) - optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') + if use_fp16 or amp: + #if use_fp16: + loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) + optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + #optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') tvars = tf.trainable_variables() grads_and_vars = optimizer.compute_gradients(loss, tvars) From 0e148185dac98a7805535f959ceb5eed275a3209 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 02:47:04 +0000 Subject: [PATCH 06/77] switch back to nvidia amp --- TensorFlow/LanguageModeling/BERT/run_squad.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index a996b3b63..2f56fceeb 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -158,9 +158,9 @@ flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.") flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") -#if FLAGS.use_fp16: - #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - #print('Turning on AMP') +if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): From ecb43379b42f542e5b039ad9658a1682e75f427a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 18:42:37 +0000 Subject: [PATCH 07/77] tf amp --- 
TensorFlow/LanguageModeling/BERT/optimization.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py index 8b4864aa5..728e72378 100644 --- a/TensorFlow/LanguageModeling/BERT/optimization.py +++ b/TensorFlow/LanguageModeling/BERT/optimization.py @@ -75,11 +75,11 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, if hvd is not None: from horovod.tensorflow.compression import Compression optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none) - if use_fp16 or amp: - #if use_fp16: - loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) - optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) - #optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') + #if use_fp16 or amp: + if use_fp16: + #loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) + #optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') tvars = tf.trainable_variables() grads_and_vars = optimizer.compute_gradients(loss, tvars) From 83fc718479fc5975b3279ff82c6dc895f78db28e Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 18:53:21 +0000 Subject: [PATCH 08/77] turn amp off --- TensorFlow/LanguageModeling/BERT/run_squad.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index 2f56fceeb..a996b3b63 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -158,9 +158,9 @@ flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.") flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") -if FLAGS.use_fp16: - os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - print('Turning on AMP') +#if FLAGS.use_fp16: + #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + #print('Turning on AMP') # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): From 94820a16fa6a3ee0c58d33472a8e3e382358463b Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 19:29:16 +0000 Subject: [PATCH 09/77] print --- TensorFlow/LanguageModeling/BERT/optimization.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py index 728e72378..e212c8560 100644 --- a/TensorFlow/LanguageModeling/BERT/optimization.py +++ b/TensorFlow/LanguageModeling/BERT/optimization.py @@ -79,6 +79,7 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, if use_fp16: #loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) #optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + print('wrapping with enable mixed precision graph rewrite') 
optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') tvars = tf.trainable_variables() From 9ccaed7c91f3dd4d517628570894a40865387ec8 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 19:45:12 +0000 Subject: [PATCH 10/77] or amp --- TensorFlow/LanguageModeling/BERT/optimization.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py index e212c8560..6371f5ca8 100644 --- a/TensorFlow/LanguageModeling/BERT/optimization.py +++ b/TensorFlow/LanguageModeling/BERT/optimization.py @@ -75,8 +75,8 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, if hvd is not None: from horovod.tensorflow.compression import Compression optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none) - #if use_fp16 or amp: - if use_fp16: + if use_fp16 or amp: + #if use_fp16: #loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) #optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) print('wrapping with enable mixed precision graph rewrite') From 651f03a7e40ddcef18c1ea0e37a3245b952da15a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 24 May 2019 23:55:03 +0000 Subject: [PATCH 11/77] turn on TF_XLA_FLAGS=--tf_xla_cpu_global_jit --- TensorFlow/LanguageModeling/BERT/run_squad.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index a996b3b63..ebef9a57a 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -162,6 +162,10 @@ #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" #print('Turning on AMP') +if FLAGS.horovod: + print('Turning on TF_XLA_FLAGS=--tf_xla_cpu_global_jit') + os.environ["TF_XLA_FLAGS"] = "--tf_xla_cpu_global_jit" + # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): def __init__(self, global_batch_size, hvd_rank=-1): From 77055481683b9d5344ae55cf3473bc62ba31dd1b Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 30 May 2019 01:39:31 +0000 Subject: [PATCH 12/77] use built in amp --- TensorFlow/LanguageModeling/BERT/optimization.py | 8 ++++---- TensorFlow/LanguageModeling/BERT/run_squad.py | 8 +++++--- 2 files changed, 9 insertions(+), 7 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/optimization.py b/TensorFlow/LanguageModeling/BERT/optimization.py index 6371f5ca8..bfa7b0dd0 100644 --- a/TensorFlow/LanguageModeling/BERT/optimization.py +++ b/TensorFlow/LanguageModeling/BERT/optimization.py @@ -77,10 +77,10 @@ def create_optimizer(loss, init_lr, num_train_steps, num_warmup_steps, use_tpu, optimizer = hvd.DistributedOptimizer(optimizer, sparse_as_dense=True, compression=Compression.none) if use_fp16 or amp: #if use_fp16: - #loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) - #optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) - print('wrapping with enable mixed precision graph rewrite') - optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') + 
loss_scale_manager = tf.contrib.mixed_precision.ExponentialUpdateLossScaleManager(init_loss_scale=2**32, incr_every_n_steps=1000, decr_every_n_nan_or_inf=2, decr_ratio=0.5) + optimizer = tf.contrib.mixed_precision.LossScaleOptimizer(optimizer, loss_scale_manager) + #print('wrapping with enable mixed precision graph rewrite') + #optimizer = tf.train.experimental.enable_mixed_precision_graph_rewrite(optimizer, loss_scale='dynamic') tvars = tf.trainable_variables() grads_and_vars = optimizer.compute_gradients(loss, tvars) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index ebef9a57a..ba45d923f 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -158,13 +158,15 @@ flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.") flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") -#if FLAGS.use_fp16: - #os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - #print('Turning on AMP') +if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') +''' if FLAGS.horovod: print('Turning on TF_XLA_FLAGS=--tf_xla_cpu_global_jit') os.environ["TF_XLA_FLAGS"] = "--tf_xla_cpu_global_jit" +''' # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): From 8b36ab87cb3a70ec458dbbd4d1698dab8497a30b Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Tue, 4 Jun 2019 03:04:49 +0000 Subject: [PATCH 13/77] fine tune run file --- .../LanguageModeling/BERT/finetune_BERT.py | 1014 +++++++++++++++++ 1 file changed, 1014 insertions(+) create mode 100644 TensorFlow/LanguageModeling/BERT/finetune_BERT.py diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py new file mode 100644 index 000000000..0714d6a9b --- /dev/null +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -0,0 +1,1014 @@ +# coding=utf-8 +# Copyright 2018 The Google AI Language Team Authors. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""BERT finetuning runner.""" + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import collections +import csv +import os +import modeling +import optimization +import tokenization +import tensorflow as tf + +flags = tf.flags + +FLAGS = flags.FLAGS + +## Required parameters +flags.DEFINE_string( + "data_dir", None, + "The input data dir. Should contain the .tsv files (or other data files) " + "for the task.") + +flags.DEFINE_string( + "bert_config_file", None, + "The config json file corresponding to the pre-trained BERT model. 
" + "This specifies the model architecture.") + +flags.DEFINE_string("task_name", None, "The name of the task to train.") + +flags.DEFINE_string("vocab_file", None, + "The vocabulary file that the BERT model was trained on.") + +flags.DEFINE_string( + "output_dir", None, + "The output directory where the model checkpoints will be written.") + +## Other parameters + +flags.DEFINE_string( + "init_checkpoint", None, + "Initial checkpoint (usually from a pre-trained BERT model).") + +flags.DEFINE_bool( + "do_lower_case", True, + "Whether to lower case the input text. Should be True for uncased " + "models and False for cased models.") + +flags.DEFINE_integer( + "max_seq_length", 128, + "The maximum total input sequence length after WordPiece tokenization. " + "Sequences longer than this will be truncated, and sequences shorter " + "than this will be padded.") + +flags.DEFINE_bool("do_train", False, "Whether to run training.") + +flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") + +flags.DEFINE_bool( + "do_predict", False, + "Whether to run the model in inference mode on the test set.") + +flags.DEFINE_integer("train_batch_size", 32, "Total batch size for training.") + +flags.DEFINE_integer("eval_batch_size", 8, "Total batch size for eval.") + +flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") + +flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") + +flags.DEFINE_float("num_train_epochs", 3.0, + "Total number of training epochs to perform.") + +flags.DEFINE_float( + "warmup_proportion", 0.1, + "Proportion of training to perform linear learning rate warmup for. " + "E.g., 0.1 = 10% of training.") + +flags.DEFINE_integer("save_checkpoints_steps", 1000, + "How often to save the model checkpoint.") + +flags.DEFINE_integer("iterations_per_loop", 1000, + "How many steps to make in each estimator call.") + +flags.DEFINE_bool("use_tpu", False, "Whether to use TPU or GPU/CPU.") + +tf.flags.DEFINE_string( + "tpu_name", None, + "The Cloud TPU to use for training. This should be either the name " + "used when creating the Cloud TPU, or a grpc://ip.address.of.tpu:8470 " + "url.") + +tf.flags.DEFINE_string( + "tpu_zone", None, + "[Optional] GCE zone where the Cloud TPU is located in. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string( + "gcp_project", None, + "[Optional] Project name for the Cloud TPU-enabled project. If not " + "specified, we will attempt to automatically detect the GCE project from " + "metadata.") + +tf.flags.DEFINE_string("master", None, "[Optional] TensorFlow master URL.") + +flags.DEFINE_integer( + "num_tpu_cores", 8, + "Only used if `use_tpu` is True. Total number of TPU cores to use.") + +flags.DEFINE_bool("use_fp16", False, "Whether to use fp32 or fp16 arithmetic on GPU.") + +flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") + + +class InputExample(object): + """A single training/test example for simple sequence classification.""" + + def __init__(self, guid, text_a, text_b=None, label=None): + """Constructs a InputExample. + + Args: + guid: Unique id for the example. + text_a: string. The untokenized text of the first sequence. For single + sequence tasks, only this sequence must be specified. + text_b: (Optional) string. The untokenized text of the second sequence. + Only must be specified for sequence pair tasks. + label: (Optional) string. The label of the example. 
This should be + specified for train and dev examples, but not for test examples. + """ + self.guid = guid + self.text_a = text_a + self.text_b = text_b + self.label = label + + +class PaddingInputExample(object): + """Fake example so the num input examples is a multiple of the batch size. + + When running eval/predict on the TPU, we need to pad the number of examples + to be a multiple of the batch size, because the TPU requires a fixed batch + size. The alternative is to drop the last batch, which is bad because it means + the entire output data won't be generated. + + We use this class instead of `None` because treating `None` as padding + battches could cause silent errors. + """ + + +class InputFeatures(object): + """A single set of features of data.""" + + def __init__(self, + input_ids, + input_mask, + segment_ids, + label_id, + is_real_example=True): + self.input_ids = input_ids + self.input_mask = input_mask + self.segment_ids = segment_ids + self.label_id = label_id + self.is_real_example = is_real_example + + +class DataProcessor(object): + """Base class for data converters for sequence classification data sets.""" + + def get_train_examples(self, data_dir): + """Gets a collection of `InputExample`s for the train set.""" + raise NotImplementedError() + + def get_dev_examples(self, data_dir): + """Gets a collection of `InputExample`s for the dev set.""" + raise NotImplementedError() + + def get_test_examples(self, data_dir): + """Gets a collection of `InputExample`s for prediction.""" + raise NotImplementedError() + + def get_labels(self): + """Gets the list of labels for this data set.""" + raise NotImplementedError() + + @classmethod + def _read_tsv(cls, input_file, quotechar=None): + """Reads a tab separated value file.""" + with tf.gfile.Open(input_file, "r") as f: + reader = csv.reader(f, delimiter="\t", quotechar=quotechar) + lines = [] + for line in reader: + lines.append(line) + return lines + + +class XnliProcessor(DataProcessor): + """Processor for the XNLI data set.""" + + def __init__(self): + self.language = "zh" + + def get_train_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv( + os.path.join(data_dir, "multinli", + "multinli.train.%s.tsv" % self.language)) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "train-%d" % (i) + text_a = tokenization.convert_to_unicode(line[0]) + text_b = tokenization.convert_to_unicode(line[1]) + label = tokenization.convert_to_unicode(line[2]) + if label == tokenization.convert_to_unicode("contradictory"): + label = tokenization.convert_to_unicode("contradiction") + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_dev_examples(self, data_dir): + """See base class.""" + lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "dev-%d" % (i) + language = tokenization.convert_to_unicode(line[0]) + if language != tokenization.convert_to_unicode(self.language): + continue + text_a = tokenization.convert_to_unicode(line[6]) + text_b = tokenization.convert_to_unicode(line[7]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + +class MnliProcessor(DataProcessor): + """Processor for the MultiNLI data set (GLUE 
version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), + "dev_matched") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["contradiction", "entailment", "neutral"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) + text_a = tokenization.convert_to_unicode(line[8]) + text_b = tokenization.convert_to_unicode(line[9]) + if set_type == "test": + label = "contradiction" + else: + label = tokenization.convert_to_unicode(line[-1]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class MrpcProcessor(DataProcessor): + """Processor for the MRPC data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + if i == 0: + continue + guid = "%s-%s" % (set_type, i) + text_a = tokenization.convert_to_unicode(line[3]) + text_b = tokenization.convert_to_unicode(line[4]) + if set_type == "test": + label = "0" + else: + label = tokenization.convert_to_unicode(line[0]) + examples.append( + InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) + return examples + + +class ColaProcessor(DataProcessor): + """Processor for the CoLA data set (GLUE version).""" + + def get_train_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") + + def get_dev_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") + + def get_test_examples(self, data_dir): + """See base class.""" + return self._create_examples( + self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") + + def get_labels(self): + """See base class.""" + return ["0", "1"] + + def _create_examples(self, lines, set_type): + """Creates examples for the training and dev sets.""" + examples = [] + for (i, line) in enumerate(lines): + # Only the test set has a header + if set_type == "test" and i == 0: + continue + guid = "%s-%s" % (set_type, i) + if set_type == "test": + text_a = tokenization.convert_to_unicode(line[1]) + label = "0" + else: + text_a = tokenization.convert_to_unicode(line[3]) + label = tokenization.convert_to_unicode(line[1]) + examples.append( + InputExample(guid=guid, 
text_a=text_a, text_b=None, label=label)) + return examples + + +class DummyProcessor(DataProcessor): + + def get_train_examples(self): + return self._create_examples(10, 'train') + + def get_dev_examples(self): + return self._create_examples(2, 'dev') + + def get_test_examples(self): + return self._create_examples(2, 'test') + + def get_labels(self): + return ["0", "1"] + + def _create_examples(self, num_lines, set_type): + examples = [] + for i in range(num_lines): + guid = "%s-%d" % (set_type, i) + examples.append(InputExample(guid=guid, text_a='dummy dummy, dummy', label='0')) + return examples + + +def convert_single_example(ex_index, example, label_list, max_seq_length, + tokenizer): + """Converts a single `InputExample` into a single `InputFeatures`.""" + + if isinstance(example, PaddingInputExample): + return InputFeatures( + input_ids=[0] * max_seq_length, + input_mask=[0] * max_seq_length, + segment_ids=[0] * max_seq_length, + label_id=0, + is_real_example=False) + + label_map = {} + for (i, label) in enumerate(label_list): + label_map[label] = i + + tokens_a = tokenizer.tokenize(example.text_a) + tokens_b = None + if example.text_b: + tokens_b = tokenizer.tokenize(example.text_b) + + if tokens_b: + # Modifies `tokens_a` and `tokens_b` in place so that the total + # length is less than the specified length. + # Account for [CLS], [SEP], [SEP] with "- 3" + _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) + else: + # Account for [CLS] and [SEP] with "- 2" + if len(tokens_a) > max_seq_length - 2: + tokens_a = tokens_a[0:(max_seq_length - 2)] + + # The convention in BERT is: + # (a) For sequence pairs: + # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] + # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 + # (b) For single sequences: + # tokens: [CLS] the dog is hairy . [SEP] + # type_ids: 0 0 0 0 0 0 0 + # + # Where "type_ids" are used to indicate whether this is the first + # sequence or the second sequence. The embedding vectors for `type=0` and + # `type=1` were learned during pre-training and are added to the wordpiece + # embedding vector (and position vector). This is not *strictly* necessary + # since the [SEP] token unambiguously separates the sequences, but it makes + # it easier for the model to learn the concept of sequences. + # + # For classification tasks, the first vector (corresponding to [CLS]) is + # used as the "sentence vector". Note that this only makes sense because + # the entire model is fine-tuned. + tokens = [] + segment_ids = [] + tokens.append("[CLS]") + segment_ids.append(0) + for token in tokens_a: + tokens.append(token) + segment_ids.append(0) + tokens.append("[SEP]") + segment_ids.append(0) + + if tokens_b: + for token in tokens_b: + tokens.append(token) + segment_ids.append(1) + tokens.append("[SEP]") + segment_ids.append(1) + + input_ids = tokenizer.convert_tokens_to_ids(tokens) + + # The mask has 1 for real tokens and 0 for padding tokens. Only real + # tokens are attended to. + input_mask = [1] * len(input_ids) + + # Zero-pad up to the sequence length. 
+ while len(input_ids) < max_seq_length: + input_ids.append(0) + input_mask.append(0) + segment_ids.append(0) + + assert len(input_ids) == max_seq_length + assert len(input_mask) == max_seq_length + assert len(segment_ids) == max_seq_length + + label_id = label_map[example.label] + if ex_index < 5: + tf.logging.info("*** Example ***") + tf.logging.info("guid: %s" % (example.guid)) + tf.logging.info("tokens: %s" % " ".join( + [tokenization.printable_text(x) for x in tokens])) + tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) + tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) + tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) + tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) + + feature = InputFeatures( + input_ids=input_ids, + input_mask=input_mask, + segment_ids=segment_ids, + label_id=label_id, + is_real_example=True) + return feature + + +def file_based_convert_examples_to_features( + examples, label_list, max_seq_length, tokenizer, output_file): + """Convert a set of `InputExample`s to a TFRecord file.""" + + writer = tf.python_io.TFRecordWriter(output_file) + + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + def create_int_feature(values): + f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) + return f + + features = collections.OrderedDict() + features["input_ids"] = create_int_feature(feature.input_ids) + features["input_mask"] = create_int_feature(feature.input_mask) + features["segment_ids"] = create_int_feature(feature.segment_ids) + features["label_ids"] = create_int_feature([feature.label_id]) + features["is_real_example"] = create_int_feature( + [int(feature.is_real_example)]) + + tf_example = tf.train.Example(features=tf.train.Features(feature=features)) + writer.write(tf_example.SerializeToString()) + writer.close() + + +def file_based_input_fn_builder(input_file, seq_length, is_training, + drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + name_to_features = { + "input_ids": tf.FixedLenFeature([seq_length], tf.int64), + "input_mask": tf.FixedLenFeature([seq_length], tf.int64), + "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), + "label_ids": tf.FixedLenFeature([], tf.int64), + "is_real_example": tf.FixedLenFeature([], tf.int64), + } + + def _decode_record(record, name_to_features): + """Decodes a record to a TensorFlow example.""" + example = tf.parse_single_example(record, name_to_features) + + # tf.Example only supports tf.int64, but the TPU only supports tf.int32. + # So cast all int64 to int32. + for name in list(example.keys()): + t = example[name] + if t.dtype == tf.int64: + t = tf.to_int32(t) + example[name] = t + + return example + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + # For training, we want a lot of parallel reading and shuffling. + # For eval, we want no shuffling and parallel reading doesn't matter. 
+ d = tf.data.TFRecordDataset(input_file) + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.apply( + tf.contrib.data.map_and_batch( + lambda record: _decode_record(record, name_to_features), + batch_size=batch_size, + drop_remainder=drop_remainder)) + + return d + + return input_fn + + +def _truncate_seq_pair(tokens_a, tokens_b, max_length): + """Truncates a sequence pair in place to the maximum length.""" + + # This is a simple heuristic which will always truncate the longer sequence + # one token at a time. This makes more sense than truncating an equal percent + # of tokens from each, since if one sequence is very short then each token + # that's truncated likely contains more information than a longer sequence. + while True: + total_length = len(tokens_a) + len(tokens_b) + if total_length <= max_length: + break + if len(tokens_a) > len(tokens_b): + tokens_a.pop() + else: + tokens_b.pop() + + +def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, + labels, num_labels, use_one_hot_embeddings): + """Creates a classification model.""" + model = modeling.BertModel( + config=bert_config, + is_training=is_training, + input_ids=input_ids, + input_mask=input_mask, + token_type_ids=segment_ids, + use_one_hot_embeddings=use_one_hot_embeddings, + compute_type=tf.float16 if FLAGS.use_fp16 else tf.float32) + + last_layer = model.get_sequence_output() + extended_batch_size = tf.shape(last_layer)[0] + chunk_size = tf.shape(last_layer)[1] + depth = tf.shape(last_layer)[2] + batch_size = extended_batch_size / chunk_size + + body_outputs = tf.reshape(last_layer, [batch_size, extended_batch_batch_size, depth]) + + return target_modality.top(body_outputs) + ''' + # In the demo, we are doing a simple classification task on the entire + # segment. + # + # If you want to use the token-level output, use model.get_sequence_output() + # instead. 
+ output_layer = model.get_pooled_output() + + hidden_size = output_layer.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) + + #logits = tf.matmul(output_layer, output_weights, transpose_b=True) + #logits = tf.nn.bias_add(logits, output_bias) + probabilities = tf.nn.softmax(logits, axis=-1) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + + return (loss, per_example_loss, logits, probabilities) + ''' + + +def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, + num_train_steps, num_warmup_steps, use_tpu, + use_one_hot_embeddings): + """Returns `model_fn` closure for TPUEstimator.""" + + def model_fn(features, labels, mode, params): # pylint: disable=unused-argument + """The `model_fn` for TPUEstimator.""" + + tf.logging.info("*** Features ***") + for name in sorted(features.keys()): + tf.logging.info(" name = %s, shape = %s" % (name, features[name].shape)) + + input_ids = features["input_ids"] + input_mask = features["input_mask"] + segment_ids = features["segment_ids"] + label_ids = features["label_ids"] + is_real_example = None + if "is_real_example" in features: + is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) + else: + is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + + is_training = (mode == tf.estimator.ModeKeys.TRAIN) + + #(total_loss, per_example_loss, logits, probabilities) = create_model( + #bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + #num_labels, use_one_hot_embeddings) + (total_loss, logits) = create_model( + bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, + num_labels, use_one_hot_embeddings) + + tvars = tf.trainable_variables() + initialized_variable_names = {} + scaffold_fn = None + if init_checkpoint: + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) + if use_tpu: + + def tpu_scaffold(): + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + return tf.train.Scaffold() + + scaffold_fn = tpu_scaffold + else: + tf.train.init_from_checkpoint(init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, + init_string) + + output_spec = None + if mode == tf.estimator.ModeKeys.TRAIN: + + train_op = optimization.create_optimizer( + total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu, + None, FLAGS.use_fp16) + + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + train_op=train_op, + scaffold_fn=scaffold_fn) + elif mode == tf.estimator.ModeKeys.EVAL: + ''' + def metric_fn(per_example_loss, label_ids, logits, is_real_example): + predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) + accuracy = tf.metrics.accuracy( + labels=label_ids, predictions=predictions, weights=is_real_example) + 
loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) + return { + "eval_accuracy": accuracy, + "eval_loss": loss, + } + + eval_metrics = (metric_fn, + [per_example_loss, label_ids, logits, is_real_example]) + ''' + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + loss=total_loss, + #eval_metrics=eval_metrics, + eval_metrics=problem.eval_metrics, + scaffold_fn=scaffold_fn) + else: + output_spec = tf.contrib.tpu.TPUEstimatorSpec( + mode=mode, + predictions={"probabilities": probabilities}, + scaffold_fn=scaffold_fn) + return output_spec + + return model_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. +def input_fn_builder(features, seq_length, is_training, drop_remainder): + """Creates an `input_fn` closure to be passed to TPUEstimator.""" + + all_input_ids = [] + all_input_mask = [] + all_segment_ids = [] + all_label_ids = [] + + for feature in features: + all_input_ids.append(feature.input_ids) + all_input_mask.append(feature.input_mask) + all_segment_ids.append(feature.segment_ids) + all_label_ids.append(feature.label_id) + + def input_fn(params): + """The actual input function.""" + batch_size = params["batch_size"] + + num_examples = len(features) + + # This is for demo purposes and does NOT scale to large data sets. We do + # not use Dataset.from_generator() because that uses tf.py_func which is + # not TPU compatible. The right way to load data is with TFRecordReader. + d = tf.data.Dataset.from_tensor_slices({ + "input_ids": + tf.constant( + all_input_ids, shape=[num_examples, seq_length], + dtype=tf.int32), + "input_mask": + tf.constant( + all_input_mask, + shape=[num_examples, seq_length], + dtype=tf.int32), + "segment_ids": + tf.constant( + all_segment_ids, + shape=[num_examples, seq_length], + dtype=tf.int32), + "label_ids": + tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), + }) + + if is_training: + d = d.repeat() + d = d.shuffle(buffer_size=100) + + d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) + return d + + return input_fn + + +# This function is not used by this file but is still used by the Colab and +# people who depend on it. 
+def convert_examples_to_features(examples, label_list, max_seq_length, + tokenizer): + """Convert a set of `InputExample`s to a list of `InputFeatures`.""" + + features = [] + for (ex_index, example) in enumerate(examples): + if ex_index % 10000 == 0: + tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) + + feature = convert_single_example(ex_index, example, label_list, + max_seq_length, tokenizer) + + features.append(feature) + return features + + +def main(_): + tf.logging.set_verbosity(tf.logging.INFO) + + processors = { + "cola": ColaProcessor, + "mnli": MnliProcessor, + "mrpc": MrpcProcessor, + "xnli": XnliProcessor, + "dummy": DummyProcessor + } + + tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + FLAGS.init_checkpoint) + + if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: + raise ValueError( + "At least one of `do_train`, `do_eval` or `do_predict' must be True.") + + bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) + + if FLAGS.max_seq_length > bert_config.max_position_embeddings: + raise ValueError( + "Cannot use sequence length %d because the BERT model " + "was only trained up to sequence length %d" % + (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + + tf.gfile.MakeDirs(FLAGS.output_dir) + + #task_name = FLAGS.task_name.lower() + + task_name = 'dummy' + + if task_name not in processors: + raise ValueError("Task not found: %s" % (task_name)) + + processor = processors[task_name]() + + label_list = processor.get_labels() + + tokenizer = tokenization.FullTokenizer( + vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + + tpu_cluster_resolver = None + if FLAGS.use_tpu and FLAGS.tpu_name: + tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( + FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + + config = tf.ConfigProto() + if FLAGS.use_xla: + config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 + run_config = tf.contrib.tpu.RunConfig( + cluster=tpu_cluster_resolver, + master=FLAGS.master, + model_dir=FLAGS.output_dir, + session_config=config, + save_checkpoints_steps=FLAGS.save_checkpoints_steps, + tpu_config=tf.contrib.tpu.TPUConfig( + iterations_per_loop=FLAGS.iterations_per_loop, + num_shards=FLAGS.num_tpu_cores, + per_host_input_for_training=is_per_host)) + + train_examples = None + num_train_steps = None + num_warmup_steps = None + if FLAGS.do_train: + #train_examples = processor.get_train_examples(FLAGS.data_dir) + train_examples = processor.get_train_examples() + num_train_steps = int( + len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) + num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu) + + # If TPU is not available, this will fall back to normal Estimator on CPU + # or GPU. 
+ from tensor2tensor.utils.trainer_lib import create_hparams, add_problem_hparams + import fathomt2t + from fathomt2t.common_flags import setup_dataset_flag + import fathomt2t.problems.fprecord_text_problem + print('FLAGS', FLAGS) + print('code mapping file', FLAGS.code_mapping_file) + problem_name = 'icd10_diagnosis_hcpcs_coding_problem_with_hints' + hparams_set = 'fh_transformer_tiny_multi_hints_4_layers_flat_lr_min_length' + setup_dataset_flag() + FLAGS.dataset_split = 'train' + hparams = create_hparams(hparams_set=hparams_set, problem_name=problem_name) + #problem = registry.problem(problem_name) + add_problem_hparams(hparams, problem_name) + target_modality = hparams.target_modality + problem = hparams.problem + + #from tensor2tensor.bin.t2t_trainer import create_run_config + #run_config = create_run_config(hparams) + estimator = tf.contrib.tpu.TPUEstimator( + #estimator = tf.estimator.Estimator( + use_tpu=FLAGS.use_tpu, + model_fn=model_fn, + config=run_config, + train_batch_size=FLAGS.train_batch_size, + eval_batch_size=FLAGS.eval_batch_size, + predict_batch_size=FLAGS.predict_batch_size) + + if FLAGS.do_train: + train_file = os.path.join(FLAGS.output_dir, "train.tf_record") + file_based_convert_examples_to_features( + train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) + tf.logging.info("***** Running training *****") + tf.logging.info(" Num examples = %d", len(train_examples)) + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) + ''' + train_input_fn = file_based_input_fn_builder( + input_file=train_file, + seq_length=FLAGS.max_seq_length, + is_training=True, + drop_remainder=True) + ''' + estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + + if FLAGS.do_eval: + #eval_examples = processor.get_dev_examples(FLAGS.data_dir) + eval_examples = processor.get_dev_examples() + num_actual_eval_examples = len(eval_examples) + if FLAGS.use_tpu: + # TPU requires a fixed batch size for all batches, therefore the number + # of examples must be a multiple of the batch size, or else examples + # will get dropped. So we pad with fake examples which are ignored + # later on. These do NOT count towards the metric (all tf.metrics + # support a per-instance weight, and these get a weight of 0.0). + while len(eval_examples) % FLAGS.eval_batch_size != 0: + eval_examples.append(PaddingInputExample()) + + eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") + file_based_convert_examples_to_features( + eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) + + tf.logging.info("***** Running evaluation *****") + tf.logging.info(" Num examples = %d (%d actual, %d padding)", + len(eval_examples), num_actual_eval_examples, + len(eval_examples) - num_actual_eval_examples) + tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) + + # This tells the estimator to run through the entire set. + eval_steps = None + # However, if running eval on the TPU, you will need to specify the + # number of steps. 
+ if FLAGS.use_tpu: + assert len(eval_examples) % FLAGS.eval_batch_size == 0 + eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) + + eval_drop_remainder = True if FLAGS.use_tpu else False + + eval_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.EVAL, hparams) + ''' + eval_input_fn = file_based_input_fn_builder( + input_file=eval_file, + seq_length=FLAGS.max_seq_length, + is_training=False, + drop_remainder=eval_drop_remainder) + ''' + + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + + output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") + with tf.gfile.GFile(output_eval_file, "w") as writer: + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) + writer.write("%s = %s\n" % (key, str(result[key]))) + + +if __name__ == "__main__": + #flags.mark_flag_as_required("data_dir") + #flags.mark_flag_as_required("task_name") + flags.mark_flag_as_required("vocab_file") + flags.mark_flag_as_required("bert_config_file") + flags.mark_flag_as_required("output_dir") + tf.app.run() From 4750f0c146bb8b5f81dcab94db13ae4c835f3653 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Wed, 5 Jun 2019 00:05:05 +0000 Subject: [PATCH 14/77] clean up bert finetune --- .../LanguageModeling/BERT/finetune_BERT.py | 306 +++--------------- 1 file changed, 42 insertions(+), 264 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 0714d6a9b..2ffd48098 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -1,17 +1,3 @@ -# coding=utf-8 -# Copyright 2018 The Google AI Language Team Authors. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
"""BERT finetuning runner.""" from __future__ import absolute_import @@ -127,7 +113,6 @@ flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") - class InputExample(object): """A single training/test example for simple sequence classification.""" @@ -208,176 +193,6 @@ def _read_tsv(cls, input_file, quotechar=None): return lines -class XnliProcessor(DataProcessor): - """Processor for the XNLI data set.""" - - def __init__(self): - self.language = "zh" - - def get_train_examples(self, data_dir): - """See base class.""" - lines = self._read_tsv( - os.path.join(data_dir, "multinli", - "multinli.train.%s.tsv" % self.language)) - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "train-%d" % (i) - text_a = tokenization.convert_to_unicode(line[0]) - text_b = tokenization.convert_to_unicode(line[1]) - label = tokenization.convert_to_unicode(line[2]) - if label == tokenization.convert_to_unicode("contradictory"): - label = tokenization.convert_to_unicode("contradiction") - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_dev_examples(self, data_dir): - """See base class.""" - lines = self._read_tsv(os.path.join(data_dir, "xnli.dev.tsv")) - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "dev-%d" % (i) - language = tokenization.convert_to_unicode(line[0]) - if language != tokenization.convert_to_unicode(self.language): - continue - text_a = tokenization.convert_to_unicode(line[6]) - text_b = tokenization.convert_to_unicode(line[7]) - label = tokenization.convert_to_unicode(line[1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - -class MnliProcessor(DataProcessor): - """Processor for the MultiNLI data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev_matched.tsv")), - "dev_matched") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test_matched.tsv")), "test") - - def get_labels(self): - """See base class.""" - return ["contradiction", "entailment", "neutral"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, tokenization.convert_to_unicode(line[0])) - text_a = tokenization.convert_to_unicode(line[8]) - text_b = tokenization.convert_to_unicode(line[9]) - if set_type == "test": - label = "contradiction" - else: - label = tokenization.convert_to_unicode(line[-1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class MrpcProcessor(DataProcessor): - """Processor for the MRPC data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, 
"dev.tsv")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - if i == 0: - continue - guid = "%s-%s" % (set_type, i) - text_a = tokenization.convert_to_unicode(line[3]) - text_b = tokenization.convert_to_unicode(line[4]) - if set_type == "test": - label = "0" - else: - label = tokenization.convert_to_unicode(line[0]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=text_b, label=label)) - return examples - - -class ColaProcessor(DataProcessor): - """Processor for the CoLA data set (GLUE version).""" - - def get_train_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "train.tsv")), "train") - - def get_dev_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "dev.tsv")), "dev") - - def get_test_examples(self, data_dir): - """See base class.""" - return self._create_examples( - self._read_tsv(os.path.join(data_dir, "test.tsv")), "test") - - def get_labels(self): - """See base class.""" - return ["0", "1"] - - def _create_examples(self, lines, set_type): - """Creates examples for the training and dev sets.""" - examples = [] - for (i, line) in enumerate(lines): - # Only the test set has a header - if set_type == "test" and i == 0: - continue - guid = "%s-%s" % (set_type, i) - if set_type == "test": - text_a = tokenization.convert_to_unicode(line[1]) - label = "0" - else: - text_a = tokenization.convert_to_unicode(line[3]) - label = tokenization.convert_to_unicode(line[1]) - examples.append( - InputExample(guid=guid, text_a=text_a, text_b=None, label=label)) - return examples - - class DummyProcessor(DataProcessor): def get_train_examples(self): @@ -598,8 +413,11 @@ def _truncate_seq_pair(tokens_a, tokens_b, max_length): def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, - labels, num_labels, use_one_hot_embeddings): + labels, num_labels, use_one_hot_embeddings, hparams): """Creates a classification model.""" + target_modality = hparams.problem_hparams.target_modality + input_modality = hparams.problem_hparams.input_modality + model = modeling.BertModel( config=bert_config, is_training=is_training, @@ -609,54 +427,31 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float16 if FLAGS.use_fp16 else tf.float32) - last_layer = model.get_sequence_output() - extended_batch_size = tf.shape(last_layer)[0] - chunk_size = tf.shape(last_layer)[1] - depth = tf.shape(last_layer)[2] - batch_size = extended_batch_size / chunk_size - - body_outputs = tf.reshape(last_layer, [batch_size, extended_batch_batch_size, depth]) - - return target_modality.top(body_outputs) - ''' - # In the demo, we are doing a simple classification task on the entire - # segment. - # - # If you want to use the token-level output, use model.get_sequence_output() - # instead. 
- output_layer = model.get_pooled_output() - - hidden_size = output_layer.shape[-1].value + # [B, 384, D] + body_outputs = model.get_sequence_output() + #extended_batch_size = tf.shape(body_outputs)[0] + #chunk_size = tf.shape(body_outputs)[1] + #depth = tf.shape(body_outputs)[2] + #batch_size = extended_batch_size / chunk_size - output_weights = tf.get_variable( - "output_weights", [num_labels, hidden_size], - initializer=tf.truncated_normal_initializer(stddev=0.02)) + #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) - output_bias = tf.get_variable( - "output_bias", [num_labels], initializer=tf.zeros_initializer()) - - with tf.variable_scope("loss"): - if is_training: - # I.e., 0.1 dropout - output_layer = tf.nn.dropout(output_layer, keep_prob=0.9) - - #logits = tf.matmul(output_layer, output_weights, transpose_b=True) - #logits = tf.nn.bias_add(logits, output_bias) - probabilities = tf.nn.softmax(logits, axis=-1) - log_probs = tf.nn.log_softmax(logits, axis=-1) - - one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) - - per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) - loss = tf.reduce_mean(per_example_loss) + body_outputs = tf.expand_dims(body_outputs, axis=-2) + features = { + 'targets': labels + } + labels = tf.reshape(labels, [tf.shape(labels)[0], 1, 1, 1]) + top_out = target_modality.top(body_outputs, features) + num, den = target_modality.loss(top_out, labels) + print('num, den', num, den) + loss = num / den - return (loss, per_example_loss, logits, probabilities) - ''' + return loss, top_out['logits'] def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, - use_one_hot_embeddings): + use_one_hot_embeddings, hparams): """Returns `model_fn` closure for TPUEstimator.""" def model_fn(features, labels, mode, params): # pylint: disable=unused-argument @@ -678,12 +473,9 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument is_training = (mode == tf.estimator.ModeKeys.TRAIN) - #(total_loss, per_example_loss, logits, probabilities) = create_model( - #bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, - #num_labels, use_one_hot_embeddings) (total_loss, logits) = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, - num_labels, use_one_hot_embeddings) + num_labels, use_one_hot_embeddings, hparams) tvars = tf.trainable_variables() initialized_variable_names = {} @@ -722,24 +514,9 @@ def tpu_scaffold(): train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: - ''' - def metric_fn(per_example_loss, label_ids, logits, is_real_example): - predictions = tf.argmax(logits, axis=-1, output_type=tf.int32) - accuracy = tf.metrics.accuracy( - labels=label_ids, predictions=predictions, weights=is_real_example) - loss = tf.metrics.mean(values=per_example_loss, weights=is_real_example) - return { - "eval_accuracy": accuracy, - "eval_loss": loss, - } - - eval_metrics = (metric_fn, - [per_example_loss, label_ids, logits, is_real_example]) - ''' output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, - #eval_metrics=eval_metrics, eval_metrics=problem.eval_metrics, scaffold_fn=scaffold_fn) else: @@ -828,10 +605,6 @@ def main(_): tf.logging.set_verbosity(tf.logging.INFO) processors = { - "cola": ColaProcessor, - "mnli": MnliProcessor, - "mrpc": MrpcProcessor, - "xnli": XnliProcessor, "dummy": DummyProcessor } @@ -896,16 +669,6 @@ def 
main(_): len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) - model_fn = model_fn_builder( - bert_config=bert_config, - num_labels=len(label_list), - init_checkpoint=FLAGS.init_checkpoint, - learning_rate=FLAGS.learning_rate, - num_train_steps=num_train_steps, - num_warmup_steps=num_warmup_steps, - use_tpu=FLAGS.use_tpu, - use_one_hot_embeddings=FLAGS.use_tpu) - # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. from tensor2tensor.utils.trainer_lib import create_hparams, add_problem_hparams @@ -915,7 +678,7 @@ def main(_): print('FLAGS', FLAGS) print('code mapping file', FLAGS.code_mapping_file) problem_name = 'icd10_diagnosis_hcpcs_coding_problem_with_hints' - hparams_set = 'fh_transformer_tiny_multi_hints_4_layers_flat_lr_min_length' + hparams_set = 'finetune_bert' setup_dataset_flag() FLAGS.dataset_split = 'train' hparams = create_hparams(hparams_set=hparams_set, problem_name=problem_name) @@ -924,6 +687,21 @@ def main(_): target_modality = hparams.target_modality problem = hparams.problem + if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') + + model_fn = model_fn_builder( + bert_config=bert_config, + num_labels=len(label_list), + init_checkpoint=FLAGS.init_checkpoint, + learning_rate=FLAGS.learning_rate, + num_train_steps=num_train_steps, + num_warmup_steps=num_warmup_steps, + use_tpu=FLAGS.use_tpu, + use_one_hot_embeddings=FLAGS.use_tpu, + hparams=hparams) + #from tensor2tensor.bin.t2t_trainer import create_run_config #run_config = create_run_config(hparams) estimator = tf.contrib.tpu.TPUEstimator( @@ -943,14 +721,14 @@ def main(_): tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) - train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) - ''' + #train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) + #''' train_input_fn = file_based_input_fn_builder( input_file=train_file, seq_length=FLAGS.max_seq_length, is_training=True, drop_remainder=True) - ''' + #''' estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) if FLAGS.do_eval: From f770029e872fd54fec9cce3dfde3778e77f189be Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 01:58:58 +0000 Subject: [PATCH 15/77] clean up and hvd fp16 --- .../LanguageModeling/BERT/finetune_BERT.py | 414 ++++-------------- 1 file changed, 87 insertions(+), 327 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 2ffd48098..f6a82a735 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -9,8 +9,9 @@ import os import modeling import optimization -import tokenization import tensorflow as tf +import horovod.tensorflow as hvd +import time flags = tf.flags @@ -36,6 +37,8 @@ "output_dir", None, "The output directory where the model checkpoints will be written.") +flags.DEFINE_string("tmp_dir", None, '') + ## Other parameters flags.DEFINE_string( @@ -68,6 +71,7 @@ flags.DEFINE_integer("predict_batch_size", 8, "Total batch size for predict.") flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") +flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs") 
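# NOTE: the _LogEvalRunHook/_LogTrainRunHook classes added further down in this
# patch accumulate per-step wall time (total_time, count) but never report it.
# The snippet below is only an illustrative sketch of how such a hook could
# surface throughput at the end of a run; the end() method and the log message
# are editorial assumptions, not code from this patch series.
import time
import tensorflow as tf

class _ThroughputReportHook(tf.train.SessionRunHook):
  """Hypothetical hook: logs average samples/sec when the session ends."""

  def __init__(self, global_batch_size):
    self.global_batch_size = global_batch_size
    self.total_time = 0.0
    self.count = 0

  def before_run(self, run_context):
    self.t0 = time.time()

  def after_run(self, run_context, run_values):
    self.total_time += time.time() - self.t0
    self.count += 1

  def end(self, session):
    # Average over all recorded steps; skip if nothing was timed.
    if self.total_time > 0 and self.count > 0:
      tf.logging.info("throughput: %.1f samples/sec",
                      self.count * self.global_batch_size / self.total_time)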
flags.DEFINE_float("num_train_epochs", 3.0, "Total number of training epochs to perform.") @@ -113,6 +117,39 @@ flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") +# report samples/sec, total loss and learning rate during training +class _LogEvalRunHook(tf.train.SessionRunHook): + def __init__(self, global_batch_size, hvd_rank=-1): + self.global_batch_size = global_batch_size + self.hvd_rank = hvd_rank + self.total_time = 0.0 + self.count = 0 + + def before_run(self, run_context): + self.t0 = time.time() + + def after_run(self, run_context, run_values): + elapsed_secs = time.time() - self.t0 + self.total_time += elapsed_secs + self.count += 1 + +# report samples/sec, total loss and learning rate during training +class _LogTrainRunHook(tf.train.SessionRunHook): + def __init__(self, global_batch_size, hvd_rank=-1): + self.global_batch_size = global_batch_size + self.hvd_rank = hvd_rank + self.total_time = 0.0 + self.count = 0 + + def before_run(self, run_context): + self.t0 = time.time() + return tf.train.SessionRunArgs( + fetches=['step_update:0']) + def after_run(self, run_context, run_values): + elapsed_secs = time.time() - self.t0 + self.total_time += elapsed_secs + self.count += 1 + class InputExample(object): """A single training/test example for simple sequence classification.""" @@ -215,186 +252,6 @@ def _create_examples(self, num_lines, set_type): return examples -def convert_single_example(ex_index, example, label_list, max_seq_length, - tokenizer): - """Converts a single `InputExample` into a single `InputFeatures`.""" - - if isinstance(example, PaddingInputExample): - return InputFeatures( - input_ids=[0] * max_seq_length, - input_mask=[0] * max_seq_length, - segment_ids=[0] * max_seq_length, - label_id=0, - is_real_example=False) - - label_map = {} - for (i, label) in enumerate(label_list): - label_map[label] = i - - tokens_a = tokenizer.tokenize(example.text_a) - tokens_b = None - if example.text_b: - tokens_b = tokenizer.tokenize(example.text_b) - - if tokens_b: - # Modifies `tokens_a` and `tokens_b` in place so that the total - # length is less than the specified length. - # Account for [CLS], [SEP], [SEP] with "- 3" - _truncate_seq_pair(tokens_a, tokens_b, max_seq_length - 3) - else: - # Account for [CLS] and [SEP] with "- 2" - if len(tokens_a) > max_seq_length - 2: - tokens_a = tokens_a[0:(max_seq_length - 2)] - - # The convention in BERT is: - # (a) For sequence pairs: - # tokens: [CLS] is this jack ##son ##ville ? [SEP] no it is not . [SEP] - # type_ids: 0 0 0 0 0 0 0 0 1 1 1 1 1 1 - # (b) For single sequences: - # tokens: [CLS] the dog is hairy . [SEP] - # type_ids: 0 0 0 0 0 0 0 - # - # Where "type_ids" are used to indicate whether this is the first - # sequence or the second sequence. The embedding vectors for `type=0` and - # `type=1` were learned during pre-training and are added to the wordpiece - # embedding vector (and position vector). This is not *strictly* necessary - # since the [SEP] token unambiguously separates the sequences, but it makes - # it easier for the model to learn the concept of sequences. - # - # For classification tasks, the first vector (corresponding to [CLS]) is - # used as the "sentence vector". Note that this only makes sense because - # the entire model is fine-tuned. 
- tokens = [] - segment_ids = [] - tokens.append("[CLS]") - segment_ids.append(0) - for token in tokens_a: - tokens.append(token) - segment_ids.append(0) - tokens.append("[SEP]") - segment_ids.append(0) - - if tokens_b: - for token in tokens_b: - tokens.append(token) - segment_ids.append(1) - tokens.append("[SEP]") - segment_ids.append(1) - - input_ids = tokenizer.convert_tokens_to_ids(tokens) - - # The mask has 1 for real tokens and 0 for padding tokens. Only real - # tokens are attended to. - input_mask = [1] * len(input_ids) - - # Zero-pad up to the sequence length. - while len(input_ids) < max_seq_length: - input_ids.append(0) - input_mask.append(0) - segment_ids.append(0) - - assert len(input_ids) == max_seq_length - assert len(input_mask) == max_seq_length - assert len(segment_ids) == max_seq_length - - label_id = label_map[example.label] - if ex_index < 5: - tf.logging.info("*** Example ***") - tf.logging.info("guid: %s" % (example.guid)) - tf.logging.info("tokens: %s" % " ".join( - [tokenization.printable_text(x) for x in tokens])) - tf.logging.info("input_ids: %s" % " ".join([str(x) for x in input_ids])) - tf.logging.info("input_mask: %s" % " ".join([str(x) for x in input_mask])) - tf.logging.info("segment_ids: %s" % " ".join([str(x) for x in segment_ids])) - tf.logging.info("label: %s (id = %d)" % (example.label, label_id)) - - feature = InputFeatures( - input_ids=input_ids, - input_mask=input_mask, - segment_ids=segment_ids, - label_id=label_id, - is_real_example=True) - return feature - - -def file_based_convert_examples_to_features( - examples, label_list, max_seq_length, tokenizer, output_file): - """Convert a set of `InputExample`s to a TFRecord file.""" - - writer = tf.python_io.TFRecordWriter(output_file) - - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - feature = convert_single_example(ex_index, example, label_list, - max_seq_length, tokenizer) - - def create_int_feature(values): - f = tf.train.Feature(int64_list=tf.train.Int64List(value=list(values))) - return f - - features = collections.OrderedDict() - features["input_ids"] = create_int_feature(feature.input_ids) - features["input_mask"] = create_int_feature(feature.input_mask) - features["segment_ids"] = create_int_feature(feature.segment_ids) - features["label_ids"] = create_int_feature([feature.label_id]) - features["is_real_example"] = create_int_feature( - [int(feature.is_real_example)]) - - tf_example = tf.train.Example(features=tf.train.Features(feature=features)) - writer.write(tf_example.SerializeToString()) - writer.close() - - -def file_based_input_fn_builder(input_file, seq_length, is_training, - drop_remainder): - """Creates an `input_fn` closure to be passed to TPUEstimator.""" - - name_to_features = { - "input_ids": tf.FixedLenFeature([seq_length], tf.int64), - "input_mask": tf.FixedLenFeature([seq_length], tf.int64), - "segment_ids": tf.FixedLenFeature([seq_length], tf.int64), - "label_ids": tf.FixedLenFeature([], tf.int64), - "is_real_example": tf.FixedLenFeature([], tf.int64), - } - - def _decode_record(record, name_to_features): - """Decodes a record to a TensorFlow example.""" - example = tf.parse_single_example(record, name_to_features) - - # tf.Example only supports tf.int64, but the TPU only supports tf.int32. - # So cast all int64 to int32. 
- for name in list(example.keys()): - t = example[name] - if t.dtype == tf.int64: - t = tf.to_int32(t) - example[name] = t - - return example - - def input_fn(params): - """The actual input function.""" - batch_size = params["batch_size"] - - # For training, we want a lot of parallel reading and shuffling. - # For eval, we want no shuffling and parallel reading doesn't matter. - d = tf.data.TFRecordDataset(input_file) - if is_training: - d = d.repeat() - d = d.shuffle(buffer_size=100) - - d = d.apply( - tf.contrib.data.map_and_batch( - lambda record: _decode_record(record, name_to_features), - batch_size=batch_size, - drop_remainder=drop_remainder)) - - return d - - return input_fn - - def _truncate_seq_pair(tokens_a, tokens_b, max_length): """Truncates a sequence pair in place to the maximum length.""" @@ -441,7 +298,9 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, 'targets': labels } labels = tf.reshape(labels, [tf.shape(labels)[0], 1, 1, 1]) + top_out = target_modality.top(body_outputs, features) + num, den = target_modality.loss(top_out, labels) print('num, den', num, den) loss = num / den @@ -451,7 +310,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, - use_one_hot_embeddings, hparams): + use_one_hot_embeddings, hparams, hvd=None, use_fp16=False): """Returns `model_fn` closure for TPUEstimator.""" def model_fn(features, labels, mode, params): # pylint: disable=unused-argument @@ -464,12 +323,7 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument input_ids = features["input_ids"] input_mask = features["input_mask"] segment_ids = features["segment_ids"] - label_ids = features["label_ids"] - is_real_example = None - if "is_real_example" in features: - is_real_example = tf.cast(features["is_real_example"], dtype=tf.float32) - else: - is_real_example = tf.ones(tf.shape(label_ids), dtype=tf.float32) + label_ids = features["targets"] is_training = (mode == tf.estimator.ModeKeys.TRAIN) @@ -480,7 +334,7 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument tvars = tf.trainable_variables() initialized_variable_names = {} scaffold_fn = None - if init_checkpoint: + if init_checkpoint and (hvd is None or hvd.rank() == 0): (assignment_map, initialized_variable_names ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) if use_tpu: @@ -498,15 +352,14 @@ def tpu_scaffold(): init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" name = %s, shape = %s%s", var.name, var.shape, - init_string) + tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu, - None, FLAGS.use_fp16) + hvd, amp=use_fp16) output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, @@ -529,87 +382,18 @@ def tpu_scaffold(): return model_fn -# This function is not used by this file but is still used by the Colab and -# people who depend on it. 
-def input_fn_builder(features, seq_length, is_training, drop_remainder): - """Creates an `input_fn` closure to be passed to TPUEstimator.""" - - all_input_ids = [] - all_input_mask = [] - all_segment_ids = [] - all_label_ids = [] - - for feature in features: - all_input_ids.append(feature.input_ids) - all_input_mask.append(feature.input_mask) - all_segment_ids.append(feature.segment_ids) - all_label_ids.append(feature.label_id) - - def input_fn(params): - """The actual input function.""" - batch_size = params["batch_size"] - - num_examples = len(features) - - # This is for demo purposes and does NOT scale to large data sets. We do - # not use Dataset.from_generator() because that uses tf.py_func which is - # not TPU compatible. The right way to load data is with TFRecordReader. - d = tf.data.Dataset.from_tensor_slices({ - "input_ids": - tf.constant( - all_input_ids, shape=[num_examples, seq_length], - dtype=tf.int32), - "input_mask": - tf.constant( - all_input_mask, - shape=[num_examples, seq_length], - dtype=tf.int32), - "segment_ids": - tf.constant( - all_segment_ids, - shape=[num_examples, seq_length], - dtype=tf.int32), - "label_ids": - tf.constant(all_label_ids, shape=[num_examples], dtype=tf.int32), - }) - - if is_training: - d = d.repeat() - d = d.shuffle(buffer_size=100) - - d = d.batch(batch_size=batch_size, drop_remainder=drop_remainder) - return d - - return input_fn - - -# This function is not used by this file but is still used by the Colab and -# people who depend on it. -def convert_examples_to_features(examples, label_list, max_seq_length, - tokenizer): - """Convert a set of `InputExample`s to a list of `InputFeatures`.""" - - features = [] - for (ex_index, example) in enumerate(examples): - if ex_index % 10000 == 0: - tf.logging.info("Writing example %d of %d" % (ex_index, len(examples))) - - feature = convert_single_example(ex_index, example, label_list, - max_seq_length, tokenizer) - - features.append(feature) - return features - - def main(_): tf.logging.set_verbosity(tf.logging.INFO) + if FLAGS.horovod: + hvd.init() + processors = { "dummy": DummyProcessor } - tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, - FLAGS.init_checkpoint) + #tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, + #FLAGS.init_checkpoint) if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( @@ -636,14 +420,27 @@ def main(_): label_list = processor.get_labels() - tokenizer = tokenization.FullTokenizer( - vocab_file=FLAGS.vocab_file, do_lower_case=FLAGS.do_lower_case) + training_hooks = [] + global_batch_size = FLAGS.train_batch_size + hvd_rank = 0 tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + if FLAGS.horovod: + tf.logging.info("Multi-GPU training with TF Horovod") + tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank()) + global_batch_size = FLAGS.train_batch_size * hvd.size() + learning_rate = learning_rate * hvd.size() + master_process = (hvd.rank() == 0) + hvd_rank = hvd.rank() + config.gpu_options.allow_growth = True + config.gpu_options.visible_device_list = str(hvd.local_rank()) + if hvd.size() > 1: + training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) + config = tf.ConfigProto() if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 @@ -654,6 +451,7 @@ def main(_): model_dir=FLAGS.output_dir, 
session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps, + log_step_count_steps=1, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, @@ -663,11 +461,8 @@ def main(_): num_train_steps = None num_warmup_steps = None if FLAGS.do_train: - #train_examples = processor.get_train_examples(FLAGS.data_dir) - train_examples = processor.get_train_examples() - num_train_steps = int( - len(train_examples) / FLAGS.train_batch_size * FLAGS.num_train_epochs) - num_warmup_steps = int(num_train_steps * FLAGS.warmup_proportion) + num_train_steps = 1000 + num_warmup_steps = 1 # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. @@ -677,19 +472,24 @@ def main(_): import fathomt2t.problems.fprecord_text_problem print('FLAGS', FLAGS) print('code mapping file', FLAGS.code_mapping_file) - problem_name = 'icd10_diagnosis_hcpcs_coding_problem_with_hints' + #problem_name = 'icd10_diagnosis_hcpcs_coding_problem_with_hints' + problem_name = 'bert_problem' hparams_set = 'finetune_bert' setup_dataset_flag() FLAGS.dataset_split = 'train' + if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') + hparams = create_hparams(hparams_set=hparams_set, problem_name=problem_name) #problem = registry.problem(problem_name) add_problem_hparams(hparams, problem_name) target_modality = hparams.target_modality problem = hparams.problem - if FLAGS.use_fp16: - os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - print('Turning on AMP') + hparams.data_dir = '/usr/src/bert/scratch_data_dir' + ## INGEST + problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir) model_fn = model_fn_builder( bert_config=bert_config, @@ -700,7 +500,9 @@ def main(_): num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, - hparams=hparams) + hparams=hparams, + hvd=None if not FLAGS.horovod else hvd, + use_fp16=FLAGS.use_fp16) #from tensor2tensor.bin.t2t_trainer import create_run_config #run_config = create_run_config(hparams) @@ -714,65 +516,23 @@ def main(_): predict_batch_size=FLAGS.predict_batch_size) if FLAGS.do_train: - train_file = os.path.join(FLAGS.output_dir, "train.tf_record") - file_based_convert_examples_to_features( - train_examples, label_list, FLAGS.max_seq_length, tokenizer, train_file) tf.logging.info("***** Running training *****") - tf.logging.info(" Num examples = %d", len(train_examples)) tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) - #train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) - #''' - train_input_fn = file_based_input_fn_builder( - input_file=train_file, - seq_length=FLAGS.max_seq_length, - is_training=True, - drop_remainder=True) - #''' - estimator.train(input_fn=train_input_fn, max_steps=num_train_steps) + train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) + training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) + estimator.train( + input_fn=train_input_fn, + hooks=training_hooks, + max_steps=num_train_steps) if FLAGS.do_eval: - #eval_examples = processor.get_dev_examples(FLAGS.data_dir) - eval_examples = processor.get_dev_examples() - num_actual_eval_examples = len(eval_examples) - if FLAGS.use_tpu: - # TPU requires a fixed batch size for all batches, therefore the number - # of examples must be a multiple of the batch size, or else examples - # will get 
dropped. So we pad with fake examples which are ignored - # later on. These do NOT count towards the metric (all tf.metrics - # support a per-instance weight, and these get a weight of 0.0). - while len(eval_examples) % FLAGS.eval_batch_size != 0: - eval_examples.append(PaddingInputExample()) - - eval_file = os.path.join(FLAGS.output_dir, "eval.tf_record") - file_based_convert_examples_to_features( - eval_examples, label_list, FLAGS.max_seq_length, tokenizer, eval_file) - tf.logging.info("***** Running evaluation *****") - tf.logging.info(" Num examples = %d (%d actual, %d padding)", - len(eval_examples), num_actual_eval_examples, - len(eval_examples) - num_actual_eval_examples) tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) - # This tells the estimator to run through the entire set. - eval_steps = None - # However, if running eval on the TPU, you will need to specify the - # number of steps. - if FLAGS.use_tpu: - assert len(eval_examples) % FLAGS.eval_batch_size == 0 - eval_steps = int(len(eval_examples) // FLAGS.eval_batch_size) - - eval_drop_remainder = True if FLAGS.use_tpu else False - eval_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.EVAL, hparams) - ''' - eval_input_fn = file_based_input_fn_builder( - input_file=eval_file, - seq_length=FLAGS.max_seq_length, - is_training=False, - drop_remainder=eval_drop_remainder) - ''' + eval_steps = 1000 result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") From 7f14f2cd4bb2a9af57650d1c9b99b4b9e669b7f3 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 18:34:13 +0000 Subject: [PATCH 16/77] hparams data dir --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index f6a82a735..ce11522bf 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -487,7 +487,7 @@ def main(_): target_modality = hparams.target_modality problem = hparams.problem - hparams.data_dir = '/usr/src/bert/scratch_data_dir' + hparams.data_dir = 'gs://fathom-dev-210618-workspace-h5bnpfec/FINETUNE_BERT/2019-06-06T07-57-40.391508+00-00/usr.src.bert.finetune_BERT.py/data_dir' ## INGEST problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir) From 09f7c025091bf0452150d1f98620cf453abf84c7 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 18:48:54 +0000 Subject: [PATCH 17/77] clean up and move amp --- .../LanguageModeling/BERT/finetune_BERT.py | 125 +----------------- 1 file changed, 4 insertions(+), 121 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index ce11522bf..d1f721086 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -117,6 +117,10 @@ flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") +if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') + # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): def __init__(self, global_batch_size, hvd_rank=-1): @@ -150,124 +154,6 @@ def after_run(self, run_context, run_values): self.total_time += elapsed_secs self.count += 1 -class InputExample(object): - """A single training/test 
example for simple sequence classification.""" - - def __init__(self, guid, text_a, text_b=None, label=None): - """Constructs a InputExample. - - Args: - guid: Unique id for the example. - text_a: string. The untokenized text of the first sequence. For single - sequence tasks, only this sequence must be specified. - text_b: (Optional) string. The untokenized text of the second sequence. - Only must be specified for sequence pair tasks. - label: (Optional) string. The label of the example. This should be - specified for train and dev examples, but not for test examples. - """ - self.guid = guid - self.text_a = text_a - self.text_b = text_b - self.label = label - - -class PaddingInputExample(object): - """Fake example so the num input examples is a multiple of the batch size. - - When running eval/predict on the TPU, we need to pad the number of examples - to be a multiple of the batch size, because the TPU requires a fixed batch - size. The alternative is to drop the last batch, which is bad because it means - the entire output data won't be generated. - - We use this class instead of `None` because treating `None` as padding - battches could cause silent errors. - """ - - -class InputFeatures(object): - """A single set of features of data.""" - - def __init__(self, - input_ids, - input_mask, - segment_ids, - label_id, - is_real_example=True): - self.input_ids = input_ids - self.input_mask = input_mask - self.segment_ids = segment_ids - self.label_id = label_id - self.is_real_example = is_real_example - - -class DataProcessor(object): - """Base class for data converters for sequence classification data sets.""" - - def get_train_examples(self, data_dir): - """Gets a collection of `InputExample`s for the train set.""" - raise NotImplementedError() - - def get_dev_examples(self, data_dir): - """Gets a collection of `InputExample`s for the dev set.""" - raise NotImplementedError() - - def get_test_examples(self, data_dir): - """Gets a collection of `InputExample`s for prediction.""" - raise NotImplementedError() - - def get_labels(self): - """Gets the list of labels for this data set.""" - raise NotImplementedError() - - @classmethod - def _read_tsv(cls, input_file, quotechar=None): - """Reads a tab separated value file.""" - with tf.gfile.Open(input_file, "r") as f: - reader = csv.reader(f, delimiter="\t", quotechar=quotechar) - lines = [] - for line in reader: - lines.append(line) - return lines - - -class DummyProcessor(DataProcessor): - - def get_train_examples(self): - return self._create_examples(10, 'train') - - def get_dev_examples(self): - return self._create_examples(2, 'dev') - - def get_test_examples(self): - return self._create_examples(2, 'test') - - def get_labels(self): - return ["0", "1"] - - def _create_examples(self, num_lines, set_type): - examples = [] - for i in range(num_lines): - guid = "%s-%d" % (set_type, i) - examples.append(InputExample(guid=guid, text_a='dummy dummy, dummy', label='0')) - return examples - - -def _truncate_seq_pair(tokens_a, tokens_b, max_length): - """Truncates a sequence pair in place to the maximum length.""" - - # This is a simple heuristic which will always truncate the longer sequence - # one token at a time. This makes more sense than truncating an equal percent - # of tokens from each, since if one sequence is very short then each token - # that's truncated likely contains more information than a longer sequence. 
- while True: - total_length = len(tokens_a) + len(tokens_b) - if total_length <= max_length: - break - if len(tokens_a) > len(tokens_b): - tokens_a.pop() - else: - tokens_b.pop() - def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings, hparams): @@ -477,9 +363,6 @@ def main(_): hparams_set = 'finetune_bert' setup_dataset_flag() FLAGS.dataset_split = 'train' - if FLAGS.use_fp16: - os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - print('Turning on AMP') hparams = create_hparams(hparams_set=hparams_set, problem_name=problem_name) #problem = registry.problem(problem_name) From ac331abadea67eb491a2b1a562fe610cb5cf24ff Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 18:55:06 +0000 Subject: [PATCH 18/77] clean --- .../LanguageModeling/BERT/finetune_BERT.py | 27 ++++--------------- 1 file changed, 5 insertions(+), 22 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index d1f721086..6856b939f 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -117,9 +117,6 @@ flags.DEFINE_bool("use_xla", False, "Whether to enable XLA JIT compilation.") -if FLAGS.use_fp16: - os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" - print('Turning on AMP') # report samples/sec, total loss and learning rate during training class _LogEvalRunHook(tf.train.SessionRunHook): @@ -271,16 +268,13 @@ def tpu_scaffold(): def main(_): tf.logging.set_verbosity(tf.logging.INFO) + if FLAGS.use_fp16: + os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" + print('Turning on AMP') + if FLAGS.horovod: hvd.init() - processors = { - "dummy": DummyProcessor - } - - #tokenization.validate_case_matches_checkpoint(FLAGS.do_lower_case, - #FLAGS.init_checkpoint) - if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: raise ValueError( "At least one of `do_train`, `do_eval` or `do_predict' must be True.") @@ -295,17 +289,6 @@ def main(_): tf.gfile.MakeDirs(FLAGS.output_dir) - #task_name = FLAGS.task_name.lower() - - task_name = 'dummy' - - if task_name not in processors: - raise ValueError("Task not found: %s" % (task_name)) - - processor = processors[task_name]() - - label_list = processor.get_labels() - training_hooks = [] global_batch_size = FLAGS.train_batch_size hvd_rank = 0 @@ -376,7 +359,7 @@ def main(_): model_fn = model_fn_builder( bert_config=bert_config, - num_labels=len(label_list), + num_labels=10, init_checkpoint=FLAGS.init_checkpoint, learning_rate=FLAGS.learning_rate, num_train_steps=num_train_steps, From a29bf2b6f71ee509a6b501ae3ae34b444b0e90ce Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 20:28:47 +0000 Subject: [PATCH 19/77] use classifier last layer --- .../LanguageModeling/BERT/finetune_BERT.py | 27 +++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 6856b939f..d794f9629 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -175,7 +175,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, #batch_size = extended_batch_size / chunk_size #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) - + ''' body_outputs = tf.expand_dims(body_outputs, axis=-2) features 
= { 'targets': labels @@ -185,10 +185,33 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, top_out = target_modality.top(body_outputs, features) num, den = target_modality.loss(top_out, labels) - print('num, den', num, den) loss = num / den return loss, top_out['logits'] + ''' + hidden_size = body_outputs.shape[-1].value + + output_weights = tf.get_variable( + "output_weights", [num_labels, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "output_bias", [num_labels], initializer=tf.zeros_initializer()) + + with tf.variable_scope("loss"): + if is_training: + # I.e., 0.1 dropout + body_outputs = tf.nn.dropout(body_outputs, keep_prob=0.9) + + logits = tf.matmul(body_outputs, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + log_probs = tf.nn.log_softmax(logits, axis=-1) + + one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) + + per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + loss = tf.reduce_mean(per_example_loss) + return loss, logits def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, From adcda89694eaa9d7fed9ed606e15f0321ba0fea4 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 20:35:27 +0000 Subject: [PATCH 20/77] fix --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index d794f9629..f0131c124 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -189,6 +189,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, return loss, top_out['logits'] ''' + body_outputs = model.get_pooled_output() hidden_size = body_outputs.shape[-1].value output_weights = tf.get_variable( From a7e761e069ed6a48651d174703657825694a04ef Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 20:51:28 +0000 Subject: [PATCH 21/77] our top --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index f0131c124..14e819f29 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -175,7 +175,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, #batch_size = extended_batch_size / chunk_size #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) - ''' + #''' body_outputs = tf.expand_dims(body_outputs, axis=-2) features = { 'targets': labels @@ -213,6 +213,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) loss = tf.reduce_mean(per_example_loss) return loss, logits + ''' def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, From 07c500cec7d37add95d4b350be396991c4864b9a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 21:26:02 +0000 Subject: [PATCH 22/77] lr --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 14e819f29..fca7122c8 100644 --- 
a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -323,6 +323,7 @@ def main(_): tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + learning_rate = FLAGS.learning_rate if FLAGS.horovod: tf.logging.info("Multi-GPU training with TF Horovod") tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank()) @@ -386,7 +387,7 @@ def main(_): bert_config=bert_config, num_labels=10, init_checkpoint=FLAGS.init_checkpoint, - learning_rate=FLAGS.learning_rate, + learning_rate=learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, From 0215da099a4f62c8d47bed750ee24952ef6e434e Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 21:29:07 +0000 Subject: [PATCH 23/77] init config earlier --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index fca7122c8..8957bd96f 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -323,6 +323,7 @@ def main(_): tpu_cluster_resolver = tf.contrib.cluster_resolver.TPUClusterResolver( FLAGS.tpu_name, zone=FLAGS.tpu_zone, project=FLAGS.gcp_project) + config = tf.ConfigProto() learning_rate = FLAGS.learning_rate if FLAGS.horovod: tf.logging.info("Multi-GPU training with TF Horovod") @@ -336,7 +337,6 @@ def main(_): if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) - config = tf.ConfigProto() if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 From 65443d0981a7849c8b79c9d2ec26121d9391cd57 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 22:37:41 +0000 Subject: [PATCH 24/77] skip generate tfproto --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 8957bd96f..1f8e5b15b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -381,7 +381,7 @@ def main(_): hparams.data_dir = 'gs://fathom-dev-210618-workspace-h5bnpfec/FINETUNE_BERT/2019-06-06T07-57-40.391508+00-00/usr.src.bert.finetune_BERT.py/data_dir' ## INGEST - problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir) + #problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir) model_fn = model_fn_builder( bert_config=bert_config, From 246b465d3415fe926923c64deb79a976dcb5caf7 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 6 Jun 2019 23:57:10 +0000 Subject: [PATCH 25/77] squad --- .../LanguageModeling/BERT/finetune_BERT.py | 48 ++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 1f8e5b15b..351aeaae6 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -175,7 +175,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, #batch_size = extended_batch_size / chunk_size #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) - #''' + ''' 
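# NOTE: the block below (toggled on and off across these patches via the '''
# markers) feeds BERT's sequence output through the tensor2tensor "top" and
# "loss" hooks of the target modality. Assuming the standard tensor2tensor
# Modality interface, body outputs are expected with shape
# [batch, length, 1, hidden] (hence the expand_dims adding a width-1 axis) and
# targets with shape [batch, 1, 1, 1]; top() projects to logits and loss()
# returns a (numerator, denominator) pair whose ratio is the mean loss, which
# is why the code computes loss = num / den.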
body_outputs = tf.expand_dims(body_outputs, axis=-2) features = { 'targets': labels @@ -189,6 +189,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, return loss, top_out['logits'] ''' + # classifier body_outputs = model.get_pooled_output() hidden_size = body_outputs.shape[-1].value @@ -214,7 +215,52 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, loss = tf.reduce_mean(per_example_loss) return loss, logits ''' + #''' + # squad + final_hidden = model.get_sequence_output() + + final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) + batch_size = final_hidden_shape[0] + seq_length = final_hidden_shape[1] + hidden_size = final_hidden_shape[2] + + output_weights = tf.get_variable( + "cls/squad/output_weights", [2, hidden_size], + initializer=tf.truncated_normal_initializer(stddev=0.02)) + + output_bias = tf.get_variable( + "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) + + final_hidden_matrix = tf.reshape(final_hidden, + [batch_size * seq_length, hidden_size]) + logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) + logits = tf.nn.bias_add(logits, output_bias) + logits = tf.reshape(logits, [batch_size, seq_length, 2]) + logits = tf.transpose(logits, [2, 0, 1]) + + unstacked_logits = tf.unstack(logits, axis=0) + + (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) + + def compute_loss(logits, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + loss = -tf.reduce_mean( + tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) + return loss + + start_positions = [label_ids] + end_positions = [label_ids] + + start_loss = compute_loss(start_logits, start_positions) + end_loss = compute_loss(end_lotis, end_positions) + + total_loss = (start_loss + end_loss) / 2.0 + + return total_loss, start_logits + #''' def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, From 62fa7e846ed3d03c114e1ae15057843bee41267c Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 02:07:29 +0000 Subject: [PATCH 26/77] seq length and comment --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 351aeaae6..c1940b853 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -188,7 +188,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, loss = num / den return loss, top_out['logits'] - ''' + # classifier body_outputs = model.get_pooled_output() hidden_size = body_outputs.shape[-1].value @@ -243,6 +243,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) + seq_length = modeling.get_shape_list(input_ids)[1] def compute_loss(logits, positions): one_hot_positions = tf.one_hot( positions, depth=seq_length, dtype=tf.float32) From e774acc02cb030dd1ea8d27d05d267d8d9a768ab Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 02:19:36 +0000 Subject: [PATCH 27/77] typo --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py 
b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index c1940b853..eca542000 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -256,7 +256,7 @@ def compute_loss(logits, positions): end_positions = [label_ids] start_loss = compute_loss(start_logits, start_positions) - end_loss = compute_loss(end_lotis, end_positions) + end_loss = compute_loss(end_logits, end_positions) total_loss = (start_loss + end_loss) / 2.0 From a905407de4aa9d91b7b2d6c9313dd83d57111ffa Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 02:22:26 +0000 Subject: [PATCH 28/77] typo --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index eca542000..531c790d9 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -252,8 +252,8 @@ def compute_loss(logits, positions): tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss - start_positions = [label_ids] - end_positions = [label_ids] + start_positions = [labels] + end_positions = [labels] start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) From 1cc0d47bcde9952578eb39430d8c67b7d1930b11 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 02:35:04 +0000 Subject: [PATCH 29/77] compute type 32 --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 531c790d9..8218ad031 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -165,7 +165,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, input_mask=input_mask, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, - compute_type=tf.float16 if FLAGS.use_fp16 else tf.float32) + compute_type=tf.float32) # [B, 384, D] body_outputs = model.get_sequence_output() From 77f789a2bca2968ee7bb4366f2924cd16201b251 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 03:00:39 +0000 Subject: [PATCH 30/77] move loss outside --- .../LanguageModeling/BERT/finetune_BERT.py | 45 ++++++++++--------- 1 file changed, 24 insertions(+), 21 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 8218ad031..4e32c632b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -166,6 +166,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float32) + ''' # [B, 384, D] body_outputs = model.get_sequence_output() @@ -175,7 +176,6 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, #batch_size = extended_batch_size / chunk_size #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) - ''' body_outputs = tf.expand_dims(body_outputs, axis=-2) features = { 'targets': labels @@ -242,25 +242,8 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, unstacked_logits = tf.unstack(logits, axis=0) (start_logits, end_logits) = 
(unstacked_logits[0], unstacked_logits[1]) + return start_logits, end_logits - seq_length = modeling.get_shape_list(input_ids)[1] - def compute_loss(logits, positions): - one_hot_positions = tf.one_hot( - positions, depth=seq_length, dtype=tf.float32) - log_probs = tf.nn.log_softmax(logits, axis=-1) - loss = -tf.reduce_mean( - tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) - return loss - - start_positions = [labels] - end_positions = [labels] - - start_loss = compute_loss(start_logits, start_positions) - end_loss = compute_loss(end_logits, end_positions) - - total_loss = (start_loss + end_loss) / 2.0 - - return total_loss, start_logits #''' def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, @@ -282,7 +265,8 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument is_training = (mode == tf.estimator.ModeKeys.TRAIN) - (total_loss, logits) = create_model( + #(total_loss, logits) = create_model( + (start_logits, end_logits) = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, use_one_hot_embeddings, hparams) @@ -311,7 +295,25 @@ def tpu_scaffold(): output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: + #### + seq_length = modeling.get_shape_list(input_ids)[1] + def compute_loss(logits, positions): + one_hot_positions = tf.one_hot( + positions, depth=seq_length, dtype=tf.float32) + log_probs = tf.nn.log_softmax(logits, axis=-1) + loss = -tf.reduce_mean( + tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) + return loss + + start_positions = [labels] + end_positions = [labels] + + start_loss = compute_loss(start_logits, start_positions) + end_loss = compute_loss(end_logits, end_positions) + total_loss = (start_loss + end_loss) / 2.0 + + ### train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu, hvd, amp=use_fp16) @@ -384,8 +386,9 @@ def main(_): if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) - if FLAGS.use_xla: + if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 + is_per_host = tf.contrib.tpu.InputPipelineConfig.PER_HOST_V2 run_config = tf.contrib.tpu.RunConfig( cluster=tpu_cluster_resolver, From 2841acc222b1130d81bd38fa0126e9f693c01f36 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 03:13:15 +0000 Subject: [PATCH 31/77] barrier --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 4e32c632b..8044aba0e 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -463,6 +463,11 @@ def main(_): tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) + if FLAGS.horovod: + barrier.hvd.allreduce(tf.constant(0)) + with tf.Session(config=config) as sess: + sess.run(barrier) + estimator.train( input_fn=train_input_fn, hooks=training_hooks, From ee9cbd2e757714f8281f0fee820ccfedd5ba235d Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 03:17:12 +0000 Subject: [PATCH 32/77] typo --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py 
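# NOTE: horovod_input_fn_builder is an internal fathomt2t helper whose
# implementation is not part of this patch series. The sketch below is only an
# editorial illustration of what a Horovod-aware input_fn usually does (shard
# the dataset so each rank reads a disjoint slice); the function name,
# file_pattern argument, and feature spec here are assumptions, not the
# project's actual code.
import os
import tensorflow as tf
import horovod.tensorflow as hvd

def sharded_input_fn_builder(file_pattern, seq_length, use_hvd):
  """Hypothetical Horovod-aware input_fn: each rank reads a disjoint shard."""
  name_to_features = {
      "input_ids": tf.FixedLenFeature([seq_length], tf.int64),
      "input_mask": tf.FixedLenFeature([seq_length], tf.int64),
      "segment_ids": tf.FixedLenFeature([seq_length], tf.int64),
      "targets": tf.FixedLenFeature([], tf.int64),
  }

  def input_fn(params):
    batch_size = params["batch_size"]
    d = tf.data.TFRecordDataset(tf.gfile.Glob(file_pattern))
    if use_hvd:
      # Disjoint slice per worker so ranks do not train on duplicate examples.
      d = d.shard(hvd.size(), hvd.rank())
    d = d.repeat()
    d = d.shuffle(buffer_size=100)
    d = d.map(lambda record: tf.parse_single_example(record, name_to_features))
    return d.batch(batch_size, drop_remainder=True)

  return input_fn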
b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 8044aba0e..8a568866e 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -464,7 +464,7 @@ def main(_): train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.horovod: - barrier.hvd.allreduce(tf.constant(0)) + barrier = hvd.allreduce(tf.constant(0)) with tf.Session(config=config) as sess: sess.run(barrier) From c49424699279e8ddea5aff8971cf22e7d2f12ec0 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 18:40:05 +0000 Subject: [PATCH 33/77] hvd input fn builder --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 8a568866e..ceab7ae7f 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -461,7 +461,9 @@ def main(_): tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) - train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) + #train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) + train_input_fn = problem.horovod_input_fn_builder( + is_training=True, hvd=None if not FLAGS.horovode else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) From 644583301de45a8ebe542fc5ffb0e224257b1fde Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 18:58:41 +0000 Subject: [PATCH 34/77] typo --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index ceab7ae7f..cefa8c4d1 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -463,7 +463,7 @@ def main(_): tf.logging.info(" Num steps = %d", num_train_steps) #train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) train_input_fn = problem.horovod_input_fn_builder( - is_training=True, hvd=None if not FLAGS.horovode else hvd) + is_training=True, hvd=None if not FLAGS.horovod else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) From 420a0af778a50baf417636b3cfd27b8f194877f0 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 20:27:39 +0000 Subject: [PATCH 35/77] try mock start positions --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index cefa8c4d1..dccb51eaa 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -305,8 +305,7 @@ def compute_loss(logits, positions): tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss - start_positions = [labels] - end_positions = [labels] + start_positions = end_positions = tf.ones_like([tf.shape(start_logits)[0], 1]) start_loss = compute_loss(start_logits, 
start_positions) end_loss = compute_loss(end_logits, end_positions) From a82b153854822260c11ed6f7beb534cddbed4bc1 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 20:37:20 +0000 Subject: [PATCH 36/77] our top --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index dccb51eaa..cd872acbe 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -166,7 +166,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float32) - ''' + #''' # [B, 384, D] body_outputs = model.get_sequence_output() @@ -188,7 +188,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, loss = num / den return loss, top_out['logits'] - + ''' # classifier body_outputs = model.get_pooled_output() hidden_size = body_outputs.shape[-1].value @@ -214,8 +214,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) loss = tf.reduce_mean(per_example_loss) return loss, logits - ''' - #''' + # squad final_hidden = model.get_sequence_output() @@ -305,7 +304,8 @@ def compute_loss(logits, positions): tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) return loss - start_positions = end_positions = tf.ones_like([tf.shape(start_logits)[0], 1]) + start_positions = [labels] + end_positions = [labels] start_loss = compute_loss(start_logits, start_positions) end_loss = compute_loss(end_logits, end_positions) From f75aafadb4ac00e9aec89fd0d348e78c848cf759 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 20:41:10 +0000 Subject: [PATCH 37/77] clean up --- .../LanguageModeling/BERT/finetune_BERT.py | 74 ------------------- 1 file changed, 74 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index cd872acbe..1a8d842dc 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -188,62 +188,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, loss = num / den return loss, top_out['logits'] - ''' - # classifier - body_outputs = model.get_pooled_output() - hidden_size = body_outputs.shape[-1].value - output_weights = tf.get_variable( - "output_weights", [num_labels, hidden_size], - initializer=tf.truncated_normal_initializer(stddev=0.02)) - - output_bias = tf.get_variable( - "output_bias", [num_labels], initializer=tf.zeros_initializer()) - - with tf.variable_scope("loss"): - if is_training: - # I.e., 0.1 dropout - body_outputs = tf.nn.dropout(body_outputs, keep_prob=0.9) - - logits = tf.matmul(body_outputs, output_weights, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - log_probs = tf.nn.log_softmax(logits, axis=-1) - - one_hot_labels = tf.one_hot(labels, depth=num_labels, dtype=tf.float32) - - per_example_loss = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) - loss = tf.reduce_mean(per_example_loss) - return loss, logits - - # squad - final_hidden = model.get_sequence_output() - - final_hidden_shape = modeling.get_shape_list(final_hidden, expected_rank=3) - batch_size = final_hidden_shape[0] - seq_length = final_hidden_shape[1] - 
hidden_size = final_hidden_shape[2] - - output_weights = tf.get_variable( - "cls/squad/output_weights", [2, hidden_size], - initializer=tf.truncated_normal_initializer(stddev=0.02)) - - output_bias = tf.get_variable( - "cls/squad/output_bias", [2], initializer=tf.zeros_initializer()) - - final_hidden_matrix = tf.reshape(final_hidden, - [batch_size * seq_length, hidden_size]) - logits = tf.matmul(final_hidden_matrix, output_weights, transpose_b=True) - logits = tf.nn.bias_add(logits, output_bias) - - logits = tf.reshape(logits, [batch_size, seq_length, 2]) - logits = tf.transpose(logits, [2, 0, 1]) - - unstacked_logits = tf.unstack(logits, axis=0) - - (start_logits, end_logits) = (unstacked_logits[0], unstacked_logits[1]) - return start_logits, end_logits - - #''' def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, @@ -294,25 +239,6 @@ def tpu_scaffold(): output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: - #### - seq_length = modeling.get_shape_list(input_ids)[1] - def compute_loss(logits, positions): - one_hot_positions = tf.one_hot( - positions, depth=seq_length, dtype=tf.float32) - log_probs = tf.nn.log_softmax(logits, axis=-1) - loss = -tf.reduce_mean( - tf.reduce_sum(one_hot_positions * log_probs, axis=-1)) - return loss - - start_positions = [labels] - end_positions = [labels] - - start_loss = compute_loss(start_logits, start_positions) - end_loss = compute_loss(end_logits, end_positions) - - total_loss = (start_loss + end_loss) / 2.0 - - ### train_op = optimization.create_optimizer( total_loss, learning_rate, num_train_steps, num_warmup_steps, use_tpu, hvd, amp=use_fp16) From c5385b1ffb025794d287a819ecbae4bc67e1ccb7 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 21:01:19 +0000 Subject: [PATCH 38/77] loss --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 1a8d842dc..0c35f4ef5 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -209,8 +209,7 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument is_training = (mode == tf.estimator.ModeKeys.TRAIN) - #(total_loss, logits) = create_model( - (start_logits, end_logits) = create_model( + (total_loss, logits) = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, num_labels, use_one_hot_embeddings, hparams) From c4f8e0a17fab568209ffcc0426a1fd28f7b82aae Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 21:27:51 +0000 Subject: [PATCH 39/77] hparams --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 0c35f4ef5..18a802502 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -387,7 +387,8 @@ def main(_): tf.logging.info(" Num steps = %d", num_train_steps) #train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) train_input_fn = problem.horovod_input_fn_builder( - is_training=True, hvd=None if not FLAGS.horovod else hvd) + mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, + hvd=None if not FLAGS.horovod else hvd) 
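The horovod_input_fn_builder called here comes from the project's own problem class and its body is not part of this series. A common way for such a builder to keep Horovod workers from training on identical examples is to shard the tf.data file list by rank; the sketch below is only an illustration of that pattern under those assumptions (TFRecord inputs, hvd being the horovod.tensorflow module when Horovod is enabled), not the actual implementation.

    import tensorflow as tf

    def sharded_input_fn_builder(file_pattern, batch_size, hvd=None):
        """Illustrative sketch: each rank reads a disjoint slice of the files."""
        def input_fn(params=None):
            files = tf.data.Dataset.list_files(file_pattern, shuffle=False)
            if hvd is not None:
                # hvd is assumed to be horovod.tensorflow after hvd.init().
                files = files.shard(hvd.size(), hvd.rank())
            dataset = files.interleave(tf.data.TFRecordDataset, cycle_length=4)
            dataset = dataset.shuffle(1000).repeat()
            # Parsing of the serialized examples is omitted because the
            # problem's feature spec is not shown in this patch series.
            return dataset.batch(batch_size, drop_remainder=True)
        return input_fn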
training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) From 1ed1c76e6da257a71d8e37ecebb08644fa6ab41a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 21:36:26 +0000 Subject: [PATCH 40/77] hvd for partitioning --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 18a802502..d5f5a7058 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -385,10 +385,11 @@ def main(_): tf.logging.info("***** Running training *****") tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) tf.logging.info(" Num steps = %d", num_train_steps) - #train_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.TRAIN, hparams) - train_input_fn = problem.horovod_input_fn_builder( - mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, - hvd=None if not FLAGS.horovod else hvd) + train_input_fn = problem.make_estimator_input_fn( + tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) + #train_input_fn = problem.horovod_input_fn_builder( + #mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, + #hvd=None if not FLAGS.horovod else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) From 0f95ae655bf11cd38cfee639c38ca6edfd80a27a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 7 Jun 2019 22:29:59 +0000 Subject: [PATCH 41/77] flags data dir --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index d5f5a7058..30613dde7 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -353,7 +353,7 @@ def main(_): target_modality = hparams.target_modality problem = hparams.problem - hparams.data_dir = 'gs://fathom-dev-210618-workspace-h5bnpfec/FINETUNE_BERT/2019-06-06T07-57-40.391508+00-00/usr.src.bert.finetune_BERT.py/data_dir' + hparams.data_dir = FLAGS.data_dir ## INGEST #problem.generate_data(FLAGS.data_dir, FLAGS.tmp_dir) From 0e28db1e36b6d7e956f0d1f70b77339b302633b0 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 8 Jun 2019 04:13:24 +0000 Subject: [PATCH 42/77] print out ini --- TensorFlow/LanguageModeling/BERT/run_squad.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/run_squad.py b/TensorFlow/LanguageModeling/BERT/run_squad.py index ba45d923f..f5a2c56d5 100644 --- a/TensorFlow/LanguageModeling/BERT/run_squad.py +++ b/TensorFlow/LanguageModeling/BERT/run_squad.py @@ -679,14 +679,14 @@ def tpu_scaffold(): else: tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - if FLAGS.verbose_logging: - tf.logging.info("**** Trainable Variables ****") - for var in tvars: - init_string = "" - if var.name in initialized_variable_names: - init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, - init_string) + #if FLAGS.verbose_logging: + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", 
*INIT_FROM_CKPT*" + tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, + init_string) output_spec = None From fe8b8253a05798f02d15977724919eefc92396cb Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 8 Jun 2019 04:17:17 +0000 Subject: [PATCH 43/77] oom hook --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 30613dde7..2f9db1150 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -391,6 +391,14 @@ def main(_): #mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, #hvd=None if not FLAGS.horovod else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) + + class OomReportingHook(tf.train.SessionRunHook): + def before_run(self, run_context): + return tf.train.SessionRunArgs(fetches=[], # no extra fetches + options=tf.RunOptions( + report_tensor_allocations_upon_oom=True)) + + training_hooks.append(OomReportingHook) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) with tf.Session(config=config) as sess: From df3c976e6caf64661428edac32e1ae2ead772846 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 8 Jun 2019 06:18:51 +0000 Subject: [PATCH 44/77] move out --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 15 ++++++++------- 1 file changed, 8 insertions(+), 7 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 2f9db1150..c966d7e6b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -152,6 +152,13 @@ def after_run(self, run_context, run_values): self.count += 1 +class _OomReportingHook(tf.train.SessionRunHook): + def before_run(self, run_context): + return tf.train.SessionRunArgs(fetches=[], # no extra fetches + options=tf.RunOptions( + report_tensor_allocations_upon_oom=True)) + + def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, num_labels, use_one_hot_embeddings, hparams): """Creates a classification model.""" @@ -392,13 +399,7 @@ def main(_): #hvd=None if not FLAGS.horovod else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) - class OomReportingHook(tf.train.SessionRunHook): - def before_run(self, run_context): - return tf.train.SessionRunArgs(fetches=[], # no extra fetches - options=tf.RunOptions( - report_tensor_allocations_upon_oom=True)) - - training_hooks.append(OomReportingHook) + training_hooks.append(_OomReportingHook) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) with tf.Session(config=config) as sess: From d50367d2f7ee7aeaff295f92d76c596dcfd75e64 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 8 Jun 2019 07:20:33 +0000 Subject: [PATCH 45/77] instantiate --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index c966d7e6b..891298eb9 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -399,7 +399,7 @@ def main(_): #hvd=None if not FLAGS.horovod else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) - training_hooks.append(_OomReportingHook) + 
training_hooks.append(_OomReportingHook()) if FLAGS.horovod: barrier = hvd.allreduce(tf.constant(0)) with tf.Session(config=config) as sess: From 8501f1487f67cf058024d08dba5b5eafd7cc23aa Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 8 Jun 2019 08:02:33 +0000 Subject: [PATCH 46/77] else --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 891298eb9..7432620d1 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -276,6 +276,8 @@ def main(_): if FLAGS.use_fp16: os.environ["TF_ENABLE_AUTO_MIXED_PRECISION_GRAPH_REWRITE"] = "1" print('Turning on AMP') + else: + print('NOT Turning on AMP') if FLAGS.horovod: hvd.init() From 09fc5fd9edf10f235774d719ffd3236f8e2bfe92 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Wed, 12 Jun 2019 21:04:34 +0000 Subject: [PATCH 47/77] fix --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 7432620d1..f07923262 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -173,7 +173,6 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, token_type_ids=segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float32) - #''' # [B, 384, D] body_outputs = model.get_sequence_output() @@ -184,12 +183,10 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) body_outputs = tf.expand_dims(body_outputs, axis=-2) - features = { - 'targets': labels - } - labels = tf.reshape(labels, [tf.shape(labels)[0], 1, 1, 1]) - top_out = target_modality.top(body_outputs, features) + top_out = target_modality.top(body_outputs, None) + + #labels = tf.expand_dims(tf.expand_dims(labels, axis=-1), axis=-1) num, den = target_modality.loss(top_out, labels) loss = num / den From 3aa196e0c9d00a5b97c6704630ab4f779db02c1a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 13 Jun 2019 02:32:35 +0000 Subject: [PATCH 48/77] eval train loop --- .../LanguageModeling/BERT/finetune_BERT.py | 106 +++++++++--------- 1 file changed, 51 insertions(+), 55 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index f07923262..7a24c44d3 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -56,10 +56,6 @@ "Sequences longer than this will be truncated, and sequences shorter " "than this will be padded.") -flags.DEFINE_bool("do_train", False, "Whether to run training.") - -flags.DEFINE_bool("do_eval", False, "Whether to run eval on the dev set.") - flags.DEFINE_bool( "do_predict", False, "Whether to run the model in inference mode on the test set.") @@ -186,8 +182,6 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, top_out = target_modality.top(body_outputs, None) - #labels = tf.expand_dims(tf.expand_dims(labels, axis=-1), axis=-1) - num, den = target_modality.loss(top_out, labels) loss = num / den @@ -196,7 +190,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, def 
model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, - use_one_hot_embeddings, hparams, hvd=None, use_fp16=False): + use_one_hot_embeddings, hparams, problem, hvd=None, use_fp16=False): """Returns `model_fn` closure for TPUEstimator.""" def model_fn(features, labels, mode, params): # pylint: disable=unused-argument @@ -252,10 +246,15 @@ def tpu_scaffold(): train_op=train_op, scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: + #logits.update({'labels': labels}) + eval_metrics = lambda logits, labels: { + name: call(logits, labels) + for name, call in problem.all_metrics_fns.items() + if name in problem.eval_metrics()} output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, - eval_metrics=problem.eval_metrics, + eval_metrics=(eval_metrics, [logits, labels]), scaffold_fn=scaffold_fn) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( @@ -279,10 +278,6 @@ def main(_): if FLAGS.horovod: hvd.init() - if not FLAGS.do_train and not FLAGS.do_eval and not FLAGS.do_predict: - raise ValueError( - "At least one of `do_train`, `do_eval` or `do_predict' must be True.") - bert_config = modeling.BertConfig.from_json_file(FLAGS.bert_config_file) if FLAGS.max_seq_length > bert_config.max_position_embeddings: @@ -333,11 +328,12 @@ def main(_): per_host_input_for_training=is_per_host)) train_examples = None - num_train_steps = None - num_warmup_steps = None - if FLAGS.do_train: - num_train_steps = 1000 - num_warmup_steps = 1 + num_train_steps = 2000 + num_warmup_steps = 1 + eval_frequency_steps = 100 + assert num_train_steps % eval_frequency_steps == 0 + train_eval_iterations = num_train_steps // eval_frequency_steps + eval_steps = 100 # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
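For the eval branch introduced above, tf.contrib.tpu.TPUEstimatorSpec takes eval_metrics as a (metric_fn, tensor list) pair, and the metric_fn must return a dict mapping metric names to (value, update_op) tuples such as those produced by tf.metrics.*. Whether the entries of problem.all_metrics_fns already return tuples in that form depends on the problem class and is not visible here. A minimal self-contained metric_fn, assuming integer class-id labels, would be:

    import tensorflow as tf

    def metric_fn(logits, labels):
        # tf.metrics.accuracy returns the (value, update_op) pair that
        # TPUEstimatorSpec expects for each dict entry.
        predictions = tf.argmax(logits, axis=-1, output_type=tf.int32)
        return {"accuracy": tf.metrics.accuracy(labels=labels,
                                                predictions=predictions)}

    # used as: eval_metrics=(metric_fn, [logits, labels])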
@@ -365,7 +361,7 @@ def main(_): model_fn = model_fn_builder( bert_config=bert_config, - num_labels=10, + num_labels=problem.label_manager, init_checkpoint=FLAGS.init_checkpoint, learning_rate=learning_rate, num_train_steps=num_train_steps, @@ -373,6 +369,7 @@ def main(_): use_tpu=FLAGS.use_tpu, use_one_hot_embeddings=FLAGS.use_tpu, hparams=hparams, + problem=problem, hvd=None if not FLAGS.horovod else hvd, use_fp16=FLAGS.use_fp16) @@ -387,43 +384,42 @@ def main(_): eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) - if FLAGS.do_train: - tf.logging.info("***** Running training *****") - tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) - tf.logging.info(" Num steps = %d", num_train_steps) - train_input_fn = problem.make_estimator_input_fn( - tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) - #train_input_fn = problem.horovod_input_fn_builder( - #mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, - #hvd=None if not FLAGS.horovod else hvd) - training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) - - training_hooks.append(_OomReportingHook()) - if FLAGS.horovod: - barrier = hvd.allreduce(tf.constant(0)) - with tf.Session(config=config) as sess: - sess.run(barrier) - - estimator.train( - input_fn=train_input_fn, - hooks=training_hooks, - max_steps=num_train_steps) - - if FLAGS.do_eval: - tf.logging.info("***** Running evaluation *****") - tf.logging.info(" Batch size = %d", FLAGS.eval_batch_size) - - eval_input_fn = problem.make_estimator_input_fn(tf.estimator.ModeKeys.EVAL, hparams) - - eval_steps = 1000 - result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) - - output_eval_file = os.path.join(FLAGS.output_dir, "eval_results.txt") - with tf.gfile.GFile(output_eval_file, "w") as writer: - tf.logging.info("***** Eval results *****") - for key in sorted(result.keys()): - tf.logging.info(" %s = %s", key, str(result[key])) - writer.write("%s = %s\n" % (key, str(result[key]))) + tf.logging.info("***** Running training *****") + tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + tf.logging.info(" Num steps = %d", num_train_steps) + train_input_fn = problem.make_estimator_input_fn( + tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) + #train_input_fn = problem.horovod_input_fn_builder( + #mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, + #hvd=None if not FLAGS.horovod else hvd) + training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) + + #training_hooks.append(_OomReportingHook()) + + eval_input_fn = problem.make_estimator_input_fn( + tf.estimator.ModeKeys.EVAL, + hparams, + None if not FLAGS.horovod else hvd) + + if FLAGS.horovod: + barrier = hvd.allreduce(tf.constant(0)) + with tf.Session(config=config) as sess: + sess.run(barrier) + + # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 + for n in range(train_eval_iterations): + if not FLAGS.horovod or hvd.rank() != 0: + estimator.train( + input_fn=train_input_fn, + hooks=training_hooks, + # TODO: LR dependent on train steps, are we resetting this every time then? 
+ steps=num_train_steps) + + if not FLAGS.horovod or hvd.rank() == 0: + result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + tf.logging.info("***** Eval results *****") + for key in sorted(result.keys()): + tf.logging.info(" %s = %s", key, str(result[key])) if __name__ == "__main__": From 081a1a720c0c35e313dc923ee82c981ef8457567 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 13 Jun 2019 02:33:51 +0000 Subject: [PATCH 49/77] do not need num labels --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 7a24c44d3..ca8d17e6b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -156,7 +156,7 @@ def before_run(self, run_context): def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, - labels, num_labels, use_one_hot_embeddings, hparams): + labels, use_one_hot_embeddings, hparams): """Creates a classification model.""" target_modality = hparams.problem_hparams.target_modality input_modality = hparams.problem_hparams.input_modality @@ -188,7 +188,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, return loss, top_out['logits'] -def model_fn_builder(bert_config, num_labels, init_checkpoint, learning_rate, +def model_fn_builder(bert_config, init_checkpoint, learning_rate, num_train_steps, num_warmup_steps, use_tpu, use_one_hot_embeddings, hparams, problem, hvd=None, use_fp16=False): """Returns `model_fn` closure for TPUEstimator.""" @@ -209,7 +209,7 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument (total_loss, logits) = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, - num_labels, use_one_hot_embeddings, hparams) + use_one_hot_embeddings, hparams) tvars = tf.trainable_variables() initialized_variable_names = {} @@ -361,7 +361,6 @@ def main(_): model_fn = model_fn_builder( bert_config=bert_config, - num_labels=problem.label_manager, init_checkpoint=FLAGS.init_checkpoint, learning_rate=learning_rate, num_train_steps=num_train_steps, From 9ce757ded4bec88623bed32919787f41c3bdbc91 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 13 Jun 2019 19:27:33 +0000 Subject: [PATCH 50/77] barrier between train and eval --- .../LanguageModeling/BERT/finetune_BERT.py | 33 ++++++++++--------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index ca8d17e6b..e8b0d1335 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -383,9 +383,10 @@ def main(_): eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) - tf.logging.info("***** Running training *****") - tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) - tf.logging.info(" Num steps = %d", num_train_steps) + tf.logging.info("***** Running training *****", + hvd.rank() if FLAGS.horovod else 'no hvd', ) + #tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) + #tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) #train_input_fn = problem.horovod_input_fn_builder( @@ -400,22 +401,24 @@ def main(_): hparams, None if 
not FLAGS.horovod else hvd) - if FLAGS.horovod: - barrier = hvd.allreduce(tf.constant(0)) - with tf.Session(config=config) as sess: - sess.run(barrier) - # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 for n in range(train_eval_iterations): - if not FLAGS.horovod or hvd.rank() != 0: - estimator.train( - input_fn=train_input_fn, - hooks=training_hooks, - # TODO: LR dependent on train steps, are we resetting this every time then? - steps=num_train_steps) + estimator.train( + input_fn=train_input_fn, + hooks=training_hooks, + # TODO: LR dependent on train steps, are we resetting this every time then? + steps=num_train_steps) + + if FLAGS.horovod: + barrier = hvd.allreduce(tf.constant(0)) + with tf.Session(config=config) as sess: + sess.run(barrier) if not FLAGS.horovod or hvd.rank() == 0: - result = estimator.evaluate(input_fn=eval_input_fn, steps=eval_steps) + result = estimator.evaluate( + input_fn=eval_input_fn, + steps=eval_steps, + hooks=[_LogEvalRunHook() tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) From 7bac20c0ed891d771006e8a49a1225de601ac29b Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 13 Jun 2019 22:43:53 +0000 Subject: [PATCH 51/77] fix --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index e8b0d1335..0c0d4c63d 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -418,7 +418,7 @@ def main(_): result = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, - hooks=[_LogEvalRunHook() + hooks=[_LogEvalRunHook(FLAGS.eval_batch_size)]) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) From 7050721b2d33504b1961e98d45764fc3c498fde3 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 13 Jun 2019 23:06:41 +0000 Subject: [PATCH 52/77] use master process --- .../LanguageModeling/BERT/finetune_BERT.py | 20 +++++++------------ 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 0c0d4c63d..380d12b47 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -288,6 +288,7 @@ def main(_): tf.gfile.MakeDirs(FLAGS.output_dir) + master_process = True training_hooks = [] global_batch_size = FLAGS.train_batch_size hvd_rank = 0 @@ -383,15 +384,8 @@ def main(_): eval_batch_size=FLAGS.eval_batch_size, predict_batch_size=FLAGS.predict_batch_size) - tf.logging.info("***** Running training *****", - hvd.rank() if FLAGS.horovod else 'no hvd', ) - #tf.logging.info(" Batch size = %d", FLAGS.train_batch_size) - #tf.logging.info(" Num steps = %d", num_train_steps) train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) - #train_input_fn = problem.horovod_input_fn_builder( - #mode=tf.estimator.ModeKeys.TRAIN, hparams=hparams, - #hvd=None if not FLAGS.horovod else hvd) training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) #training_hooks.append(_OomReportingHook()) @@ -403,18 +397,18 @@ def main(_): # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 for n in range(train_eval_iterations): + if 
master_process: + tf.logging.info("***** Running training *****", + hvd.rank() if FLAGS.horovod else 'no hvd') + # TODO: verify we are not reloading bert every time estimator.train( input_fn=train_input_fn, hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? steps=num_train_steps) - if FLAGS.horovod: - barrier = hvd.allreduce(tf.constant(0)) - with tf.Session(config=config) as sess: - sess.run(barrier) - - if not FLAGS.horovod or hvd.rank() == 0: + if master_process: + tf.logging.info("***** Running eval *****") result = estimator.evaluate( input_fn=eval_input_fn, steps=eval_steps, From 7f6fa4af12afb4581e95635477ab22bcbcc27bb7 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 14 Jun 2019 00:07:56 +0000 Subject: [PATCH 53/77] clean up print --- .../LanguageModeling/BERT/finetune_BERT.py | 111 +++++++++++------- 1 file changed, 66 insertions(+), 45 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 380d12b47..c0879528d 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -45,11 +45,6 @@ "init_checkpoint", None, "Initial checkpoint (usually from a pre-trained BERT model).") -flags.DEFINE_bool( - "do_lower_case", True, - "Whether to lower case the input text. Should be True for uncased " - "models and False for cased models.") - flags.DEFINE_integer( "max_seq_length", 128, "The maximum total input sequence length after WordPiece tokenization. " @@ -73,9 +68,8 @@ "Total number of training epochs to perform.") flags.DEFINE_float( - "warmup_proportion", 0.1, - "Proportion of training to perform linear learning rate warmup for. " - "E.g., 0.1 = 10% of training.") + "warmup_steps", 10, + "Number of training steps to perform linear learning rate warmup for. ") flags.DEFINE_integer("save_checkpoints_steps", 1000, "How often to save the model checkpoint.") @@ -115,37 +109,57 @@ # report samples/sec, total loss and learning rate during training -class _LogEvalRunHook(tf.train.SessionRunHook): - def __init__(self, global_batch_size, hvd_rank=-1): +class _LogSessionRunHook(tf.train.SessionRunHook): + def __init__(self, global_batch_size, display_every=10, hvd_rank=-1): self.global_batch_size = global_batch_size + self.display_every = display_every self.hvd_rank = hvd_rank - self.total_time = 0.0 - self.count = 0 - - def before_run(self, run_context): - self.t0 = time.time() - - def after_run(self, run_context, run_values): - elapsed_secs = time.time() - self.t0 - self.total_time += elapsed_secs - self.count += 1 - -# report samples/sec, total loss and learning rate during training -class _LogTrainRunHook(tf.train.SessionRunHook): - def __init__(self, global_batch_size, hvd_rank=-1): - self.global_batch_size = global_batch_size - self.hvd_rank = hvd_rank - self.total_time = 0.0 + def after_create_session(self, session, coord): + if FLAGS.use_fp16: + print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate Loss-scaler') + else: + print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate') + self.elapsed_secs = 0. 
self.count = 0 - def before_run(self, run_context): self.t0 = time.time() - return tf.train.SessionRunArgs( - fetches=['step_update:0']) + if FLAGS.use_fp16: + return tf.train.SessionRunArgs( + fetches=['step_update:0', 'total_loss:0', + 'learning_rate:0', 'nsp_loss:0', + 'mlm_loss:0', 'loss_scale:0']) + else: + return tf.train.SessionRunArgs( + fetches=['step_update:0', 'total_loss:0', + 'learning_rate:0', 'nsp_loss:0', + 'mlm_loss:0']) def after_run(self, run_context, run_values): - elapsed_secs = time.time() - self.t0 - self.total_time += elapsed_secs + self.elapsed_secs += time.time() - self.t0 self.count += 1 + if FLAGS.use_fp16: + global_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler = run_values.results + else: + global_step, total_loss, lr, nsp_loss, mlm_loss = run_values.results + print_step = global_step + 1 # One-based index for printing. + if print_step == 1 or print_step % self.display_every == 0: + dt = self.elapsed_secs / self.count + img_per_sec = self.global_batch_size / dt + if self.hvd_rank >= 0: + if FLAGS.use_fp16: + print('%2d :: %6i %11.1f %10.4e %10.4e %6.3f %6.4e %6.4e' % + (self.hvd_rank, print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr, loss_scaler)) + else: + print('%2d :: %6i %11.1f %10.4e %10.4e %6.3f %6.4e' % + (self.hvd_rank, print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr)) + else: + if FLAGS.use_fp16: + print('%6i %11.1f %10.4e %10.4e %6.3f %6.4e %6.4e' % + (print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr, loss_scaler)) + else: + print('%6i %11.1f %10.4e %10.4e %6.3f %6.4e' % + (print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr)) + self.elapsed_secs = 0. + self.count = 0 class _OomReportingHook(tf.train.SessionRunHook): @@ -286,11 +300,23 @@ def main(_): "was only trained up to sequence length %d" % (FLAGS.max_seq_length, bert_config.max_position_embeddings)) + # train config + global_batch_size = FLAGS.train_batch_size + # max train steps + num_train_steps = 1e7 + num_warmup_steps = FLAGS.warmup_steps + eval_steps = 100 + eval_frequency_steps = 100 + + if FLAGS.horovod: + num_train_steps //= hvd.size() + num_warmup_steps //= hvd.size() + tf.gfile.MakeDirs(FLAGS.output_dir) master_process = True training_hooks = [] - global_batch_size = FLAGS.train_batch_size + hvd_rank = 0 tpu_cluster_resolver = None @@ -312,6 +338,9 @@ def main(_): if hvd.size() > 1: training_hooks.append(hvd.BroadcastGlobalVariablesHook(0)) + num_train_steps //= hvd.size() + num_warmup_steps //= hvd.size() + if FLAGS.use_xla: config.graph_options.optimizer_options.global_jit_level = tf.OptimizerOptions.ON_1 @@ -328,14 +357,6 @@ def main(_): num_shards=FLAGS.num_tpu_cores, per_host_input_for_training=is_per_host)) - train_examples = None - num_train_steps = 2000 - num_warmup_steps = 1 - eval_frequency_steps = 100 - assert num_train_steps % eval_frequency_steps == 0 - train_eval_iterations = num_train_steps // eval_frequency_steps - eval_steps = 100 - # If TPU is not available, this will fall back to normal Estimator on CPU # or GPU. 
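The _LogSessionRunHook above asks the session for tensors by graph name ('total_loss:0', 'learning_rate:0', and so on), so those names must actually exist in the graph; giving a value a stable name is typically arranged with tf.identity(..., name=...) inside model_fn. A stripped-down sketch of the same fetch-by-name pattern, without the pretraining-specific fields, is:

    import tensorflow as tf

    class LossLoggingHook(tf.train.SessionRunHook):
        """Minimal sketch: report a named loss tensor every N steps."""

        def __init__(self, every_n_steps=100):
            self._every_n_steps = every_n_steps
            self._step = 0

        def before_run(self, run_context):
            # 'total_loss:0' only resolves if model_fn created a tensor with
            # that name, e.g. tf.identity(total_loss, name='total_loss').
            return tf.train.SessionRunArgs(fetches=['total_loss:0'])

        def after_run(self, run_context, run_values):
            self._step += 1
            if self._step % self._every_n_steps == 0:
                print('step %d  loss %.4f' % (self._step, run_values.results[0]))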
from tensor2tensor.utils.trainer_lib import create_hparams, add_problem_hparams @@ -386,7 +407,7 @@ def main(_): train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) - training_hooks.append(_LogTrainRunHook(global_batch_size, hvd_rank)) + training_hooks.append(_LogSessionRunHook(global_batch_size, 10, -1 if not FLAGS.horovod else hvd_rank)) #training_hooks.append(_OomReportingHook()) @@ -396,7 +417,8 @@ def main(_): None if not FLAGS.horovod else hvd) # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 - for n in range(train_eval_iterations): + # TODO: replace with ValidationMonitor and EarlyStoppingHook + for i in range(2): if master_process: tf.logging.info("***** Running training *****", hvd.rank() if FLAGS.horovod else 'no hvd') @@ -411,8 +433,7 @@ def main(_): tf.logging.info("***** Running eval *****") result = estimator.evaluate( input_fn=eval_input_fn, - steps=eval_steps, - hooks=[_LogEvalRunHook(FLAGS.eval_batch_size)]) + steps=eval_steps) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) From 5c97155f7d6fabaa7f5e0d8b6a7e2b7716fb59f5 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 14 Jun 2019 01:41:00 +0000 Subject: [PATCH 54/77] fix hooks --- .../LanguageModeling/BERT/finetune_BERT.py | 63 +++++++++---------- 1 file changed, 29 insertions(+), 34 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index c0879528d..b8f1c5876 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -64,10 +64,7 @@ flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs") -flags.DEFINE_float("num_train_epochs", 3.0, - "Total number of training epochs to perform.") - -flags.DEFINE_float( +flags.DEFINE_integer( "warmup_steps", 10, "Number of training steps to perform linear learning rate warmup for. ") @@ -114,50 +111,51 @@ def __init__(self, global_batch_size, display_every=10, hvd_rank=-1): self.global_batch_size = global_batch_size self.display_every = display_every self.hvd_rank = hvd_rank + def after_create_session(self, session, coord): - if FLAGS.use_fp16: - print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate Loss-scaler') - else: - print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate') + if self.hvd_rank <= 0: + if FLAGS.use_fp16: + print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate Loss-scaler') + else: + print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate') self.elapsed_secs = 0. 
self.count = 0 + def before_run(self, run_context): self.t0 = time.time() if FLAGS.use_fp16: return tf.train.SessionRunArgs( fetches=['step_update:0', 'total_loss:0', - 'learning_rate:0', 'nsp_loss:0', - 'mlm_loss:0', 'loss_scale:0']) + 'learning_rate:0', 'loss_scale:0']) else: return tf.train.SessionRunArgs( - fetches=['step_update:0', 'total_loss:0', - 'learning_rate:0', 'nsp_loss:0', - 'mlm_loss:0']) + fetches=['step_update:0', 'total_loss:0', 'learning_rate:0']) + def after_run(self, run_context, run_values): self.elapsed_secs += time.time() - self.t0 self.count += 1 if FLAGS.use_fp16: - global_step, total_loss, lr, nsp_loss, mlm_loss, loss_scaler = run_values.results + global_step, total_loss, lr, loss_scaler = run_values.results else: - global_step, total_loss, lr, nsp_loss, mlm_loss = run_values.results + global_step, total_loss, lr = run_values.results print_step = global_step + 1 # One-based index for printing. if print_step == 1 or print_step % self.display_every == 0: dt = self.elapsed_secs / self.count img_per_sec = self.global_batch_size / dt if self.hvd_rank >= 0: if FLAGS.use_fp16: - print('%2d :: %6i %11.1f %10.4e %10.4e %6.3f %6.4e %6.4e' % - (self.hvd_rank, print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr, loss_scaler)) + print('%2d :: %6i %11.1f %6.3f %6.4e %6.4e' % + (self.hvd_rank, print_step, img_per_sec, total_loss, lr, loss_scaler)) else: - print('%2d :: %6i %11.1f %10.4e %10.4e %6.3f %6.4e' % - (self.hvd_rank, print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr)) + print('%2d :: %6i %11.1f %10.4e %6.3f %6.4e' % + (self.hvd_rank, print_step, img_per_sec, mlm_loss, total_loss, lr)) else: if FLAGS.use_fp16: - print('%6i %11.1f %10.4e %10.4e %6.3f %6.4e %6.4e' % - (print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr, loss_scaler)) + print('%6i %11.1f %6.3f %6.4e %6.4e' % + (print_step, img_per_sec, total_loss, lr, loss_scaler)) else: - print('%6i %11.1f %10.4e %10.4e %6.3f %6.4e' % - (print_step, img_per_sec, mlm_loss, nsp_loss, total_loss, lr)) + print('%6i %11.1f %6.3f %6.4e' % + (print_step, img_per_sec, total_loss, lr)) self.elapsed_secs = 0. 
self.count = 0 @@ -224,6 +222,8 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument (total_loss, logits) = create_model( bert_config, is_training, input_ids, input_mask, segment_ids, label_ids, use_one_hot_embeddings, hparams) + # for logging hook to pick up + total_loss = tf.identity(total_loss, name='total_loss') tvars = tf.trainable_variables() initialized_variable_names = {} @@ -308,16 +308,12 @@ def main(_): eval_steps = 100 eval_frequency_steps = 100 - if FLAGS.horovod: - num_train_steps //= hvd.size() - num_warmup_steps //= hvd.size() - tf.gfile.MakeDirs(FLAGS.output_dir) master_process = True training_hooks = [] - hvd_rank = 0 + hvd_rank = -1 tpu_cluster_resolver = None if FLAGS.use_tpu and FLAGS.tpu_name: @@ -331,8 +327,8 @@ def main(_): tf.logging.info("hvd.size() = %d hvd.rank() = %d", hvd.size(), hvd.rank()) global_batch_size = FLAGS.train_batch_size * hvd.size() learning_rate = learning_rate * hvd.size() - master_process = (hvd.rank() == 0) hvd_rank = hvd.rank() + master_process = (hvd_rank == 0) config.gpu_options.allow_growth = True config.gpu_options.visible_device_list = str(hvd.local_rank()) if hvd.size() > 1: @@ -350,7 +346,7 @@ def main(_): master=FLAGS.master, model_dir=FLAGS.output_dir, session_config=config, - save_checkpoints_steps=FLAGS.save_checkpoints_steps, + save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, log_step_count_steps=1, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, @@ -407,7 +403,7 @@ def main(_): train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) - training_hooks.append(_LogSessionRunHook(global_batch_size, 10, -1 if not FLAGS.horovod else hvd_rank)) + training_hooks.append(_LogSessionRunHook(global_batch_size, 10, hvd_rank)) #training_hooks.append(_OomReportingHook()) @@ -420,14 +416,13 @@ def main(_): # TODO: replace with ValidationMonitor and EarlyStoppingHook for i in range(2): if master_process: - tf.logging.info("***** Running training *****", - hvd.rank() if FLAGS.horovod else 'no hvd') + tf.logging.info("***** Running training ***** " + str(hvd_rank)) # TODO: verify we are not reloading bert every time estimator.train( input_fn=train_input_fn, hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? - steps=num_train_steps) + steps=eval_frequency_steps) if master_process: tf.logging.info("***** Running eval *****") From 5a5ee89720ff06c153f6dbace1b8a175a9919b56 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 14 Jun 2019 02:00:19 +0000 Subject: [PATCH 55/77] clean --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index b8f1c5876..a8433c643 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -115,9 +115,9 @@ def __init__(self, global_batch_size, display_every=10, hvd_rank=-1): def after_create_session(self, session, coord): if self.hvd_rank <= 0: if FLAGS.use_fp16: - print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate Loss-scaler') + print(' Step samples/sec Loss Learning-rate Loss-scaler') else: - print(' Step samples/sec MLM Loss NSP Loss Loss Learning-rate') + print(' Step samples/sec Loss Learning-rate') self.elapsed_secs = 0. 
self.count = 0 @@ -147,8 +147,8 @@ def after_run(self, run_context, run_values): print('%2d :: %6i %11.1f %6.3f %6.4e %6.4e' % (self.hvd_rank, print_step, img_per_sec, total_loss, lr, loss_scaler)) else: - print('%2d :: %6i %11.1f %10.4e %6.3f %6.4e' % - (self.hvd_rank, print_step, img_per_sec, mlm_loss, total_loss, lr)) + print('%2d :: %6i %11.1f %6.3f %6.4e' % + (self.hvd_rank, print_step, img_per_sec, total_loss, lr)) else: if FLAGS.use_fp16: print('%6i %11.1f %6.3f %6.4e %6.4e' % From b028fc7d0f0a4ee28ed3d1741fc112ceb32ce3ce Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 14 Jun 2019 02:25:21 +0000 Subject: [PATCH 56/77] no eval steps --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index a8433c643..b38d7576f 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -305,7 +305,6 @@ def main(_): # max train steps num_train_steps = 1e7 num_warmup_steps = FLAGS.warmup_steps - eval_steps = 100 eval_frequency_steps = 100 tf.gfile.MakeDirs(FLAGS.output_dir) @@ -426,9 +425,7 @@ def main(_): if master_process: tf.logging.info("***** Running eval *****") - result = estimator.evaluate( - input_fn=eval_input_fn, - steps=eval_steps) + result = estimator.evaluate(input_fn=eval_input_fn) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) From dd63f694be78a57924576ae8a12cf22c656f1cae Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 15 Jun 2019 00:25:05 +0000 Subject: [PATCH 57/77] fix logging --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index b38d7576f..90bb0868e 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -64,6 +64,10 @@ flags.DEFINE_float("learning_rate", 5e-5, "The initial learning rate for Adam.") flags.DEFINE_bool("horovod", False, "Whether to use Horovod for multi-gpu runs") +flags.DEFINE_integer( + "eval_frequency_steps", 10, + "Number of training steps per gpu between evals.") + flags.DEFINE_integer( "warmup_steps", 10, "Number of training steps to perform linear learning rate warmup for. 
") @@ -305,7 +309,7 @@ def main(_): # max train steps num_train_steps = 1e7 num_warmup_steps = FLAGS.warmup_steps - eval_frequency_steps = 100 + eval_frequency_steps = FLAGS.eval_frequency_steps tf.gfile.MakeDirs(FLAGS.output_dir) @@ -346,7 +350,7 @@ def main(_): model_dir=FLAGS.output_dir, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, - log_step_count_steps=1, + log_step_count_steps=100000000000, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, num_shards=FLAGS.num_tpu_cores, @@ -402,7 +406,7 @@ def main(_): train_input_fn = problem.make_estimator_input_fn( tf.estimator.ModeKeys.TRAIN, hparams, None if not FLAGS.horovod else hvd) - training_hooks.append(_LogSessionRunHook(global_batch_size, 10, hvd_rank)) + training_hooks.append(_LogSessionRunHook(global_batch_size, 100, hvd_rank)) #training_hooks.append(_OomReportingHook()) From 6d9fd1b3750038601f2042723e95a04d788b319a Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 15 Jun 2019 00:52:16 +0000 Subject: [PATCH 58/77] jsut eval --- .../LanguageModeling/BERT/finetune_BERT.py | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 90bb0868e..ca2a49e7b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -265,14 +265,16 @@ def tpu_scaffold(): scaffold_fn=scaffold_fn) elif mode == tf.estimator.ModeKeys.EVAL: #logits.update({'labels': labels}) - eval_metrics = lambda logits, labels: { - name: call(logits, labels) - for name, call in problem.all_metrics_fns.items() - if name in problem.eval_metrics()} + def metric_fn(logits, labels): + return { + name: call(logits, labels) + for name, call in problem.all_metrics_fns.items() + if name in problem.eval_metrics()} + output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, - eval_metrics=(eval_metrics, [logits, labels]), + eval_metrics=(metric_fn, [logits, labels]), scaffold_fn=scaffold_fn) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( @@ -419,13 +421,15 @@ def main(_): # TODO: replace with ValidationMonitor and EarlyStoppingHook for i in range(2): if master_process: - tf.logging.info("***** Running training ***** " + str(hvd_rank)) + tf.logging.info("***** Running training *****") # TODO: verify we are not reloading bert every time + ''' estimator.train( input_fn=train_input_fn, hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? 
steps=eval_frequency_steps) + ''' if master_process: tf.logging.info("***** Running eval *****") From a973d8d51ccb2fa0d9895b0b2169eecb3bd108bd Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sat, 15 Jun 2019 01:03:24 +0000 Subject: [PATCH 59/77] steps = None --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index ca2a49e7b..84e430e0b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -433,7 +433,7 @@ def main(_): if master_process: tf.logging.info("***** Running eval *****") - result = estimator.evaluate(input_fn=eval_input_fn) + result = estimator.evaluate(input_fn=eval_input_fn, steps=None) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) From 834adecb8c4a92622d2ce2ace04bec1ce89bbdc3 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sun, 16 Jun 2019 07:52:57 +0000 Subject: [PATCH 60/77] update_op as second --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 84e430e0b..981bc441f 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -266,8 +266,13 @@ def tpu_scaffold(): elif mode == tf.estimator.ModeKeys.EVAL: #logits.update({'labels': labels}) def metric_fn(logits, labels): + + def get_update_op(_metric_fn, logits, labels): + update_op, _ = _metric_fn(logits, labels) + return tf.constant(0.0), update_op + return { - name: call(logits, labels) + name: get_update_op(call, logits, labels) for name, call in problem.all_metrics_fns.items() if name in problem.eval_metrics()} From 63b00459706c1fd75a3162a527469601a701c404 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sun, 16 Jun 2019 08:55:34 +0000 Subject: [PATCH 61/77] put train back --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 981bc441f..ed3362f5c 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -424,17 +424,15 @@ def main(_): # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 # TODO: replace with ValidationMonitor and EarlyStoppingHook - for i in range(2): + for i in range(10): if master_process: tf.logging.info("***** Running training *****") # TODO: verify we are not reloading bert every time - ''' estimator.train( input_fn=train_input_fn, hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? 
steps=eval_frequency_steps) - ''' if master_process: tf.logging.info("***** Running eval *****") From 927ec70e7c43f6a15647c41dc6db8dfac5a9194e Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sun, 16 Jun 2019 10:30:01 +0000 Subject: [PATCH 62/77] only init on first loop --- .../LanguageModeling/BERT/finetune_BERT.py | 67 +++++++++++-------- 1 file changed, 38 insertions(+), 29 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index ed3362f5c..41cb851ba 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -171,6 +171,31 @@ def before_run(self, run_context): report_tensor_allocations_upon_oom=True)) +class InitBertHook(tf.train.SessionRunHook): + def __init__(self, initialize_bert, init_checkpoint, hvd = None): + self._initialize_bert = initialize_bert + self._init_checkpoint = init_checkpoint + self._hvd = hvd + + def begin(self): + if not self._initialize_bert: + return + + tvars = tf.trainable_variables() + initialized_variable_names = {} + if self._init_checkpoint and (self._hvd is None or self._hvd.rank() == 0): + (assignment_map, initialized_variable_names + ) = modeling.get_assignment_map_from_checkpoint(tvars, self._init_checkpoint) + tf.train.init_from_checkpoint(self._init_checkpoint, assignment_map) + + tf.logging.info("**** Trainable Variables ****") + for var in tvars: + init_string = "" + if var.name in initialized_variable_names: + init_string = ", *INIT_FROM_CKPT*" + tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) + + def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, labels, use_one_hot_embeddings, hparams): """Creates a classification model.""" @@ -204,7 +229,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, return loss, top_out['logits'] -def model_fn_builder(bert_config, init_checkpoint, learning_rate, +def model_fn_builder(bert_config, learning_rate, num_train_steps, num_warmup_steps, use_tpu, use_one_hot_embeddings, hparams, problem, hvd=None, use_fp16=False): """Returns `model_fn` closure for TPUEstimator.""" @@ -229,29 +254,6 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument # for logging hook to pick up total_loss = tf.identity(total_loss, name='total_loss') - tvars = tf.trainable_variables() - initialized_variable_names = {} - scaffold_fn = None - if init_checkpoint and (hvd is None or hvd.rank() == 0): - (assignment_map, initialized_variable_names - ) = modeling.get_assignment_map_from_checkpoint(tvars, init_checkpoint) - if use_tpu: - - def tpu_scaffold(): - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - return tf.train.Scaffold() - - scaffold_fn = tpu_scaffold - else: - tf.train.init_from_checkpoint(init_checkpoint, assignment_map) - - tf.logging.info("**** Trainable Variables ****") - for var in tvars: - init_string = "" - if var.name in initialized_variable_names: - init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) - output_spec = None if mode == tf.estimator.ModeKeys.TRAIN: train_op = optimization.create_optimizer( @@ -262,7 +264,7 @@ def tpu_scaffold(): mode=mode, loss=total_loss, train_op=train_op, - scaffold_fn=scaffold_fn) + scaffold_fn=None) elif mode == tf.estimator.ModeKeys.EVAL: #logits.update({'labels': labels}) def 
metric_fn(logits, labels): @@ -280,12 +282,12 @@ def get_update_op(_metric_fn, logits, labels): mode=mode, loss=total_loss, eval_metrics=(metric_fn, [logits, labels]), - scaffold_fn=scaffold_fn) + scaffold_fn=None) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, predictions={"probabilities": probabilities}, - scaffold_fn=scaffold_fn) + scaffold_fn=None) return output_spec return model_fn @@ -357,6 +359,7 @@ def main(_): model_dir=FLAGS.output_dir, session_config=config, save_checkpoints_steps=FLAGS.save_checkpoints_steps if master_process else None, + # so we only use our hook log_step_count_steps=100000000000, tpu_config=tf.contrib.tpu.TPUConfig( iterations_per_loop=FLAGS.iterations_per_loop, @@ -389,7 +392,6 @@ def main(_): model_fn = model_fn_builder( bert_config=bert_config, - init_checkpoint=FLAGS.init_checkpoint, learning_rate=learning_rate, num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, @@ -425,9 +427,16 @@ def main(_): # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 # TODO: replace with ValidationMonitor and EarlyStoppingHook for i in range(10): + init_bert_hook = InitBertHook( + initialize_bert=(i == 0), + init_checkpoint=FLAGS.init_checkpoint, + hvd=hvd) + if master_process: tf.logging.info("***** Running training *****") - # TODO: verify we are not reloading bert every time + # TODO: move init from checkpoint to a InitHook + # should restore parts of the graph on the begin call but only + # on first loop estimator.train( input_fn=train_input_fn, hooks=training_hooks, From 1e3fed12dded1652c3e7b291ccf1f475cd145234 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Sun, 16 Jun 2019 10:30:50 +0000 Subject: [PATCH 63/77] only init on first loop --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 1 + 1 file changed, 1 insertion(+) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 41cb851ba..553bddfb5 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -427,6 +427,7 @@ def main(_): # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 # TODO: replace with ValidationMonitor and EarlyStoppingHook for i in range(10): + # TODO: we should use a check on model_dir to decide if we initialize_bert init_bert_hook = InitBertHook( initialize_bert=(i == 0), init_checkpoint=FLAGS.init_checkpoint, From ea2434c51f68cda7709028c0a01cc199b62a94b4 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Tue, 18 Jun 2019 02:22:37 +0000 Subject: [PATCH 64/77] just eval --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 553bddfb5..51468a9d5 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -266,7 +266,8 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument train_op=train_op, scaffold_fn=None) elif mode == tf.estimator.ModeKeys.EVAL: - #logits.update({'labels': labels}) + + print('logits', logits, labels) def metric_fn(logits, labels): def get_update_op(_metric_fn, logits, labels): @@ -426,7 +427,9 @@ def main(_): # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 # TODO: replace with ValidationMonitor and EarlyStoppingHook - for i in range(10): + #for i in range(10): + for i in [0]: + ''' # TODO: 
we should use a check on model_dir to decide if we initialize_bert init_bert_hook = InitBertHook( initialize_bert=(i == 0), @@ -443,7 +446,7 @@ def main(_): hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? steps=eval_frequency_steps) - + ''' if master_process: tf.logging.info("***** Running eval *****") result = estimator.evaluate(input_fn=eval_input_fn, steps=None) From 26fdb5b629abf36a7fdcbca71974bfde15235c88 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Wed, 19 Jun 2019 19:01:08 +0000 Subject: [PATCH 65/77] clean up --- .../LanguageModeling/BERT/finetune_BERT.py | 24 +++++++++++-------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 51468a9d5..406d19d3a 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -13,6 +13,9 @@ import horovod.tensorflow as hvd import time +from fathomtf.utils.tfutils import debug_tfprint + + flags = tf.flags FLAGS = flags.FLAGS @@ -267,22 +270,23 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument scaffold_fn=None) elif mode == tf.estimator.ModeKeys.EVAL: - print('logits', logits, labels) - def metric_fn(logits, labels): + #logits = debug_tfprint('logits', logits) + #labels = debug_tfprint('label_ids', label_ids) + def metric_fn(_logits, _labels): - def get_update_op(_metric_fn, logits, labels): - update_op, _ = _metric_fn(logits, labels) + def get_update_op(_metric_fn, _logits, _labels): + update_op, _ = _metric_fn(_logits, _labels) return tf.constant(0.0), update_op return { - name: get_update_op(call, logits, labels) + name: get_update_op(call, _logits, _labels) for name, call in problem.all_metrics_fns.items() if name in problem.eval_metrics()} output_spec = tf.contrib.tpu.TPUEstimatorSpec( mode=mode, loss=total_loss, - eval_metrics=(metric_fn, [logits, labels]), + eval_metrics=(metric_fn, [logits, label_ids]), scaffold_fn=None) else: output_spec = tf.contrib.tpu.TPUEstimatorSpec( @@ -412,6 +416,7 @@ def main(_): config=run_config, train_batch_size=FLAGS.train_batch_size, eval_batch_size=FLAGS.eval_batch_size, + #eval_batch_size=1, predict_batch_size=FLAGS.predict_batch_size) train_input_fn = problem.make_estimator_input_fn( @@ -427,9 +432,8 @@ def main(_): # https://github.com/horovod/horovod/issues/182#issuecomment-401486859 # TODO: replace with ValidationMonitor and EarlyStoppingHook - #for i in range(10): - for i in [0]: - ''' + for i in range(10): + #for i in [0]: # TODO: we should use a check on model_dir to decide if we initialize_bert init_bert_hook = InitBertHook( initialize_bert=(i == 0), @@ -446,10 +450,10 @@ def main(_): hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? 
steps=eval_frequency_steps) - ''' if master_process: tf.logging.info("***** Running eval *****") result = estimator.evaluate(input_fn=eval_input_fn, steps=None) + #result = estimator.evaluate(input_fn=eval_input_fn, steps=1) tf.logging.info("***** Eval results *****") for key in sorted(result.keys()): tf.logging.info(" %s = %s", key, str(result[key])) From 3710f7731c177cae739972b1f030ee3fd6e6fd90 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 20 Jun 2019 00:47:46 +0000 Subject: [PATCH 66/77] check previous checkpoints --- .../LanguageModeling/BERT/finetune_BERT.py | 23 +++++++++++++------ 1 file changed, 16 insertions(+), 7 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 406d19d3a..fa647ea67 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -271,15 +271,11 @@ def model_fn(features, labels, mode, params): # pylint: disable=unused-argument elif mode == tf.estimator.ModeKeys.EVAL: #logits = debug_tfprint('logits', logits) - #labels = debug_tfprint('label_ids', label_ids) + #label_ids = debug_tfprint('label_ids', label_ids) def metric_fn(_logits, _labels): - def get_update_op(_metric_fn, _logits, _labels): - update_op, _ = _metric_fn(_logits, _labels) - return tf.constant(0.0), update_op - return { - name: get_update_op(call, _logits, _labels) + name: call(_logits, _labels) for name, call in problem.all_metrics_fns.items() if name in problem.eval_metrics()} @@ -434,9 +430,21 @@ def main(_): # TODO: replace with ValidationMonitor and EarlyStoppingHook for i in range(10): #for i in [0]: + from gcloud.gcs import fhfile + END_EXT = '.meta' + candidates = list(filter( + lambda path: path.startswith('model.ckpt'), + (os.path.basename(f) for f in fhfile.walk_path( + location=FLAGS.output_dir, + depth=1, + extension=END_EXT)))) + if candidates: + print('checkpoints exist', candidates) + print('do not initialize bert') + # TODO: we should use a check on model_dir to decide if we initialize_bert init_bert_hook = InitBertHook( - initialize_bert=(i == 0), + initialize_bert=not candidates, init_checkpoint=FLAGS.init_checkpoint, hvd=hvd) @@ -450,6 +458,7 @@ def main(_): hooks=training_hooks, # TODO: LR dependent on train steps, are we resetting this every time then? 
steps=eval_frequency_steps) + if master_process: tf.logging.info("***** Running eval *****") result = estimator.evaluate(input_fn=eval_input_fn, steps=None) From f63ca3d155132d13c499215209aa7343872af3ae Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Thu, 20 Jun 2019 01:37:36 +0000 Subject: [PATCH 67/77] sci --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index fa647ea67..e255fc59b 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -151,17 +151,17 @@ def after_run(self, run_context, run_values): img_per_sec = self.global_batch_size / dt if self.hvd_rank >= 0: if FLAGS.use_fp16: - print('%2d :: %6i %11.1f %6.3f %6.4e %6.4e' % + print('%2d :: %6i %11.1f %6.4e %6.4e %6.4e' % (self.hvd_rank, print_step, img_per_sec, total_loss, lr, loss_scaler)) else: - print('%2d :: %6i %11.1f %6.3f %6.4e' % + print('%2d :: %6i %11.1f %6.4f %6.4e' % (self.hvd_rank, print_step, img_per_sec, total_loss, lr)) else: if FLAGS.use_fp16: - print('%6i %11.1f %6.3f %6.4e %6.4e' % + print('%6i %11.1f %6.4f %6.4e %6.4e' % (print_step, img_per_sec, total_loss, lr, loss_scaler)) else: - print('%6i %11.1f %6.3f %6.4e' % + print('%6i %11.1f %6.4f %6.4e' % (print_step, img_per_sec, total_loss, lr)) self.elapsed_secs = 0. self.count = 0 From 95bfcc206bb2d3bb2e5f0f5813d8a81c2d5f879f Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Mon, 24 Jun 2019 21:12:43 +0000 Subject: [PATCH 68/77] prints --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index e255fc59b..f2346d07d 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -191,12 +191,12 @@ def begin(self): ) = modeling.get_assignment_map_from_checkpoint(tvars, self._init_checkpoint) tf.train.init_from_checkpoint(self._init_checkpoint, assignment_map) - tf.logging.info("**** Trainable Variables ****") + print("**** Trainable Variables ****") for var in tvars: init_string = "" if var.name in initialized_variable_names: init_string = ", *INIT_FROM_CKPT*" - tf.logging.info(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) + print(" %d name = %s, shape = %s%s", 0 if hvd is None else hvd.rank(), var.name, var.shape, init_string) def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, @@ -441,6 +441,8 @@ def main(_): if candidates: print('checkpoints exist', candidates) print('do not initialize bert') + else: + print('initialize bert') # TODO: we should use a check on model_dir to decide if we initialize_bert init_bert_hook = InitBertHook( From 2b7f19a6e6ea64144a89ad2ba9ae1c23c957f526 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Mon, 24 Jun 2019 21:22:20 +0000 Subject: [PATCH 69/77] scale LR --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index f2346d07d..edd1f4301 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -393,7 +393,7 @@ def main(_): model_fn = model_fn_builder( 
bert_config=bert_config, - learning_rate=learning_rate, + learning_rate=learning_rate if not FLAGS.horovod else FLAGS.learning_rate * hvd.size(), num_train_steps=num_train_steps, num_warmup_steps=num_warmup_steps, use_tpu=FLAGS.use_tpu, From b66741c3826842927a14b41cf8e5d783f61253de Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Mon, 24 Jun 2019 21:32:01 +0000 Subject: [PATCH 70/77] append init bert hook each time --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index edd1f4301..27ea93422 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -452,12 +452,13 @@ def main(_): if master_process: tf.logging.info("***** Running training *****") + # TODO: move init from checkpoint to a InitHook # should restore parts of the graph on the begin call but only # on first loop estimator.train( input_fn=train_input_fn, - hooks=training_hooks, + hooks=training_hooks + [init_bert_hook], # TODO: LR dependent on train steps, are we resetting this every time then? steps=eval_frequency_steps) From d008553e6510ece7f46266f2d08a5815c7168da2 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 28 Jun 2019 18:23:35 +0000 Subject: [PATCH 71/77] batching --- TensorFlow/LanguageModeling/BERT/modeling.py | 48 +++++++++++++++++++- 1 file changed, 47 insertions(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/modeling.py b/TensorFlow/LanguageModeling/BERT/modeling.py index 828a93872..f56b2bd7c 100644 --- a/TensorFlow/LanguageModeling/BERT/modeling.py +++ b/TensorFlow/LanguageModeling/BERT/modeling.py @@ -166,6 +166,9 @@ def __init__(self, batch_size = input_shape[0] seq_length = input_shape[1] + #from fathomtf.utils.tfutils import debug_tfprint + #input_ids = debug_tfprint('input ids before embedding', input_ids, tf.shape) + if input_mask is None: input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) @@ -197,12 +200,40 @@ def __init__(self, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob) + # start chunk + chunk_size = 64 + + # [B, T, D] + #self.embedding_output = debug_tfprint('embedding output', self.embedding_output, tf.shape) + depth = config.hidden_size + + batch_multiplier = seq_length // chunk_size + new_batch_size = batch_size * batch_multiplier + + # [B * T/chunk_size, chunk_size, D] + self.embedding_output = tf.reshape(self.embedding_output, [new_batch_size, chunk_size, depth]) + #self.embedding_output = debug_tfprint('transformed embedding output', self.embedding_output, tf.shape) + + # [B, T] + #input_mask = debug_tfprint('input mask before', input_mask, tf.shape) + #token_type_ids = debug_tfprint('token type ids before', token_type_ids, tf.shape) + # [B * T/chunk_size, chunk_size] + input_mask = tf.reshape(input_mask, [new_batch_size, chunk_size]) + token_type_ids = tf.reshape(token_type_ids, [new_batch_size, chunk_size]) + + #input_mask = debug_tfprint('input mask after', input_mask, tf.shape) + #token_type_ids = debug_tfprint('token type ids after', token_type_ids, tf.shape) + # end chunk + with tf.variable_scope("encoder"): # This converts a 2D mask of shape [batch_size, seq_length] to a 3D # mask of shape [batch_size, seq_length, seq_length] which is used # for the attention scores. 
+ # [B * T/chunk_size, chunk_size, D], [B * T/chunk_size, chunk_size] + # [B * T/chunk_size, chunk_size, chunk_size] attention_mask = create_attention_mask_from_input_mask( - input_ids, input_mask) + #input_ids, input_mask) + self.embedding_output, input_mask) # Run the stacked transformer. # `sequence_output` shape = [batch_size, seq_length, hidden_size]. @@ -220,6 +251,18 @@ def __init__(self, do_return_all_layers=True) self.sequence_output = tf.cast(self.all_encoder_layers[-1], tf.float32) + + # start chunk + # [B * T/chunk_size, chunk_size, D] + #self.sequence_output = debug_tfprint('sequence output', self.sequence_output, tf.shape) + # [B, T/chunk_size, chunk_size, D] + self.sequence_output = tf.reshape( + self.sequence_output, [batch_size, batch_multiplier, chunk_size, depth]) + # [B, T/chunk_size, D] + #self.sequence_output = self.sequence_output[:, :, 0, :] + #self.sequence_output = debug_tfprint('sequence output final', self.sequence_output, tf.shape) + # end chunk + # The "pooler" converts the encoded sequence tensor of shape # [batch_size, seq_length, hidden_size] to a tensor of shape # [batch_size, hidden_size]. This is necessary for segment-level @@ -542,6 +585,9 @@ def create_attention_mask_from_input_mask(from_tensor, to_mask): to_shape = get_shape_list(to_mask, expected_rank=2) to_seq_length = to_shape[1] + #from fathomtf.utils.tfutils import debug_tfprint + #to_mask = debug_tfprint('to_mask', to_mask, tf.shape) + to_mask = tf.cast( tf.reshape(to_mask, [batch_size, 1, to_seq_length]), tf.float32) From 2a4f869c5986084119ec2172ff1818832a7f1820 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 28 Jun 2019 18:44:17 +0000 Subject: [PATCH 72/77] typo --- TensorFlow/LanguageModeling/BERT/modeling.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/modeling.py b/TensorFlow/LanguageModeling/BERT/modeling.py index f56b2bd7c..15a10d707 100644 --- a/TensorFlow/LanguageModeling/BERT/modeling.py +++ b/TensorFlow/LanguageModeling/BERT/modeling.py @@ -259,7 +259,7 @@ def __init__(self, self.sequence_output = tf.reshape( self.sequence_output, [batch_size, batch_multiplier, chunk_size, depth]) # [B, T/chunk_size, D] - #self.sequence_output = self.sequence_output[:, :, 0, :] + self.sequence_output = self.sequence_output[:, :, 0, :] #self.sequence_output = debug_tfprint('sequence output final', self.sequence_output, tf.shape) # end chunk From 78822ce8c354033b9ec28bc45e635b38e79456e3 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 28 Jun 2019 19:03:04 +0000 Subject: [PATCH 73/77] clean --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 27ea93422..9a44aad8d 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -216,12 +216,6 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, # [B, 384, D] body_outputs = model.get_sequence_output() - #extended_batch_size = tf.shape(body_outputs)[0] - #chunk_size = tf.shape(body_outputs)[1] - #depth = tf.shape(body_outputs)[2] - #batch_size = extended_batch_size / chunk_size - - #body_outputs = tf.reshape(body_outputs, [batch_size, extended_batch_size, depth]) body_outputs = tf.expand_dims(body_outputs, axis=-2) top_out = target_modality.top(body_outputs, None) From da084d9e2e94e6a10818bfa4ea442bee08d96e38 Mon Sep 17 00:00:00 2001 From: 
rllin-fathom Date: Fri, 28 Jun 2019 21:53:54 +0000 Subject: [PATCH 74/77] chunk before embeddings --- TensorFlow/LanguageModeling/BERT/modeling.py | 44 ++++++++------------ 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/modeling.py b/TensorFlow/LanguageModeling/BERT/modeling.py index 15a10d707..d580b7e44 100644 --- a/TensorFlow/LanguageModeling/BERT/modeling.py +++ b/TensorFlow/LanguageModeling/BERT/modeling.py @@ -166,8 +166,23 @@ def __init__(self, batch_size = input_shape[0] seq_length = input_shape[1] - #from fathomtf.utils.tfutils import debug_tfprint - #input_ids = debug_tfprint('input ids before embedding', input_ids, tf.shape) + from fathomtf.utils.tfutils import debug_tfprint + # [B, T] + input_ids = debug_tfprint('input ids before embedding', input_ids, tf.shape) + + # start chunk + chunk_size = 64 + + depth = config.hidden_size + batch_multiplier = seq_length // chunk_size + new_batch_size = batch_size * batch_multiplier + + # [B * T/chunk_size, chunk_size] + input_ids = tf.reshape(input_ids, [new_batch_size, chunk_size]) + input_mask = tf.reshape(input_mask, [new_batch_size, chunk_size]) + token_type_ids = tf.reshape(token_type_ids, [new_batch_size, chunk_size]) + + # end chunk if input_mask is None: input_mask = tf.ones(shape=[batch_size, seq_length], dtype=tf.int32) @@ -200,31 +215,6 @@ def __init__(self, max_position_embeddings=config.max_position_embeddings, dropout_prob=config.hidden_dropout_prob) - # start chunk - chunk_size = 64 - - # [B, T, D] - #self.embedding_output = debug_tfprint('embedding output', self.embedding_output, tf.shape) - depth = config.hidden_size - - batch_multiplier = seq_length // chunk_size - new_batch_size = batch_size * batch_multiplier - - # [B * T/chunk_size, chunk_size, D] - self.embedding_output = tf.reshape(self.embedding_output, [new_batch_size, chunk_size, depth]) - #self.embedding_output = debug_tfprint('transformed embedding output', self.embedding_output, tf.shape) - - # [B, T] - #input_mask = debug_tfprint('input mask before', input_mask, tf.shape) - #token_type_ids = debug_tfprint('token type ids before', token_type_ids, tf.shape) - # [B * T/chunk_size, chunk_size] - input_mask = tf.reshape(input_mask, [new_batch_size, chunk_size]) - token_type_ids = tf.reshape(token_type_ids, [new_batch_size, chunk_size]) - - #input_mask = debug_tfprint('input mask after', input_mask, tf.shape) - #token_type_ids = debug_tfprint('token type ids after', token_type_ids, tf.shape) - # end chunk - with tf.variable_scope("encoder"): # This converts a 2D mask of shape [batch_size, seq_length] to a 3D # mask of shape [batch_size, seq_length, seq_length] which is used From 2fee2eaa892960d1fc6e95f1f9335b3bfffabbe4 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 28 Jun 2019 21:57:00 +0000 Subject: [PATCH 75/77] no print --- TensorFlow/LanguageModeling/BERT/modeling.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/modeling.py b/TensorFlow/LanguageModeling/BERT/modeling.py index d580b7e44..7345aaee6 100644 --- a/TensorFlow/LanguageModeling/BERT/modeling.py +++ b/TensorFlow/LanguageModeling/BERT/modeling.py @@ -166,9 +166,9 @@ def __init__(self, batch_size = input_shape[0] seq_length = input_shape[1] - from fathomtf.utils.tfutils import debug_tfprint + #from fathomtf.utils.tfutils import debug_tfprint # [B, T] - input_ids = debug_tfprint('input ids before embedding', input_ids, tf.shape) + #input_ids = debug_tfprint('input ids before embedding', 
input_ids, tf.shape) # start chunk chunk_size = 64 From de23b8adc2d36118cc9359965c101991347e1411 Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 28 Jun 2019 22:47:01 +0000 Subject: [PATCH 76/77] do not expand --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 9a44aad8d..4373a444a 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -214,9 +214,8 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, use_one_hot_embeddings=use_one_hot_embeddings, compute_type=tf.float32) - # [B, 384, D] - body_outputs = model.get_sequence_output() - body_outputs = tf.expand_dims(body_outputs, axis=-2) + # [B, T/chunk_size, D] + body_output = model.get_sequence_output() top_out = target_modality.top(body_outputs, None) From a4ae57c73c75e3c435521a9d8b2a2169ad531aff Mon Sep 17 00:00:00 2001 From: rllin-fathom Date: Fri, 28 Jun 2019 22:48:19 +0000 Subject: [PATCH 77/77] typo --- TensorFlow/LanguageModeling/BERT/finetune_BERT.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py index 4373a444a..8b2991016 100644 --- a/TensorFlow/LanguageModeling/BERT/finetune_BERT.py +++ b/TensorFlow/LanguageModeling/BERT/finetune_BERT.py @@ -217,7 +217,7 @@ def create_model(bert_config, is_training, input_ids, input_mask, segment_ids, # [B, T/chunk_size, D] body_output = model.get_sequence_output() - top_out = target_modality.top(body_outputs, None) + top_out = target_modality.top(body_output, None) num, den = target_modality.loss(top_out, labels) loss = num / den
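
Note on the chunking scheme introduced in PATCH 71 and reworked in PATCH 74-77: rather than running self-attention over the full sequence of length T, each example is split into T/chunk_size chunks that are folded into the batch dimension before the embedding lookup, and the encoder output is folded back afterwards, keeping only the first position of every chunk. The sketch below is a minimal illustration of that reshape bookkeeping only, using assumed toy sizes and numpy arrays in place of the TF tensors; it is not the encoder itself.

    import numpy as np

    # Toy sizes for illustration; the patches set chunk_size = 64 and
    # depth = config.hidden_size, with a 384-token sequence in finetune_BERT.py.
    batch_size, seq_length, chunk_size, depth = 2, 384, 64, 8
    batch_multiplier = seq_length // chunk_size      # T / chunk_size
    new_batch_size = batch_size * batch_multiplier   # B * T/chunk_size

    # [B, T] token ids -> [B * T/chunk_size, chunk_size] before the embedding
    # lookup, as in PATCH 74 (input_mask and token_type_ids are reshaped the same way).
    input_ids = np.zeros((batch_size, seq_length), dtype=np.int32)
    chunked_ids = input_ids.reshape(new_batch_size, chunk_size)

    # The encoder treats each chunk as an independent row and emits
    # [B * T/chunk_size, chunk_size, D]; a zero array stands in for that output here.
    sequence_output = np.zeros((new_batch_size, chunk_size, depth), dtype=np.float32)

    # Fold back to [B, T/chunk_size, chunk_size, D] and keep position 0 of each
    # chunk, as in PATCH 71/72, giving one vector per chunk: [B, T/chunk_size, D].
    folded = sequence_output.reshape(batch_size, batch_multiplier, chunk_size, depth)
    per_chunk = folded[:, :, 0, :]

    print(chunked_ids.shape, per_chunk.shape)        # (12, 64) (2, 6, 8)

Because the chunks live in the batch dimension, attention is computed only within each chunk_size-token window, and the downstream target_modality.top in finetune_BERT.py receives one vector per chunk rather than one per token, which lines up with PATCH 76 dropping the expand_dims on the encoder output.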