In [0]:
try:
  # %tensorflow_version only exists in Colab; the recorded output shows it
  # selecting the TensorFlow 2.x runtime.
  %tensorflow_version 2.x
except Exception:
  # Best effort: deliberately ignore any failure and continue with whatever
  # TensorFlow version is already installed.
  pass

TensorFlow 2.x selected.


**Firstly**, we need to set up the Colab TPU runtime environment, verify that a TPU device is successfully connected, and upload credentials to the TPU for GCS bucket usage.

In [0]:
import sys
username=''#@param {type:"string"}
password=''#@param {type:"string"}
!test -d MarkedBERT || git clone https://{username}:{password}@github.com/{username}/Marked_BERT.git
if not 'Marked_BERT' in sys.path:
  sys.path += ['Marked_BERT']

Cloning into 'MarkedBERT'...
remote: Enumerating objects: 186, done.[K
remote: Counting objects:   0% (1/186)[Kremote: Counting objects:   1% (2/186)[Kremote: Counting objects:   2% (4/186)[Kremote: Counting objects:   3% (6/186)[Kremote: Counting objects:   4% (8/186)[Kremote: Counting objects:   5% (10/186)[Kremote: Counting objects:   6% (12/186)[Kremote: Counting objects:   7% (14/186)[Kremote: Counting objects:   8% (15/186)[Kremote: Counting objects:   9% (17/186)[Kremote: Counting objects:  10% (19/186)[Kremote: Counting objects:  11% (21/186)[Kremote: Counting objects:  12% (23/186)[Kremote: Counting objects:  13% (25/186)[Kremote: Counting objects:  14% (27/186)[Kremote: Counting objects:  15% (28/186)[Kremote: Counting objects:  16% (30/186)[Kremote: Counting objects:  17% (32/186)[Kremote: Counting objects:  18% (34/186)[Kremote: Counting objects:  19% (36/186)[Kremote: Counting objects:  20% (38/186)[Kremote: Counting objects:  21%

In [0]:
# Install the Colab-specific dependencies listed in the cloned repository.
!pip install -r MarkedBERT/requirements_colab.txt

In [0]:
import tensorflow as tf
# Display the TensorFlow version that was actually loaded.
tf.__version__

'2.2.0-dev20200311'

In [0]:
import os
# Use .get() so that a missing variable triggers the explanatory assertion
# message below instead of a bare KeyError.
assert os.environ.get('COLAB_TPU_ADDR'), 'Make sure to select TPU from Edit > Notebook settings > Hardware accelerator'

# Data files need to be in a GCS bucket so that the TPU can access them
On the first run, the TPU has no access permissions to the bucket, so you need to change the permissions of your bucket manually to add the TPU (its name will be in the exception text).

In [0]:
import os
# Set before importing/calling the Colab auth helper.
# NOTE(review): presumably this makes the helper skip its own TPU credential
# propagation -- confirm against the google.colab.auth implementation.
os.environ['COLAB_SKIP_TPU_AUTH'] = '1'
from google.colab import auth
# Interactive step: authenticates the current Colab user.
auth.authenticate_user()

In [0]:
# Colab form fields: replace the placeholder text with real values.
project_id="GCS project id "#@param {type:"string"}
# NOTE(review): bucket_name is not referenced by any other cell visible in this notebook.
bucket_name="Bucket where data is stored"#@param {type:"string"}
# Point the gcloud CLI at the chosen project.
!gcloud config set project {project_id}

Updated property [core/project].


**Thirdly**, prepare for training:

* Specify training data.
* Specify BERT pretrained model
* Specify GS bucket, create output directory for model checkpoints

In [0]:
# Key into MODEL_CLASSES ("bert", "roberta" or "distilbert") and the
# pretrained weights/config name passed to from_pretrained().
MODEL_TYPE = "bert" #@param {type:"string"}
MODEL_NAME = "bert-base-uncased" #@param {type:"string"}

# GCS location used for the TF training checkpoints (under tf_ckpts) and for
# the prediction .tsv files written during evaluation.
OUTPUT_DIR = "gs://my_bucket/output_dir/" #@param {type:"string"}
# Only checks that the value is non-empty; the directory is created below.
assert OUTPUT_DIR, 'Must specify an existing GCS bucket name'
tf.io.gfile.makedirs(OUTPUT_DIR)
print('***** Model output directory: {} *****'.format(OUTPUT_DIR))

# Now we need to specify the input data dir. Should contain the .tfrecord files
# and the supporting query-docids mapping files.
DATA_DIR = "gs://my_bucket/train_data" #@param {type:"string"}
print('***** Data directory: {} *****'.format(DATA_DIR))

# NOTE(review): FILE_NAME is not referenced by the cells visible here (main()
# uses hard-coded dataset file names); SET_NAME is used by evaluate() to name
# the predictions file.
FILE_NAME= "dataset_trec_mu_pair.tf" #@param {type:"string"}
SET_NAME= "trec" #@param {type:"string"}

# Need to mount your drive and put the path to the directory containing the .h5 file.
# This directory is where main() saves the final model/tokenizer and where
# evaluation loads the model from.
CHKPT_PATH="/content/drive/My Drive/checkpoints/base" #@param {type:"string"}


***** Model output directory: gs://lila_data/output_dir/un_mark_pass *****
***** Data directory: gs://lila_data/train *****


**Now, we can start training/evaluating**

In [0]:
# coding=utf-8
import collections
import datetime
import glob
import math
import os
import re
import time

import numpy as np
import tensorflow as tf
from absl import app, flags, logging
from tqdm import trange
from tqdm.notebook import tqdm

from transformers import (
    TF2_WEIGHTS_NAME,
    BertConfig,
    BertTokenizer,
    DistilBertConfig,
    DistilBertTokenizer,
    RobertaConfig,
    RobertaTokenizer,
    TFBertForSequenceClassification,
    TFDistilBertForSequenceClassification,
    TFRobertaForSequenceClassification,
)
#local modules
from Modeling import get_dataset, create_optimizer

In [0]:
# Flat tuple of every pretrained model name known to the supported config classes.
ALL_MODELS = tuple(
    model_name
    for config_cls in (BertConfig, RobertaConfig, DistilBertConfig)
    for model_name in config_cls.pretrained_config_archive_map.keys()
)

# model_type -> (config class, TF sequence-classification class, tokenizer class)
MODEL_CLASSES = {
    "bert": (
        BertConfig,
        TFBertForSequenceClassification,
        BertTokenizer,
    ),
    "roberta": (
        RobertaConfig,
        TFRobertaForSequenceClassification,
        RobertaTokenizer,
    ),
    "distilbert": (
        DistilBertConfig,
        TFDistilBertForSequenceClassification,
        DistilBertTokenizer,
    ),
}

# Run arguments, kept as a plain dict that is passed to train()/evaluate()/main().
args = {
    # Locations and model selection (taken from the configuration cell).
    "data_dir": DATA_DIR,
    "model_type": MODEL_TYPE,
    "model_name_or_path": MODEL_NAME,
    "output_dir": OUTPUT_DIR,
    "transformer_checkpoints": CHKPT_PATH,
    "max_seq_length": 512,
    # Address of the Colab TPU worker.
    "tpu": f"grpc://{os.environ['COLAB_TPU_ADDR']}",
    # Which phases main() runs.
    "do_train": False,
    "do_eval": True,
    # Batch sizes are per device; main() multiplies them by the device count.
    "per_device_train_batch_size": 16,
    "per_device_eval_batch_size": 4,
    # Optimisation schedule.
    "max_steps": 100000,
    "warmup_steps": 10000,
    "learning_rate": 3e-6,
    "adam_epsilon": 1e-8,
    "logging_steps": 100,
    "seed": 42,
    "max_grad_norm": 1.0,
    "save_steps": 5000,
    "overwrite_output_dir": False,
    "fp16": False,
    # Device selection used by main() when no TPU address is given.
    "no_cuda": False,
    "gpus": None,
    "config_name": None,
    "cache_dir": None,
    "tokenizer_name": None,
    "num_tpu_cores": 8,
    "do_predict": False,
    "evaluate_during_training": False,
    "do_lower_case": False,
    "num_train_epochs": 1,
    "overwrite_cache": False,
    "eval_all_checkpoints": False,
    "msmarco_output": True,
    "num_eval_docs": 1000,
}

In [0]:

def train(
    args, strategy,  train_dataset, model, num_train_examples, train_batch_size
):
    """Fine-tune ``model`` on ``train_dataset`` under ``strategy``.

    Training resumes from the latest TF checkpoint found under
    ``{args["output_dir"]}/tf_ckpts`` (if any): the optimizer's iteration
    counter gives the number of already-trained steps, and that many batches
    are skipped before training continues.

    Args:
        args: dict of run options. Reads "max_steps", "num_train_epochs",
            "learning_rate", "warmup_steps", "fp16", "model_type",
            "per_device_train_batch_size", "logging_steps", "save_steps",
            "output_dir" and "transformer_checkpoints". When "max_steps" > 0,
            "num_train_epochs" is overwritten with 1.
        strategy: tf.distribute strategy the model was created under.
        train_dataset: distributed dataset yielding (features, labels) batches;
            features must provide "input_ids", "attention_mask" and (for BERT)
            "token_type_ids".
        model: the sequence-classification model to train. Its output is fed
            to a SparseCategoricalCrossentropy loss with default settings, so
            it is expected to emit probabilities (main() installs a softmax).
        num_train_examples: number of training examples (used for logging and
            to derive the step count when "max_steps" <= 0).
        train_batch_size: global batch size across all replicas.

    Returns:
        None. Checkpoints and saved models are written as side effects.
    """
    max_steps = args["max_steps"]
    if max_steps > 0:
        num_train_steps = max_steps
        args["num_train_epochs"] = 1 # only consider the case where max_steps < one_epoch_steps
    else:
        num_train_steps = (
            math.ceil(num_train_examples / train_batch_size)
            * args["num_train_epochs"]
        )

    with strategy.scope():
        # Reduction.NONE: the per-example losses are averaged manually over the
        # *global* batch size, as required for distributed training.
        loss_fct = tf.keras.losses.SparseCategoricalCrossentropy(reduction=tf.keras.losses.Reduction.NONE)
        optimizer = create_optimizer(args["learning_rate"], num_train_steps, args["warmup_steps"])

        if args["fp16"]:
            optimizer = tf.keras.mixed_precision.experimental.LossScaleOptimizer(optimizer, "dynamic")

        loss_metric = tf.keras.metrics.Mean(name="loss", dtype=tf.float32)

    logging.info("***** Running training *****")
    logging.info("  Num examples = %d", num_train_examples)
    logging.info("  Num Epochs = %d", args["num_train_epochs"])
    logging.info("  Instantaneous batch size per device = %d", args["per_device_train_batch_size"])
    logging.info(
        "  Total train batch size (w. parallel, distributed ) = %d",
        train_batch_size,
    )

    logging.info("  Total training steps = %d", num_train_steps)

    model.summary()

    @tf.function
    def train_step(train_features, train_labels):
        """Run one optimisation step on every replica; return the mean batch loss."""

        def step_fn(train_features, train_labels):
            inputs = {"attention_mask": train_features["attention_mask"], "training": True}

            if args["model_type"] != "distilbert":
                inputs["token_type_ids"] = (
                    train_features["token_type_ids"] if args["model_type"] in ["bert", "xlnet"] else None
                )

            with tf.GradientTape() as tape:
                logits = model(train_features["input_ids"], **inputs)[0]
                cross_entropy = loss_fct(train_labels, logits) #per_example_losses
                loss = tf.reduce_sum(cross_entropy) * (1.0 / train_batch_size) #per_replica_loss
                if args['fp16']:
                    scaled_loss = optimizer.get_scaled_loss(loss)

            if args['fp16']:
                scaled_grads = tape.gradient(scaled_loss, model.trainable_variables)
                grads = optimizer.get_unscaled_gradients(scaled_grads)
            else:
                grads = tape.gradient(loss, model.trainable_variables)
            optimizer.apply_gradients(zip(grads, model.trainable_variables))

            return cross_entropy

        per_example_losses = strategy.experimental_run_v2(step_fn, args=(train_features, train_labels))
        sum_loss = strategy.reduce(tf.distribute.ReduceOp.SUM, per_example_losses, axis=0)

        return sum_loss / train_batch_size #loss over replicas

    current_time = datetime.datetime.now()

    # Train or resume training from chkpt
    current_step = 0

    with strategy.scope():
        # Check if continuing training from a checkpoint
        checkpoint = tf.train.Checkpoint(step=tf.Variable(0), model=model, optimizer=optimizer)
        manager = tf.train.CheckpointManager(checkpoint, f'{args["output_dir"]}/tf_ckpts', max_to_keep=3)
        latest_checkpoint_file = manager.latest_checkpoint
        if latest_checkpoint_file:
            logging.info(
                'Checkpoint file %s found and restoring from '
                'checkpoint', latest_checkpoint_file)
            checkpoint.restore(latest_checkpoint_file)
            logging.info('Loading from checkpoint file completed')
    if latest_checkpoint_file:
        # Plain int so that later arithmetic/logging does not carry numpy scalars.
        current_step = int(optimizer.iterations.numpy())
        logging.info(
              'Resume training from step %s', current_step )
    print("\ncurrent step = ", current_step)

    # global_step counts optimisation steps across epochs (and across resumed
    # runs); current_step is only the number of batches still to be skipped.
    global_step = current_step

    def reached_max_steps():
        """True once the configured step budget has been used up."""
        return max_steps > 0 and global_step >= max_steps

    if reached_max_steps():
        # A restored checkpoint already covers the whole budget: do not skip
        # through the dataset just to train extra steps beyond max_steps.
        logging.info(
            "Checkpoint is already at step %s (max_steps = %s); nothing left to train",
            global_step, max_steps)
        return

    epoch_iterator = trange(0, int(args["num_train_epochs"]), desc="Epoch")

    for epoch in epoch_iterator:
        train_iterator = tqdm(train_dataset, total=num_train_steps, desc="Iteration")

        with strategy.scope():
            for train_features, train_labels in train_iterator:
                # Skip past any already trained steps if resuming training
                if current_step > 0:
                    current_step -= 1
                    continue

                loss = train_step(train_features, train_labels)

                loss_metric(loss)

                global_step += 1
                checkpoint.step.assign_add(1)

                if args["logging_steps"] > 0 and global_step % args["logging_steps"] == 0:
                    # Log metrics. The rate is looked up at the index of the
                    # step just applied; using the global step (rather than the
                    # per-epoch batch index) keeps it right across epochs.
                    lr = optimizer.learning_rate
                    learning_rate = lr(global_step - 1)

                    logging_loss = loss_metric.result()

                    train_iterator.set_postfix(loss=logging_loss.numpy(), step=global_step, lr=learning_rate.numpy())

                if args["save_steps"] > 0 and global_step % args["save_steps"] == 0:
                    ## TF2 checkpoint for training
                    save_path = manager.save()
                    logging.info("Saved checkpoint for step %s: %s",global_step,save_path)

                if args["save_steps"] > 0 and global_step % (args["save_steps"]*10) == 0:
                    # Save model checkpoint (transformers format) every 10th save interval
                    output_dir = os.path.join(args["transformer_checkpoints"], f"checkpoint-{global_step}")
                    os.makedirs(output_dir, exist_ok=True)

                    model.save_pretrained(output_dir)

                    logging.info("Saving model checkpoint to %s", output_dir)

                # ">=" so that exactly max_steps steps are trained (">" ran one extra).
                if reached_max_steps():
                    train_iterator.close()
                    break
        if reached_max_steps():
            epoch_iterator.close()
            break

        epoch_iterator.write(f"loss epoch {epoch + 1}: {loss_metric.result()}")

        loss_metric.reset_states()

    logging.info("  Training took time = {}".format(datetime.datetime.now() - current_time))


In [0]:
def evaluate(args, strategy, eval_dataset, trained_model, num_eval_examples,
            eval_batch_size, global_step):
    """Score ``eval_dataset`` with ``trained_model`` and write the predictions to a TSV file.

    One line per example is written to
    ``{args['output_dir']}/predictions_{SET_NAME}_{global_step}.tsv`` with the
    tab-separated fields: query id, document id, model output for class 1,
    gold label. ``SET_NAME`` is the module-level constant from the
    configuration cell.

    Args:
        args: dict of run options. Reads "model_type",
            "per_device_eval_batch_size" and "output_dir".
        strategy: tf.distribute strategy the model was created under.
        eval_dataset: distributed dataset yielding (features, labels) batches;
            features must provide "input_ids", "attention_mask", "q_id",
            "d_id" and (for BERT) "token_type_ids".
        trained_model: the sequence-classification model to evaluate.
        num_eval_examples: number of evaluation examples (progress/logging only).
        eval_batch_size: global batch size across all replicas.
        global_step: label used in the output file name.

    Returns:
        None. The predictions file is written as a side effect.
    """

    @tf.function
    def eval_step(features, labels):
      """Computes predictions on distributed devices."""

      def _eval_step_fn(eval_features, labels):
        """Replicated predictions."""
        inputs = {"attention_mask": eval_features["attention_mask"], "training": False}

        if args["model_type"] != "distilbert":
            inputs["token_type_ids"] = (
                eval_features["token_type_ids"] if args["model_type"] in ["bert", "xlnet"] else None
            )

        logits = trained_model(eval_features["input_ids"], **inputs)[0]

        return logits, labels, eval_features['q_id'], eval_features['d_id']

      preds, labels, q_id, d_id = strategy.experimental_run_v2(
          _eval_step_fn, args=(features, labels))
      # outputs: current batch logits as a tuple of shard logits
      preds = tf.nest.map_structure(strategy.experimental_local_results,
                                      preds)
      labels = tf.nest.map_structure(strategy.experimental_local_results, labels)
      q_id = tf.nest.map_structure(strategy.experimental_local_results,
                                      q_id)
      d_id = tf.nest.map_structure(strategy.experimental_local_results,
                                      d_id)
      return preds, labels, q_id, d_id


    #start eval
    num_eval_steps = math.ceil(num_eval_examples / eval_batch_size)
    logging.info("***** Running evaluation *****")
    logging.info("  Num examples = %d", num_eval_examples)
    logging.info("  Instantaneous batch size per device = %d",
                 args["per_device_eval_batch_size"])
    logging.info(
        "  Total eval batch size (w. parallel, distributed ) = %d",
        eval_batch_size,
    )

    logging.info("  Total evaluation steps = %d", num_eval_steps)
    eval_iterator = tqdm(eval_dataset, total=num_eval_steps,
                                   desc="Iteration")

    predictions_path = f"{args['output_dir']}/predictions_{SET_NAME}_{global_step}.tsv"
    # The context manager closes the file even if a step raises.
    with tf.io.gfile.GFile(predictions_path, 'w') as msmarco_file:
        for eval_features, eval_labels in eval_iterator:
            with strategy.scope():
                outputs, labels, q_ids, d_ids = eval_step(eval_features, eval_labels)

            # Iterate over the per-replica results actually returned instead of
            # range(args['n_device']): this keeps the replica order but cannot
            # go out of range or silently skip everything when the configured
            # device count differs from the strategy's replica count.
            for replica_outputs, replica_labels, replica_qids, replica_dids in zip(
                    outputs, labels, q_ids, d_ids):
                scores = replica_outputs.numpy()[:, 1]  # model output for class 1
                rows = zip(replica_qids.numpy(), replica_dids.numpy(),
                           scores, replica_labels.numpy())
                for qid, did, pred, label in rows:
                    msmarco_file.write("\t".join((str(qid), str(did), str(pred), str(label))) + "\n")

In [0]:
def main(args):
    """Set up the distribution strategy, then train and/or evaluate as configured.

    Device selection, in order of precedence: TPU (``args["tpu"]``), CPU only
    (``args["no_cuda"]``), an explicit comma-separated GPU list
    (``args["gpus"]``), otherwise all GPUs TensorFlow can see. The number of
    devices is stored in ``args["n_device"]``.

    When ``args["do_train"]`` is set, the pretrained model is fine-tuned via
    train() and saved, together with its tokenizer, to
    ``args["transformer_checkpoints"]``. When ``args["do_eval"]`` is set, the
    model(s) under that directory are loaded and passed to evaluate().

    Args:
        args: dict of run options (see the arguments cell). Mutated in place:
            "n_device" is added.

    Returns:
        None.
    """
    logging.set_verbosity(logging.INFO)

    if args["fp16"]:
        tf.config.optimizer.set_experimental_options({"auto_mixed_precision": True})

    if args["tpu"]:
        print(args['tpu'])
        resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu=args["tpu"])
        tf.config.experimental_connect_to_cluster(resolver)
        tf.tpu.experimental.initialize_tpu_system(resolver)
        strategy = tf.distribute.experimental.TPUStrategy(resolver)
        args["n_device"] = args["num_tpu_cores"]

    elif args["no_cuda"]:
        args["n_device"] = 1
        strategy = tf.distribute.OneDeviceStrategy(device="/cpu:0")

    elif args["gpus"]:
        gpu_ids = args["gpus"].split(",")
        args["n_device"] = len(gpu_ids)
        if len(gpu_ids) > 1:
            strategy = tf.distribute.MirroredStrategy(devices=[f"/gpu:{gpu}" for gpu in gpu_ids])
        else:
            strategy = tf.distribute.OneDeviceStrategy(device="/gpu:" + gpu_ids[0])

    else:
        # The original called get_available_gpus(), which is neither defined
        # nor imported in the visible notebook (NameError); ask TensorFlow
        # for the visible GPUs directly instead.
        devices = [device.name for device in tf.config.list_logical_devices("GPU")]
        logging.info("\ndevices= %s \n", devices)
        # With no GPU, pass None so MirroredStrategy picks its default device.
        strategy = tf.distribute.MirroredStrategy(devices=devices or None)
        args["n_device"] = strategy.num_replicas_in_sync

    logging.warning(
        "n_device: %s, distributed training: %s, 16-bits training: %s",
        args["n_device"],
        bool(args["n_device"] > 1),
        args["fp16"],
    )

    logging.info("\nStrategy = %s\n",strategy)

    num_labels = 2
    tf.random.set_seed(args["seed"])
    config_class, model_class, tokenizer_class = MODEL_CLASSES[args["model_type"]]
    config = config_class.from_pretrained(
        args["config_name"] if args["config_name"] else args["model_name_or_path"],
        num_labels=num_labels,
        cache_dir=args["cache_dir"] if args["cache_dir"] else None,
    )

    logging.info("Training/evaluation parameters %s", args)

    # Training
    if args["do_train"]:
        tokenizer = tokenizer_class.from_pretrained(
            args["tokenizer_name"] if args["tokenizer_name"] else args["model_name_or_path"],
            do_lower_case=args["do_lower_case"],
            cache_dir=args["cache_dir"] if args["cache_dir"] else None,
        )

        with strategy.scope():
            model = model_class.from_pretrained(
                args["model_name_or_path"],
                from_pt=bool(".bin" in args["model_name_or_path"]),
                config=config,
                cache_dir=args["cache_dir"] if args["cache_dir"] else None,
            )
            # Make the model emit probabilities; train() uses a loss that
            # expects probabilities rather than logits.
            # NOTE(review): this assumes the last layer is the Dense classifier
            # (true for the BERT summary in the recorded output) -- confirm for
            # the other model types.
            model.layers[-1].activation = tf.keras.activations.softmax

        train_batch_size = args["per_device_train_batch_size"] * max(1,args["n_device"])

        # NOTE(review): the training file name is hard-coded rather than taken
        # from the configuration cell.
        filename = tf.io.gfile.glob(f"{args['data_dir']}/dataset_train_mu_mark_pair.tf")
        train_dataset, num_train_examples = get_dataset( filename,
                train_batch_size, args["max_seq_length"], is_training_set=True
        )
        train_dataset = strategy.experimental_distribute_dataset(train_dataset)
        train(
            args,
            strategy,
            train_dataset,
            model,
            num_train_examples,
            train_batch_size,
        )

        os.makedirs(args["transformer_checkpoints"], exist_ok=True)

        logging.info("Saving model to %s", args["transformer_checkpoints"])

        model.save_pretrained(args["transformer_checkpoints"])
        tokenizer.save_pretrained(args["transformer_checkpoints"])

    # Evaluating
    if args["do_eval"]:
        checkpoints = []

        if args["eval_all_checkpoints"]:
            # Every directory below the checkpoint root that holds TF2 weights,
            # ordered by the digits found in its path.
            checkpoints = list(
                os.path.dirname(c)
                for c in sorted(
                    glob.glob(args["transformer_checkpoints"] + "/**/" + TF2_WEIGHTS_NAME, recursive=True),
                    key=lambda f: int("".join(filter(str.isdigit, f)) or -1),
                )
            )

        logging.info("Evaluate the following checkpoints: %s", checkpoints)

        if len(checkpoints) == 0:
            # last checkpoint
            checkpoints.append(args["transformer_checkpoints"])

        eval_batch_size = args["per_device_eval_batch_size"] * max(1,args["n_device"])

        filename = tf.io.gfile.glob(f"{args['data_dir']}/dataset_dev_ids.tf")
        # NOTE(review): the skip count and the example count below are
        # hard-coded so that only 32 examples are evaluated; the count returned
        # by get_dataset is overwritten. Looks like a debugging leftover --
        # confirm before a full evaluation run.
        eval_dataset, num_eval_examples = get_dataset(filename, eval_batch_size, args["max_seq_length"], num_skip = 6980000-32)
        num_eval_examples = 32
        print('num_eval_examples= ', num_eval_examples)
        eval_dataset = strategy.experimental_distribute_dataset(eval_dataset)

        for checkpoint in checkpoints:
            global_step = checkpoint.split("-")[-1] if re.match(".*checkpoint-[0-9]", checkpoint) else "final"
            print("global step= ", global_step)

            with strategy.scope():
                trained_model = model_class.from_pretrained(checkpoint)
                # Same output head as during training: the softmax activation is
                # set at runtime and is not restored by from_pretrained(), so
                # without this the reported scores would be raw logits.
                trained_model.layers[-1].activation = tf.keras.activations.softmax
                trained_model.summary()

            evaluate(
                args,
                strategy,
                eval_dataset,
                trained_model,
                num_eval_examples,
                eval_batch_size,
                global_step,
            )

In [0]:
# Run the pipeline with the options assembled in `args`
# (the do_train / do_eval flags select which phases execute).
main(args)

INFO:absl:Entering into master device scope: /job:worker/replica:0/task:0/device:CPU:0


grpc://10.16.246.122:8470
INFO:tensorflow:Initializing the TPU system: grpc://10.16.246.122:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.16.246.122:8470


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)
INFO:absl:
Strategy = <tensorflow.python.distribute.tpu_strategy.TPUStrategy object at 0x7f457bfbf6a0>



HBox(children=(IntProgress(value=0, description='Downloading', max=361, style=ProgressStyle(description_width=…

INFO:absl:Training/evaluation parameters {'data_dir': 'gs://lila_data/train', 'model_type': 'bert', 'model_name_or_path': 'bert-base-uncased', 'output_dir': 'gs://lila_data/output_dir/un_mark_pass', 'transformer_checkpoints': '/content/drive/My Drive/checkpoints/un_mark_pass', 'max_seq_length': 512, 'tpu': 'grpc://10.16.246.122:8470', 'do_train': True, 'do_eval': False, 'per_device_train_batch_size': 16, 'per_device_eval_batch_size': 4, 'max_steps': 100000, 'warmup_steps': 10000, 'learning_rate': 3e-06, 'adam_epsilon': 1e-08, 'logging_steps': 100, 'seed': 42, 'max_grad_norm': 1.0, 'save_steps': 5000, 'overwrite_output_dir': False, 'fp16': False, 'no_cuda': False, 'gpus': None, 'config_name': None, 'cache_dir': None, 'tokenizer_name': None, 'num_tpu_cores': 8, 'do_predict': False, 'evaluate_during_training': False, 'do_lower_case': False, 'num_train_epochs': 1, 'overwrite_cache': False, 'eval_all_checkpoints': False, 'msmarco_output': True, 'num_eval_docs': 1000, 'n_device': 8}





HBox(children=(IntProgress(value=0, description='Downloading', max=231508, style=ProgressStyle(description_wid…




HBox(children=(IntProgress(value=0, description='Downloading', max=536063208, style=ProgressStyle(description_…




INFO:absl:***** Running training *****
INFO:absl:  Num examples = 79561622
INFO:absl:  Num Epochs = 1
INFO:absl:  Instantaneous batch size per device = 16
INFO:absl:  Total train batch size (w. parallel, distributed ) = 128
INFO:absl:  Total training steps = 100000


Model: "tf_bert_for_sequence_classification"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
bert (TFBertMainLayer)       multiple                  109482240 
_________________________________________________________________
dropout_37 (Dropout)         multiple                  0         
_________________________________________________________________
classifier (Dense)           multiple                  1538      
Total params: 109,483,778
Trainable params: 109,483,778
Non-trainable params: 0
_________________________________________________________________


INFO:absl:Checkpoint file gs://lila_data/output_dir/un_mark_pass/tf_ckpts/ckpt-13 found and restoring from checkpoint
INFO:absl:Loading from checkpoint file completed
INFO:absl:Resume training from step 65000
Epoch:   0%|          | 0/1 [00:00<?, ?it/s]


current step =  65000


HBox(children=(IntProgress(value=0, description='Iteration', max=100000, style=ProgressStyle(description_width…

INFO:absl:Saved checkpoint for step 70000: gs://lila_data/output_dir/un_mark_pass/tf_ckpts/ckpt-14
INFO:absl:Saved checkpoint for step 75000: gs://lila_data/output_dir/un_mark_pass/tf_ckpts/ckpt-15
INFO:absl:Saved checkpoint for step 80000: gs://lila_data/output_dir/un_mark_pass/tf_ckpts/ckpt-16
INFO:absl:Saved checkpoint for step 85000: gs://lila_data/output_dir/un_mark_pass/tf_ckpts/ckpt-17
INFO:absl:Saved checkpoint for step 90000: gs://lila_data/output_dir/un_mark_pass/tf_ckpts/ckpt-18
INFO:absl:Saved checkpoint for step 95000: gs://lila_data/output_dir/un_mark_pass/tf_ckpts/ckpt-19
INFO:absl:Saved checkpoint for step 100000: gs://lila_data/output_dir/un_mark_pass/tf_ckpts/ckpt-20
INFO:absl:Saving model checkpoint to /content/drive/My Drive/checkpoints/un_mark_pass/checkpoint-100000
Epoch:   0%|          | 0/1 [6:18:45<?, ?it/s]
INFO:absl:  Training took time = 6:19:03.575480





INFO:absl:Saving model to /content/drive/My Drive/checkpoints/un_mark_pass
