# Set Up

<h3><a href="https://cloud.google.com/tpu/"><img valign="middle" src="https://raw.githubusercontent.com/GoogleCloudPlatform/tensorflow-without-a-phd/master/tensorflow-rl-pong/images/tpu-hexagon.png" width="50"></a>  &nbsp;&nbsp;Train on TPU</h3>




   1. Create a Cloud Storage bucket for your data and model checkpoints at http://console.cloud.google.com/storage, and fill in the `BASE_DIR` parameter in the following form. There is a [free tier](https://cloud.google.com/free/) if you do not yet have an account.
 
   1. On the main menu, click Runtime and select **Change runtime type**. Set "TPU" as the hardware accelerator.
   1. Run the following cell and follow instructions to:
    *  Set up a Colab TPU running environment
    *   Verify that you are connected to a TPU device
    *   Upload your credentials to TPU to access your GCS bucket


In [None]:
print("Installing dependencies...")
%tensorflow_version 2.x
!pip install -q t5

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5
import t5.models
import seqio

# Root of the Cloud Storage bucket; set through the Colab form field.
BASE_DIR = "gs://t5_bucket_project" #@param { type: "string" }
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
# Dataset cache and model checkpoints live side by side under the bucket root.
DATA_DIR = os.path.join(BASE_DIR, "data")
MODELS_DIR = os.path.join(BASE_DIR, "models")
ON_CLOUD = True


if ON_CLOUD:
  print("Setting up GCS access...")
  import tensorflow_gcs_config
  from google.colab import auth
  # Set credentials for GCS reading/writing from Colab and TPU.
  TPU_TOPOLOGY = "v2-8"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise an ordinary Exception subclass (BaseException would slip past
    # `except Exception` handlers) and keep the resolver error as the cause.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  auth.authenticate_user()
  tf.enable_eager_execution()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# The rest of the notebook runs with TF2 behavior switched off.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Keep TensorFlow's records out of the root logger's handlers, and let the
  # root logger itself emit INFO and above.
  tf_logger = tf.get_logger()
  tf_logger.propagate = False
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily set the TensorFlow logging verbosity to `level`.

  Args:
    level: verbosity value passed to `tf.logging.set_verbosity`.

  Yields:
    None. The verbosity that was active on entry is restored on exit, even
    when the body of the `with` block raises.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Without the finally clause an exception inside the `with` body would
    # leave the temporary verbosity in place for the rest of the session.
    tf.logging.set_verbosity(og_level)

Installing dependencies...
[K     |████████████████████████████████| 153 kB 5.3 MB/s 
[K     |████████████████████████████████| 1.2 MB 50.6 MB/s 
[K     |████████████████████████████████| 366 kB 42.1 MB/s 
[K     |████████████████████████████████| 90 kB 6.9 MB/s 
[K     |████████████████████████████████| 4.4 MB 33.6 MB/s 
[K     |████████████████████████████████| 3.1 MB 42.0 MB/s 
[K     |████████████████████████████████| 4.0 MB 35.8 MB/s 
[K     |████████████████████████████████| 286 kB 40.3 MB/s 
[K     |████████████████████████████████| 3.3 MB 44.9 MB/s 
[K     |████████████████████████████████| 596 kB 71.5 MB/s 
[K     |████████████████████████████████| 56 kB 3.7 MB/s 
[K     |████████████████████████████████| 895 kB 55.0 MB/s 
[?25hSetting up GCS access...
Running on TPU: grpc://10.43.244.178:8470
Instructions for updating:
non-resource variables are not supported in the long term


Instructions for updating:
non-resource variables are not supported in the long term


In [None]:
# Download data locally for preprocessing to avoid using GCS space.
rte_prepare_kwargs = {"download_dir": "./super_glue"}
ds = tfds.load(
    "super_glue/rte",
    data_dir=DATA_DIR,
    download_and_prepare_kwargs=rte_prepare_kwargs)

print("A few raw validation examples...")
raw_validation_preview = ds["validation"].take(2)
for ex in tfds.as_numpy(raw_validation_preview):
  print(ex)

INFO:absl:Load dataset info from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (gs://t5_bucket_project/data/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split None, from gs://t5_bucket_project/data/super_glue/rte/1.0.2


A few raw validation examples...
{'hypothesis': b'Bruce Springsteen is a singer.', 'idx': 63, 'label': 0, 'premise': b'Bruce Springsteen, with one arm outstretched, is singing in the spotlight in a dark concert hall.'}
{'hypothesis': b"The barber cuts a man's hair.", 'idx': 82, 'label': 1, 'premise': b'A family walking with a soldier.'}


In [None]:
def get_config(name):
  """Return the SuperGLUE builder config whose `name` attribute equals `name`.

  Args:
    name: builder config name to look up, e.g. 'rte'.

  Returns:
    The matching entry of `SuperGlue.builder_configs`.

  Raises:
    ValueError: if no builder config has that name. Failing here gives a
      clearer error than passing `None` on to the task registration.
  """
  configs = list(tfds.text.super_glue.SuperGlue.builder_configs.values())
  match = next((b for b in configs if b.name == name), None)
  if match is None:
    known = sorted(b.name for b in configs)
    raise ValueError(
        "Unknown SuperGLUE config %r; available configs: %s" % (name, known))
  return match

# Builder config for the RTE sub-task; the task-registration cell below uses it
# for the text preprocessor, the metric lookup and the postprocess function.
config = get_config('rte')

In [None]:
import seqio
import t5.data
from t5.data import postprocessors
from t5.data import preprocessors
from t5.data.glue_utils import get_glue_metric
from t5.data.glue_utils import get_glue_postprocess_fn
from t5.data.glue_utils import get_glue_text_preprocessor
from t5.data.glue_utils import get_super_glue_metric
from t5.evaluation import metrics
import tensorflow_datasets as tfds

# Both features use the default T5 vocabulary and get an EOS token appended;
# "inputs" is marked as not required.
DEFAULT_OUTPUT_FEATURES = {
    "inputs": seqio.Feature(
        vocabulary=t5.data.get_default_vocabulary(), add_eos=True,
        required=False),
    "targets": seqio.Feature(
        vocabulary=t5.data.get_default_vocabulary(), add_eos=True)
}

# seqio rejects a second registration under an existing name, so drop any
# previous definition first; this keeps the cell safe to re-run.
if "all_mix5" in seqio.TaskRegistry.names():
  seqio.TaskRegistry.remove("all_mix5")

# Register SuperGLUE RTE (read from DATA_DIR) as the task "all_mix5".
seqio.TaskRegistry.add(
    "all_mix5",
    source=seqio.TfdsDataSource(
        tfds_name="super_glue/rte:1.0.2",
        tfds_data_dir=DATA_DIR),
    preprocessors=[
        get_glue_text_preprocessor(config),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
    metric_fns=get_glue_metric(config.name),
    output_features=DEFAULT_OUTPUT_FEATURES,
    postprocess_fn=get_glue_postprocess_fn(config))

<seqio.dataset_providers.Task at 0x7efde23db390>

In [None]:
nq_task = seqio.TaskRegistry.get("all_mix5")
# Sequence lengths used for this preview of the preprocessed task.
preview_lengths = {"inputs": 128, "targets": 32}
ds = nq_task.get_dataset(sequence_length=preview_lengths, split="validation")

print("A few preprocessed validation examples...")
preprocessed_preview = ds.take(5)
for ex in tfds.as_numpy(preprocessed_preview):
  print(ex)

INFO:absl:Load dataset info from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (gs://t5_bucket_project/data/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'all_mix5:validation'


A few preprocessed validation examples...
{'idx': 28, 'inputs_pretokenized': b"rte hypothesis: A man repairs bicycles. premise: A man looking over a bicycle's rear wheel in the maintenance garage with various tools visible in the background.", 'inputs': array([    3,    52,    17,    15, 22455,    10,    71,   388,  7384,
       12679,     7,     5,     3, 17398,    10,    71,   388,   479,
         147,     3,     9, 12679,    31,     7,  4091,  5094,    16,
           8,  2453,  3543,    28,   796,  1339,  5183,    16,     8,
        2458,     5,     1], dtype=int32), 'targets_pretokenized': b'entailment', 'targets': array([   3,   35, 5756,  297,    1], dtype=int32)}
{'idx': 65, 'inputs_pretokenized': b'rte hypothesis: Boys are playing chess. premise: The boys are playing with Legos.', 'inputs': array([    3,    52,    17,    15, 22455,    10, 16575,    33,  1556,
           3,  2951,     7,     5,     3, 17398,    10,    37,  5234,
          33,  1556,    28, 23249,     7,     5,  

## Define Model

In [None]:
MODEL_SIZE = "base" #@param["small", "base", "large", "3B", "11B"]
# Public GCS path for T5 pre-trained model checkpoints
BASE_PRETRAINED_DIR = "gs://t5-data/pretrained_models"
# Pre-trained checkpoints are read from the public bucket; fine-tuned ones are
# written under this project's MODELS_DIR.
PRETRAINED_DIR = os.path.join(BASE_PRETRAINED_DIR, MODEL_SIZE)
MODEL_DIR = os.path.join(MODELS_DIR, MODEL_SIZE)

# Size-specific guard rails only apply when running on the Colab/GCS setup.
if ON_CLOUD:
  if MODEL_SIZE == "3B":
    tf.logging.warning(
        "The `3B` model is too large to use with the 5GB GCS free tier. "
        "Make sure you have at least 25GB on GCS before continuing."
    )
  elif MODEL_SIZE == "11B":
    raise ValueError(
        "The `11B` parameter is too large to fine-tune on the `v2-8` TPU "
        "provided by Colab. Please comment out this Error if you're running "
        "on a larger TPU."
    )

# Per-size (model_parallelism, train_batch_size, keep_checkpoint_max).
# Parallelism and batch size are chosen to fit on a v2-8 TPU (if possible);
# the checkpoint cap is chosen to fit within 5GB (if possible).
size_presets = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1),
}
model_parallelism, train_batch_size, keep_checkpoint_max = size_presets[MODEL_SIZE]

tf.io.gfile.makedirs(MODEL_DIR)

# The checkpoint cap is only enforced for the cloud setup.
checkpoint_cap = keep_checkpoint_max if ON_CLOUD else None
# The models from our paper are based on the Mesh Tensorflow Transformer.
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length=dict(inputs=128, targets=32),
    learning_rate_schedule=0.003,
    save_checkpoints_steps=5000,
    keep_checkpoint_max=checkpoint_cap,
    iterations_per_loop=100,
)

Before we continue, let's load a [TensorBoard](https://www.tensorflow.org/tensorboard) visualizer so that we can monitor our progress. The page should automatically update as fine-tuning and evaluation proceed.

In [None]:
# if ON_CLOUD:
#   %reload_ext tensorboard
# %tensorboard --logdir="$MODEL_DIR" --port=0

## Fine-tune

We are now ready to fine-tune our model. This will take a while (~2 hours with default settings), so please be patient! The larger the model and more `FINETUNE_STEPS` you use, the longer it will take.

Don't worry, you can always come back later and increase the number of steps, and it will automatically pick up where you left off.

In [None]:
FINETUNE_STEPS = 2500 #@param {type: "integer"}

# Fine-tune from the public pre-trained checkpoint for FINETUNE_STEPS steps.
model.finetune(
    # Must be the name registered with seqio.TaskRegistry in this notebook
    # ("all_mix5"); "all_mix" is never registered, so the lookup would fail
    # on a fresh kernel.
    mixture_or_task_name="all_mix5",
    pretrained_model_dir=PRETRAINED_DIR,
    finetune_steps=FINETUNE_STEPS
)

INFO:root:system_path_file_exists:gs://t5-data/pretrained_models/base/operative_config.gin
ERROR:root:Path not found: gs://t5-data/pretrained_models/base/operative_config.gin
INFO:root:Skipping import of unknown module `t5.data.sentencepiece_vocabulary` (skip_unknown=True).


INFO:tensorflow:Using config: {'_model_dir': 'gs://t5_bucket_project/models/base', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.43.244.178:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.43.244.178:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.43.244.178:8470', '_evaluation_master': 'grpc://10.43.244.178:8470', '_i

## Evaluate

We now evaluate on the validation sets of the tasks in our mixture. Accuracy results will be logged and added to the TensorBoard above.

In [None]:
# Use a larger batch size for evaluation, which requires less memory.
# Use a larger batch size for evaluation, which requires less memory.
model.batch_size = train_batch_size * 4
model.eval(
    # Evaluate the task that this notebook actually registers ("all_mix5");
    # "all_mix4" only exists in the stale outputs of an earlier session.
    mixture_or_task_name="all_mix5",
    checkpoint_steps="all"
)

INFO:root:system_path_file_exists:gs://t5_bucket_project/models/base/operative_config.gin
ERROR:root:Path not found: gs://t5_bucket_project/models/base/operative_config.gin
INFO:absl:Adding task 'all_mix4' with predict metric_fn(s).
INFO:absl:Load dataset info from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (gs://t5_bucket_project/data/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'all_mix4:validation'
INFO:absl:Skipping packing/padding for 'all_mix4' since sequence length is None.
INFO:absl:Setting sequence lengths to {'inputs': 73, 'targets': 6}
INFO:absl:Evaluating checkpoint step: 1002400


INFO:tensorflow:Using config: {'_model_dir': 'gs://t5_bucket_project/models/base', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.43.244.178:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.43.244.178:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.43.244.178:8470', '_evaluation_master': 'grpc://10.43.244.178:8470', '_i

INFO:absl:Load dataset info from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (gs://t5_bucket_project/data/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'all_mix4:validation'
INFO:absl:Padding 'all_mix4' with sequence lengths: {'inputs': 73, 'targets': 6}


INFO:tensorflow:num_cores_per_replica: 1
INFO:tensorflow:computation_shape: [1, 1, 1, 1]
INFO:tensorflow:num_replicas: 8
INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [1 0 0 1]
  [0 1 0 0]
  [0 1 0 1]
  [1 1 0 0]
  [1 1 0 1]]]
INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0 0]]

 [[0 0 0 1]]

 [[1 0 0 0]]

 [[1 0 0 1]]

 [[0 1 0 0]]

 [[0 1 0 1]]

 [[1 1 0 0]]

 [[1 1 0 1]]]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[4, 2] physical_shape=[2, 2, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[2] physical_shape=[1, 1, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1)]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)]
INFO:tensorflow:SimdMeshImpl init: Shape[batch=4, model=2] LayoutRules{('batch', 'batch'), ('experts', 'batch'), ('ensemble', 'ens

INFO:absl:eval/all_mix4/accuracy at step 1002400: 91.500
INFO:absl:Evaluating checkpoint step: 1002400


INFO:tensorflow:Using config: {'_model_dir': 'gs://t5_bucket_project/models/base', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.43.244.178:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.43.244.178:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.43.244.178:8470', '_evaluation_master': 'grpc://10.43.244.178:8470', '_i

INFO:absl:Load dataset info from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (gs://t5_bucket_project/data/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'all_mix4:validation'
INFO:absl:Padding 'all_mix4' with sequence lengths: {'inputs': 73, 'targets': 6}


INFO:tensorflow:num_cores_per_replica: 1
INFO:tensorflow:computation_shape: [1, 1, 1, 1]
INFO:tensorflow:num_replicas: 8
INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [1 0 0 1]
  [0 1 0 0]
  [0 1 0 1]
  [1 1 0 0]
  [1 1 0 1]]]
INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0 0]]

 [[0 0 0 1]]

 [[1 0 0 0]]

 [[1 0 0 1]]

 [[0 1 0 0]]

 [[0 1 0 1]]

 [[1 1 0 0]]

 [[1 1 0 1]]]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[4, 2] physical_shape=[2, 2, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[2] physical_shape=[1, 1, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1)]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)]
INFO:tensorflow:SimdMeshImpl init: Shape[batch=4, model=2] LayoutRules{('batch', 'batch'), ('experts', 'batch'), ('ensemble', 'ens

INFO:absl:eval/all_mix4/accuracy at step 1002400: 91.500
INFO:absl:Evaluating checkpoint step: 1002400


INFO:tensorflow:Using config: {'_model_dir': 'gs://t5_bucket_project/models/base', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.43.244.178:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.43.244.178:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.43.244.178:8470', '_evaluation_master': 'grpc://10.43.244.178:8470', '_i

INFO:absl:Load dataset info from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (gs://t5_bucket_project/data/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'all_mix4:validation'
INFO:absl:Padding 'all_mix4' with sequence lengths: {'inputs': 73, 'targets': 6}


INFO:tensorflow:num_cores_per_replica: 1
INFO:tensorflow:computation_shape: [1, 1, 1, 1]
INFO:tensorflow:num_replicas: 8
INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [1 0 0 1]
  [0 1 0 0]
  [0 1 0 1]
  [1 1 0 0]
  [1 1 0 1]]]
INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0 0]]

 [[0 0 0 1]]

 [[1 0 0 0]]

 [[1 0 0 1]]

 [[0 1 0 0]]

 [[0 1 0 1]]

 [[1 1 0 0]]

 [[1 1 0 1]]]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[4, 2] physical_shape=[2, 2, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[2] physical_shape=[1, 1, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1)]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)]
INFO:tensorflow:SimdMeshImpl init: Shape[batch=4, model=2] LayoutRules{('batch', 'batch'), ('experts', 'batch'), ('ensemble', 'ens

INFO:absl:eval/all_mix4/accuracy at step 1002400: 91.500
INFO:absl:Evaluating checkpoint step: 1002400


INFO:tensorflow:Using config: {'_model_dir': 'gs://t5_bucket_project/models/base', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.43.244.178:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.43.244.178:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.43.244.178:8470', '_evaluation_master': 'grpc://10.43.244.178:8470', '_i

INFO:absl:Load dataset info from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (gs://t5_bucket_project/data/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'all_mix4:validation'
INFO:absl:Padding 'all_mix4' with sequence lengths: {'inputs': 73, 'targets': 6}


INFO:tensorflow:num_cores_per_replica: 1
INFO:tensorflow:computation_shape: [1, 1, 1, 1]
INFO:tensorflow:num_replicas: 8
INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [1 0 0 1]
  [0 1 0 0]
  [0 1 0 1]
  [1 1 0 0]
  [1 1 0 1]]]
INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0 0]]

 [[0 0 0 1]]

 [[1 0 0 0]]

 [[1 0 0 1]]

 [[0 1 0 0]]

 [[0 1 0 1]]

 [[1 1 0 0]]

 [[1 1 0 1]]]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[4, 2] physical_shape=[2, 2, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[2] physical_shape=[1, 1, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1)]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)]
INFO:tensorflow:SimdMeshImpl init: Shape[batch=4, model=2] LayoutRules{('batch', 'batch'), ('experts', 'batch'), ('ensemble', 'ens

INFO:absl:eval/all_mix4/accuracy at step 1002400: 91.500
INFO:absl:Evaluating checkpoint step: 999900


INFO:tensorflow:Using config: {'_model_dir': 'gs://t5_bucket_project/models/base', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.43.244.178:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.43.244.178:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.43.244.178:8470', '_evaluation_master': 'grpc://10.43.244.178:8470', '_i

INFO:absl:Load dataset info from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (gs://t5_bucket_project/data/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'all_mix4:validation'
INFO:absl:Padding 'all_mix4' with sequence lengths: {'inputs': 73, 'targets': 6}


INFO:tensorflow:num_cores_per_replica: 1
INFO:tensorflow:computation_shape: [1, 1, 1, 1]
INFO:tensorflow:num_replicas: 8
INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [1 0 0 1]
  [0 1 0 0]
  [0 1 0 1]
  [1 1 0 0]
  [1 1 0 1]]]
INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0 0]]

 [[0 0 0 1]]

 [[1 0 0 0]]

 [[1 0 0 1]]

 [[0 1 0 0]]

 [[0 1 0 1]]

 [[1 1 0 0]]

 [[1 1 0 1]]]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[4, 2] physical_shape=[2, 2, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[2] physical_shape=[1, 1, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1)]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)]
INFO:tensorflow:SimdMeshImpl init: Shape[batch=4, model=2] LayoutRules{('batch', 'batch'), ('experts', 'batch'), ('ensemble', 'ens

INFO:absl:eval/all_mix4/accuracy at step 999900: 86.000
INFO:absl:Evaluating checkpoint step: 999900


INFO:tensorflow:Using config: {'_model_dir': 'gs://t5_bucket_project/models/base', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.43.244.178:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.43.244.178:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.43.244.178:8470', '_evaluation_master': 'grpc://10.43.244.178:8470', '_i

INFO:absl:Load dataset info from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (gs://t5_bucket_project/data/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'all_mix4:validation'
INFO:absl:Padding 'all_mix4' with sequence lengths: {'inputs': 73, 'targets': 6}


INFO:tensorflow:num_cores_per_replica: 1
INFO:tensorflow:computation_shape: [1, 1, 1, 1]
INFO:tensorflow:num_replicas: 8
INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [1 0 0 1]
  [0 1 0 0]
  [0 1 0 1]
  [1 1 0 0]
  [1 1 0 1]]]
INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0 0]]

 [[0 0 0 1]]

 [[1 0 0 0]]

 [[1 0 0 1]]

 [[0 1 0 0]]

 [[0 1 0 1]]

 [[1 1 0 0]]

 [[1 1 0 1]]]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[4, 2] physical_shape=[2, 2, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[2] physical_shape=[1, 1, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1)]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)]
INFO:tensorflow:SimdMeshImpl init: Shape[batch=4, model=2] LayoutRules{('batch', 'batch'), ('experts', 'batch'), ('ensemble', 'ens

INFO:absl:eval/all_mix4/accuracy at step 999900: 86.000
INFO:absl:Evaluating checkpoint step: 999900


INFO:tensorflow:Using config: {'_model_dir': 'gs://t5_bucket_project/models/base', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.43.244.178:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.43.244.178:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.43.244.178:8470', '_evaluation_master': 'grpc://10.43.244.178:8470', '_i

INFO:absl:Load dataset info from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (gs://t5_bucket_project/data/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'all_mix4:validation'
INFO:absl:Padding 'all_mix4' with sequence lengths: {'inputs': 73, 'targets': 6}


INFO:tensorflow:num_cores_per_replica: 1
INFO:tensorflow:computation_shape: [1, 1, 1, 1]
INFO:tensorflow:num_replicas: 8
INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [1 0 0 1]
  [0 1 0 0]
  [0 1 0 1]
  [1 1 0 0]
  [1 1 0 1]]]
INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0 0]]

 [[0 0 0 1]]

 [[1 0 0 0]]

 [[1 0 0 1]]

 [[0 1 0 0]]

 [[0 1 0 1]]

 [[1 1 0 0]]

 [[1 1 0 1]]]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[4, 2] physical_shape=[2, 2, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[2] physical_shape=[1, 1, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1)]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)]
INFO:tensorflow:SimdMeshImpl init: Shape[batch=4, model=2] LayoutRules{('batch', 'batch'), ('experts', 'batch'), ('ensemble', 'ens

INFO:absl:eval/all_mix4/accuracy at step 999900: 86.000
INFO:absl:Evaluating checkpoint step: 999900


INFO:tensorflow:Using config: {'_model_dir': 'gs://t5_bucket_project/models/base', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5000, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.43.244.178:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.43.244.178:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.43.244.178:8470', '_evaluation_master': 'grpc://10.43.244.178:8470', '_i

INFO:absl:Load dataset info from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Reusing dataset super_glue (gs://t5_bucket_project/data/super_glue/rte/1.0.2)
INFO:absl:Constructing tf.data.Dataset super_glue for split validation, from gs://t5_bucket_project/data/super_glue/rte/1.0.2
INFO:absl:Automatically caching small dataset in memory: 'all_mix4:validation'
INFO:absl:Padding 'all_mix4' with sequence lengths: {'inputs': 73, 'targets': 6}


INFO:tensorflow:num_cores_per_replica: 1
INFO:tensorflow:computation_shape: [1, 1, 1, 1]
INFO:tensorflow:num_replicas: 8
INFO:tensorflow:device_assignment.topology.device_coordinates: [[[0 0 0 0]
  [0 0 0 1]
  [1 0 0 0]
  [1 0 0 1]
  [0 1 0 0]
  [0 1 0 1]
  [1 1 0 0]
  [1 1 0 1]]]
INFO:tensorflow:device_assignment.core_assignment: [[[0 0 0 0]]

 [[0 0 0 1]]

 [[1 0 0 0]]

 [[1 0 0 1]]

 [[0 1 0 0]]

 [[0 1 0 1]]

 [[1 1 0 0]]

 [[1 1 0 1]]]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[4, 2] physical_shape=[2, 2, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_shape=[2] physical_shape=[1, 1, 2]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1)]
INFO:tensorflow:auto_logical_to_physical_tpu logical_to_physical = [(0, 0, 0), (0, 0, 1), (0, 1, 0), (0, 1, 1), (1, 1, 0), (1, 1, 1), (1, 0, 0), (1, 0, 1)]
INFO:tensorflow:SimdMeshImpl init: Shape[batch=4, model=2] LayoutRules{('batch', 'batch'), ('experts', 'batch'), ('ensemble', 'ens

INFO:absl:eval/all_mix4/accuracy at step 999900: 86.000


Let's look at a few random predictions from the validation sets. Note that we measure accuracy based on an *exact match* of the predicted answer and the ground-truth answer. As a result, some of the answers are semantically correct but are counted wrong by the exact match score.

# Export Model for Serving

As mentioned in the previous section, exporting a [`SavedModel`](https://www.tensorflow.org/guide/saved_model) can be useful for improving performance during inference or allowing your model to be deployed on a variety of platforms (e.g., TFLite, TensorFlow.js, TensorFlow Serving, or TensorFlow Hub).

**Note:** we currently only support exporting a SavedModel that runs on both CPU and GPU, not TPU.

## Export SavedModel

We first export the SavedModel. We set a batch size of 1 for simplicity, but it may be more efficient to use a larger batch size if you want to handle multiple requests per call.

For 3B and 11B models the export will take approximately 30-45 minutes.

In [None]:
export_dir = os.path.join(MODEL_DIR, "export")

# make one prediction per call
model.batch_size = 1

export_options = dict(
    checkpoint_step=-1,  # use most recent
    beam_size=1,  # no beam search
    temperature=1.0,  # sample according to predicted distribution
)
saved_model_path = model.export(export_dir, **export_options)
print("Model saved to:", saved_model_path)

In [None]:
import numpy as np
import pandas as pd
from sklearn.utils import shuffle

# Build a small, balanced, binary (entailment vs. not-entailment) sample from
# the SNLI test split, shaped like an RTE-style table with the columns
# hypothesis / idx / label / premise.
SNLI_TEST_CSV = '/content/drive/MyDrive/snli_1.0_test.csv'
EXAMPLES_PER_CLASS = 100
SHUFFLE_SEED = 0

df = pd.read_csv(SNLI_TEST_CSV)
df = df[['gold_label', 'sentence1', 'sentence2']]
df = df.rename(columns={"sentence1": "premise", "sentence2": "hypothesis"})

# Positive class: pairs whose gold label is "entailment".
# NOTE(review): entailment is encoded as 1 and not-entailment as 0 here, while
# the TFDS GLUE/RTE label order is entailment=0, not_entailment=1 -- confirm
# this matches what the downstream preprocessor expects.
entailment = (
    df.loc[df['gold_label'] == 'entailment']
    .head(EXAMPLES_PER_CLASS)
    .drop(columns=['gold_label'])
    .reset_index(drop=True)
    # A scalar label works for any number of rows and avoids the removed
    # `np.int` alias.
    .assign(label=1)
)

# Negative class: "neutral" and "contradiction" pairs. Rows labelled "-" (no
# annotator consensus in SNLI) are excluded. The original selected
# `gold_label == 'entailment'` here as well, so both classes contained
# entailment examples.
not_entailment = (
    df.loc[df['gold_label'].isin(['neutral', 'contradiction'])]
    .head(EXAMPLES_PER_CLASS)
    .drop(columns=['gold_label'])
    .reset_index(drop=True)
    .assign(label=0)
)

# ignore_index gives every row a unique position, so `idx` is a unique id
# rather than 0..99 repeated once per class.
df = pd.concat([entailment, not_entailment], ignore_index=True)
df['idx'] = df.index

# Seeded shuffle so that Restart & Run All reproduces the same ordering.
df = shuffle(df, random_state=SHUFFLE_SEED)
df = df[['hypothesis', 'idx', 'label', 'premise']]
df

In [None]:
# `tf` is already bound to `tensorflow.compat.v1` by the setup cell. It is
# deliberately not re-imported here: `import tensorflow as tf` would rebind the
# alias to the TF2 API for every cell executed afterwards.

# Pair each (hypothesis, premise) string row with its (idx, label) integer row.
custome_dataset = tf.data.Dataset.from_tensor_slices(
    (
        tf.cast(df[['hypothesis', 'premise']].values, tf.string),
        tf.cast(df[['idx', 'label']].values, tf.int32),
    )
)

# Print a single element as a sanity check of the dataset structure.
for features_tensor, target_tensor in custome_dataset.take(1):
    print(f'features:{features_tensor} target:{target_tensor}')

## Re-run with a custom dataset



In [None]:
# Load the "custome" dataset through TFDS, storing prepared data in DATA_DIR.
ds = tfds.load(
  "custome",
  data_dir=DATA_DIR,
  # Download data locally for preprocessing to avoid using GCS space.
  download_and_prepare_kwargs={"download_dir": "./new"})

# With no `split` argument, tfds.load returns a dict keyed by split name, so a
# split has to be selected before calling `.take()`.
# NOTE(review): assumes the dataset defines a "validation" split -- confirm.
print("A few raw validation examples...")
for ex in tfds.as_numpy(ds["validation"].take(2)):
  print(ex)

In [None]:
import seqio
import t5.data
from t5.data import postprocessors
from t5.data import preprocessors
from t5.data.glue_utils import get_glue_metric
from t5.data.glue_utils import get_glue_postprocess_fn
from t5.data.glue_utils import get_glue_text_preprocessor
from t5.data.glue_utils import get_super_glue_metric
from t5.evaluation import metrics
import tensorflow_datasets as tfds

# Feature spec shared by the tasks below: both sides use the default T5
# vocabulary and get an EOS token appended; only "inputs" is optional.
DEFAULT_OUTPUT_FEATURES = dict(
    inputs=seqio.Feature(
        vocabulary=t5.data.get_default_vocabulary(),
        add_eos=True,
        required=False,
    ),
    targets=seqio.Feature(
        vocabulary=t5.data.get_default_vocabulary(),
        add_eos=True,
    ),
)


# NOTE(review): `nq_dataset_fn`, `num_nq_examples` and `config` are not defined
# in this cell; they are expected to come from earlier cells -- verify they
# exist before running.

# Drop any previous registration so this cell can be re-run without a
# duplicate-registration error.
seqio.TaskRegistry.remove("new")
seqio.TaskRegistry.add(
    "new",
    # Specify the task source.
    source=seqio.FunctionDataSource(
        # Supply a function which returns a tf.data.Dataset.
        dataset_fn=nq_dataset_fn,
        splits=["validation"],
        # Not required, but helps for mixing and auto-caching.
        num_input_examples=num_nq_examples),
    # Supply a list of functions that preprocess the input tf.data.Dataset.
    preprocessors=[
        get_glue_text_preprocessor(config),
        seqio.preprocessors.tokenize,
        seqio.CacheDatasetPlaceholder(),
        seqio.preprocessors.append_eos_after_trim,
    ],
    metric_fns=get_glue_metric(config.name),
    output_features=DEFAULT_OUTPUT_FEATURES,
    postprocess_fn=get_glue_postprocess_fn(config))

# Run settings; only `train_batch_size` is used in this cell.
model_parallelism = 2
train_batch_size = 128
keep_checkpoint_max = 8

# Evaluation requires less memory than training, so use a 4x larger batch.
model.batch_size = 4 * train_batch_size
model.eval(mixture_or_task_name="all_mix", checkpoint_steps="all")

In [None]:
# Evaluation requires less memory than training, so use a 4x larger batch.
# NOTE(review): the task registered in the visible cells is named "new";
# confirm that "new_rte" is registered elsewhere in the notebook.
model.batch_size = 4 * train_batch_size
model.eval(mixture_or_task_name="new_rte", checkpoint_steps="all")