In [None]:
"""
Google cloud variables.
"""
# The <...> tokens below are template placeholders: replace each one with a
# quoted string (e.g. 'my-project') before running this cell.
# GCLOUD_PROJECT is passed to `gcloud config set project` in the setup cell.
GCLOUD_PROJECT=<GOOGLE_CLOUD_PROJECT>
# GCS_BUCKET is the bucket name used in every gs:// path in this notebook.
GCS_BUCKET=<CLOUD_STORAGE_BUCKET>
# Path inside the bucket that receives the checkpoints and the exported model.
GCS_MODEL_DIR='so-quality/t5-model/01'

## environment setup

- authentication for GCS access
- install packages
- tf distribution strategy

In [None]:
from google.colab import auth
# Authenticate the Colab user; the setup notes list this as required for GCS access.
auth.authenticate_user()

# Point the gcloud CLI at the project named by GCLOUD_PROJECT.
!gcloud config set project $GCLOUD_PROJECT

Updated property [core/project].


In [None]:
# transformers is pinned to 4.11.3; the train_step/test_step overrides defined
# later in this notebook are a workaround for a bug in that line of releases.
%pip install transformers==4.11.3
# NOTE(review): sentencepiece and tf-models-official are not pinned, so a fresh
# run may pull different versions than the ones this notebook was developed with.
%pip install sentencepiece
%pip install -q -U tf-models-official



In [None]:
import os
import numpy as np
import tensorflow as tf
from official.nlp import optimization

# Pick a tf.distribute strategy based on the hardware that is available.
# `strategy` is used by the later cells via `strategy.scope()`.
if os.environ.get('COLAB_TPU_ADDR'):
  # A non-empty COLAB_TPU_ADDR is taken as the sign that a Colab TPU is attached.
  # tpu='' presumably lets the resolver auto-detect the Colab TPU — TODO confirm.
  cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver(tpu='')
  # Connect to the TPU cluster and initialise it before building the strategy.
  tf.config.experimental_connect_to_cluster(cluster_resolver)
  tf.tpu.experimental.initialize_tpu_system(cluster_resolver)
  strategy = tf.distribute.TPUStrategy(cluster_resolver)
  print('Using TPU')
elif tf.config.list_physical_devices('GPU'):
  # At least one GPU is visible: use MirroredStrategy.
  strategy = tf.distribute.MirroredStrategy()
  print('Using GPU')
else:
  # Neither TPU nor GPU: fall back to whatever strategy is currently active.
  strategy = tf.distribute.get_strategy()
  print('Running on CPU is not recommended.')

INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Clearing out eager caches


INFO:tensorflow:Initializing the TPU system: grpc://10.96.127.74:8470


INFO:tensorflow:Initializing the TPU system: grpc://10.96.127.74:8470


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Finished initializing TPU system.


INFO:tensorflow:Found TPU system:


INFO:tensorflow:Found TPU system:


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Cores: 8


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Workers: 1


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Num TPU Cores Per Worker: 8


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:localhost/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:CPU:0, CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:0, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:1, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:2, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:3, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:4, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:5, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:6, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU:7, TPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:TPU_SYSTEM:0, TPU_SYSTEM, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


INFO:tensorflow:*** Available Device: _DeviceAttributes(/job:worker/replica:0/task:0/device:XLA_CPU:0, XLA_CPU, 0, 0)


Using TPU


## dataset

With the TFRecord format the dataset can be read directly in the TPU nodes.

In [None]:
def tf_record_decoder(encoded):
    """
    Parse one serialized example into a dict of fixed-length int64 tensors.

    The returned dict has the keys "input_ids" and "attention_mask" (length 512),
    "labels" and "decoder_attention_mask" (length 2) and "class" (length 1).
    """
    def int64_feature(length):
        # Every field in the record is a fixed-length vector of int64 values.
        return tf.io.FixedLenFeature([length], tf.int64)

    schema = {
        "input_ids": int64_feature(512),
        "attention_mask": int64_feature(512),
        "labels": int64_feature(2),
        "decoder_attention_mask": int64_feature(2),
        "class": int64_feature(1),
    }
    return tf.io.parse_single_example(encoded, schema)

def dataset_prepare(dataset, batch_size=32, training=False, drop_remainder=False,
                    shuffle_buffer=1024):
    """
    Turn a dataset of serialized examples into batched, decoded model inputs.

    Args:
        dataset: dataset of serialized examples understood by `tf_record_decoder`.
        batch_size: number of examples per batch.
        training: when True, the examples are shuffled before batching.
        drop_remainder: when True, a final batch smaller than `batch_size` is
            dropped. The default (False) keeps it, as before.
        shuffle_buffer: size of the shuffle buffer used when `training` is True.

    Returns:
        The decoded, cached, optionally shuffled, batched and prefetched dataset.
    """
    dataset = dataset.map(tf_record_decoder, num_parallel_calls=tf.data.AUTOTUNE)
    # Cache the decoded examples so later epochs skip reading and parsing.
    dataset = dataset.cache()

    if training:
        # Shuffle after the cache so that every epoch sees a different order.
        dataset = dataset.shuffle(shuffle_buffer)

    dataset = dataset.batch(batch_size, drop_remainder=drop_remainder)
    # Prefetch last so that preparing the next batch overlaps with the current step.
    return dataset.prefetch(buffer_size=tf.data.AUTOTUNE)

In [None]:
# Raw (still serialized) train/validation records, read straight from the GCS bucket.
ds_tr_train = tf.data.TFRecordDataset(f'gs://{GCS_BUCKET}/so-quality/dataset_t5_train.tfrecord')
ds_tr_valid = tf.data.TFRecordDataset(f'gs://{GCS_BUCKET}/so-quality/dataset_t5_valid.tfrecord')

In [None]:
# Decode and batch both splits; only the training split is shuffled.
with strategy.scope():
    ds_train = dataset_prepare(ds_tr_train, training=True)
    ds_valid = dataset_prepare(ds_tr_valid)

## Import pre-trained model

In [None]:
from transformers import T5Tokenizer, TFT5ForConditionalGeneration

# Load the pre-trained 't5-base' tokenizer and TensorFlow model.
# Done inside the strategy scope, presumably so that the model variables are
# created under the selected distribution strategy — TODO confirm.
with strategy.scope():
    tokenizer = T5Tokenizer.from_pretrained('t5-base')
    model = TFT5ForConditionalGeneration.from_pretrained('t5-base')

DEBUG:filelock:Attempting to acquire lock 140161378671824 on /root/.cache/huggingface/transformers/748a176e9d151dcad63a27974db8b8f665f286954cfbb77008ca42163419ff66.6a323429db2b09562cffdb9bc72d09d08bccbca1d832434b183b867864c30526.h5.lock
DEBUG:filelock:Lock 140161378671824 acquired on /root/.cache/huggingface/transformers/748a176e9d151dcad63a27974db8b8f665f286954cfbb77008ca42163419ff66.6a323429db2b09562cffdb9bc72d09d08bccbca1d832434b183b867864c30526.h5.lock


Downloading:   0%|          | 0.00/851M [00:00<?, ?B/s]

DEBUG:filelock:Attempting to release lock 140161378671824 on /root/.cache/huggingface/transformers/748a176e9d151dcad63a27974db8b8f665f286954cfbb77008ca42163419ff66.6a323429db2b09562cffdb9bc72d09d08bccbca1d832434b183b867864c30526.h5.lock
DEBUG:filelock:Lock 140161378671824 released on /root/.cache/huggingface/transformers/748a176e9d151dcad63a27974db8b8f665f286954cfbb77008ca42163419ff66.6a323429db2b09562cffdb9bc72d09d08bccbca1d832434b183b867864c30526.h5.lock
All model checkpoint layers were used when initializing TFT5ForConditionalGeneration.

All the layers of TFT5ForConditionalGeneration were initialized from the model checkpoint at t5-base.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFT5ForConditionalGeneration for predictions without further training.


### Workaround for a bug in {train,test}_step

There was an issue with using metrics with the Model train API which was fixed in https://github.com/huggingface/transformers/pull/14009


In [None]:
from tensorflow.python.keras.engine import data_adapter

def train_step(self, data):
    """
    Variant of Keras's default train_step that tidies the reported metrics when a dummy loss is used.

    The compiled metrics are updated with the `logits` entry of the model output
    rather than with the whole output object.
    """
    # Apply the same transformations `Model.fit` applies to a `tf.data.Dataset` input.
    features, targets, weights = data_adapter.unpack_x_y_sample_weight(
        data_adapter.expand_1d(data)
    )
    # Unlike the base method: when the labels travel inside the input dict (the loss
    # is computed internally), reuse them as targets for the metric computations.
    if targets is None and "labels" in features:
        targets = features["labels"]
    # Forward pass, recorded on the tape.
    with tf.GradientTape() as tape:
        outputs = self(features, training=True)
        loss = self.compiled_loss(targets, outputs, weights, regularization_losses=self.losses)
    # Backward pass.
    self.optimizer.minimize(loss, self.trainable_variables, tape=tape)
    self.compiled_metrics.update_state(targets, outputs['logits'], weights)
    # Gather the current value of every metric.
    collected = {}
    for metric in self.metrics:
        value = metric.result()
        if not isinstance(value, dict):
            collected[metric.name] = value
        else:
            collected.update(value)
    # Also unlike the base method: with a dummy loss a bogus "loss_loss" entry shows
    # up next to "loss"; drop it from the displayed metrics.
    if "loss" in collected:
        collected.pop("loss_loss", None)
    return collected

def test_step(self, data):
    """
    Variant of Keras's default test_step that tidies the reported metrics when a dummy loss is used.

    The compiled metrics are updated with the `logits` entry of the model output
    rather than with the whole output object.
    """
    features, targets, weights = data_adapter.unpack_x_y_sample_weight(
        data_adapter.expand_1d(data)
    )
    # Unlike the base method: when the labels travel inside the input dict (the loss
    # is computed internally), reuse them as targets for the metric computations.
    if targets is None and "labels" in features:
        targets = features["labels"]
    outputs = self(features, training=False)
    if self.loss:
        # A loss is configured: it is reported through the compiled loss below.
        collected = {}
    else:
        # No loss configured: track the loss returned by the model itself.
        self.loss_tracker.update_state(outputs.loss)
        collected = {"loss": self.loss_tracker.result()}
    # Run in both cases so that the stateful loss metrics are updated.
    self.compiled_loss(targets, outputs, weights, regularization_losses=self.losses)
    self.compiled_metrics.update_state(targets, outputs['logits'], weights)
    # Gather the current value of every metric.
    for metric in self.metrics:
        value = metric.result()
        if not isinstance(value, dict):
            collected[metric.name] = value
        else:
            collected.update(value)
    # Also unlike the base method: with a dummy loss a bogus "loss_loss" entry shows
    # up next to "loss"; drop it from the displayed metrics.
    if "loss" in collected:
        collected.pop("loss_loss", None)
    return collected


In [None]:
import functools
# Override the model's train/test steps on this instance only; `partial` binds
# `model` as the `self` argument of the module-level functions defined above.
model.train_step = functools.partial(train_step, model)
model.test_step = functools.partial(test_step, model)

## Define a class accuracy metric

The metric is used to save the model weights that achieve the highest accuracy on the validation set, which acts as a form of regularization.

In [None]:
def _accuracy(y_true, y_pred):
    """
    Sparse categorical accuracy computed on index 0 of the second axis only.

    Presumably `y_true[:, 0]` is the first label token (the class token) and
    `y_pred[:, 0]` the logits for that position — TODO confirm.
    """
    first_label = y_true[:, 0]
    first_prediction = y_pred[:, 0]
    return tf.keras.metrics.sparse_categorical_accuracy(first_label, first_prediction)

class ClassificationAccuracy(tf.keras.metrics.MeanMetricWrapper):
  """Keras metric that wraps `_accuracy` in a `MeanMetricWrapper`.

  The default name is 'accuracy'; the training cell monitors it on the
  validation data as 'val_accuracy'.
  """

  def __init__(self, name='accuracy', **kwargs):
    super().__init__(_accuracy, name=name, **kwargs)


## Model compile

- sets up training hyperparameters (learning rate)
- Unwrapping the `model.loss` dictionary is done so that `save_weights` works correctly. Otherwise `save_weights` throws an exception that a trackable has been modified.

In [None]:
epochs = 50
# NOTE(review): batch_size is not passed to dataset_prepare, which relies on its
# own default of 32 — keep the two values in sync.
batch_size = 32
init_lr = 1e-5

# Training batches per epoch; must match the number of batches the training
# cell feeds to `model.fit` per epoch, since it sizes the optimizer schedule.
steps_per_epoch = 1406
num_train_steps = steps_per_epoch * epochs
# Warm up over the first 10% of the training steps.
num_warmup_steps = num_train_steps // 10

with strategy.scope():

    optimizer = optimization.create_optimizer(
        init_lr=init_lr,
        num_train_steps=num_train_steps,
        num_warmup_steps=num_warmup_steps,
        optimizer_type='adamw')

    # No loss is passed: per the message logged by compile, the model's internal
    # loss computation is used.
    model.compile(optimizer=optimizer, metrics=[ClassificationAccuracy()])
    # Replace model.loss with a plain dict so that `save_weights` works; otherwise
    # it raises because a trackable has been modified (see the section notes).
    model.loss = dict(model.loss)

INFO:absl:using Adamw optimizer
INFO:absl:gradient_clip_norm=1.000000
No loss specified in compile() - the model's internal loss computation will be used as the loss. Don't panic - this is a common way to train TensorFlow models in Transformers! Please ensure your labels are passed as the 'labels' key of the input dict so that they are accessible to the model during the forward pass. To disable this behaviour, please pass a loss argument, or explicitly pass loss=None if you do not want your model to compute a loss.


## Train

- The last batch with 8 examples, rather than 32, generates a NaN loss on TPU (but not on CPU). Use only the full batches.
- Use only a subset of the validation set in order to save on computation costs.

In [None]:
checkpoint_filepath = f'gs://{GCS_BUCKET}/{GCS_MODEL_DIR}/checkpoint'

# Write weights-only checkpoints to GCS, keeping only the epoch with the highest
# validation accuracy ('val_accuracy' is the ClassificationAccuracy metric,
# named 'accuracy', evaluated on the validation data).
model_checkpoint_callback = tf.keras.callbacks.ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True,
    monitor='val_accuracy',
    mode='max',
    save_best_only=True)

# Number of validation batches evaluated after each epoch (a subset of the
# validation set, to save on computation costs).
validation_steps = 200

with strategy.scope():
    # Feed exactly `steps_per_epoch` batches per epoch: this is the value the
    # optimizer schedule was sized with, and it covers only the full batches
    # (the smaller last batch produces a NaN loss on TPU).
    history = model.fit(
                x=ds_train.take(steps_per_epoch),
                validation_data=ds_valid.take(validation_steps),
                callbacks=[model_checkpoint_callback],
                epochs=epochs)

# NOTE(review): this exports the in-memory weights as they are after the last
# epoch; the best-val_accuracy weights are the ones saved at `checkpoint_filepath`.
model.save_pretrained('t5-model')

Epoch 1/50


INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 1) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 2) dtype=int64>, <tf.Tensor 'cond_8/Identity_3:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_4:0' shape=(None, 2) dtype=int64>]
INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 1) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 2) dtype=int64>, <tf.Tensor 'cond_8/Identity_3:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_4:0' shape=(None, 2) dtype=int64>]


   1406/Unknown - 463s 229ms/step - loss: 6.5669 - accuracy: 0.1802

INFO:absl:TPU has inputs with dynamic shapes: [<tf.Tensor 'Const:0' shape=() dtype=int32>, <tf.Tensor 'cond_8/Identity:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_1:0' shape=(None, 1) dtype=int64>, <tf.Tensor 'cond_8/Identity_2:0' shape=(None, 2) dtype=int64>, <tf.Tensor 'cond_8/Identity_3:0' shape=(None, 512) dtype=int64>, <tf.Tensor 'cond_8/Identity_4:0' shape=(None, 2) dtype=int64>]


Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
# Copy the local `t5-model` export directory to the model directory in the GCS bucket.
!gsutil rsync t5-model gs://$GCS_BUCKET/$GCS_MODEL_DIR/

Building synchronization state...
Starting synchronization...
Copying file://t5-model/config.json [Content-Type=application/json]...
Copying file://t5-model/tf_model.h5 [Content-Type=application/octet-stream]...
==> NOTE: You are uploading one or more large file(s), which would run
significantly faster if you enable parallel composite uploads. This
feature can be enabled by editing the
"parallel_composite_upload_threshold" value in your .boto
configuration file. However, note that if you do this large files will
be uploaded as `composite objects
<https://cloud.google.com/storage/docs/composite-objects>`_,which
means that any user who downloads such objects will need to have a
compiled crcmod installed (see "gsutil help crcmod"). This is because
without a compiled crcmod, computing checksums on composite objects is
so slow that gsutil disables downloads of composite objects.

- [2 files][850.8 MiB/850.8 MiB]                                                
Operation completed over 2 obje