In [2]:
import os
# Set before TensorFlow is imported so that it sees no CUDA device; the cuInit
# message in this notebook's logs (CUDA_ERROR_NO_DEVICE) reflects this.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

import tensorflow as tf
import tensorflow_datasets as tfds
import t5
import functools
from t5 import models

# SentencePiece model file used for the vocabulary further down.
# Uncomment the download on first run.
# !wget https://f000.backblazeb2.com/file/malaya-model/bpe/sp10m.cased.ms-en.model
vocab = 'sp10m.cased.ms-en.model'

In [3]:
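# One-time environment setup: uncomment to install the pinned t5 release
# without pulling in its dependencies.
# !pip3 install t5==0.9.3 --no-deps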

In [4]:
# List the installed packages whose names contain 'tensorflow', so the
# environment this notebook ran in is recorded alongside the results.
!pip3 freeze | grep 'tensorflow'

albert-tensorflow==1.1
bert-tensorflow==1.0.1
mesh-tensorflow==0.1.21
nvidia-tensorflow==1.15.5+nv22.5
rotary-embedding-tensorflow==0.1.1
tensorflow-addons==0.12.0
tensorflow-datasets==4.6.0
tensorflow-estimator==2.6.0
tensorflow-gpu==2.6.0
tensorflow-hub==0.12.0
tensorflow-io-gcs-filesystem==0.24.0
tensorflow-metadata==1.7.0
tensorflow-model-analysis==0.38.0
tensorflow-probability==0.13.0
tensorflow-serving-api==2.8.0
tensorflow-text==2.6.0


In [5]:
# List the installed packages whose names contain 'keras'.
!pip3 freeze | grep 'keras'

keras==2.6.0


In [6]:
# Write a one-row, tab-separated toy corpus to 'test.tsv': first column is the
# source sentence, second column is its translation.
source_text = 'i like u'
target_text = 'saya suka awak'
with tf.io.gfile.GFile('test.tsv', "w") as outfile:
    outfile.write("%s\t%s\n" % (source_text, target_text))

In [7]:
def translation_dataset(split, shuffle_files=False):
    """Build the raw translation dataset from the local ``test.tsv`` file.

    Args:
        split: Requested split name. It is not consulted; the same file is
            read whatever value is passed.
        shuffle_files: Discarded immediately; the file list is fixed.

    Returns:
        A ``tf.data.Dataset`` whose elements are dicts with the first TSV
        column under ``'question'`` and the second under ``'answer'``.
    """
    del shuffle_files

    column_names = ['question', 'answer']

    # Split every line on tabs into two string fields; quote characters are
    # treated as ordinary text rather than as field delimiters.
    parse_line = functools.partial(
        tf.io.decode_csv,
        record_defaults=['', ''],
        field_delim='\t',
        use_quote_delim=False,
    )

    def label_columns(*fields):
        # Pair the positional fields with their column names.
        return dict(zip(column_names, fields))

    lines = tf.data.TextLineDataset(['test.tsv'])
    parsed = lines.map(
        parse_line,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )
    return parsed.map(label_columns)


def translation_preprocessor(ds):
    """Turn question/answer examples into text-to-text inputs and targets.

    Args:
        ds: Dataset of dicts carrying ``'question'`` and ``'answer'`` entries.

    Returns:
        A dataset of dicts with ``'inputs'`` (the question prefixed with the
        task prompt) and ``'targets'`` (the answer, passed through unchanged).
    """

    def add_task_prefix(example):
        prompt = tf.strings.join(
            ['terjemah Inggeris ke Melayu: ', example['question']]
        )
        return {'inputs': prompt, 'targets': example['answer']}

    return ds.map(
        add_task_prefix,
        num_parallel_calls=tf.data.experimental.AUTOTUNE,
    )

In [8]:
import seqio

# Defaults consumed by get_default_vocabulary().
DEFAULT_SPM_PATH = vocab  # SentencePiece model file
DEFAULT_EXTRA_IDS = 100  # forwarded as `extra_ids`


def get_default_vocabulary():
    """Return a SentencePiece vocabulary built from the defaults above."""
    return seqio.SentencePieceVocabulary(
        sentencepiece_model_file=DEFAULT_SPM_PATH,
        extra_ids=DEFAULT_EXTRA_IDS,
    )

In [9]:
# Clear any earlier registration under this name before adding the task
# (presumably so the cell can be re-run in one kernel session -- TODO confirm).
t5.data.TaskRegistry.remove('translation_dataset')

t5.data.TaskRegistry.add(
    'translation_dataset',
    # Data source and the split names it advertises.
    dataset_fn=translation_dataset,
    splits=['train'],
    # Text-level preprocessing and the vocabulary feature spec.
    text_preprocessor=[translation_preprocessor],
    output_features=seqio.Feature(get_default_vocabulary()),
    # Evaluation: predictions go through lower_text and are scored by accuracy.
    postprocess_fn=t5.data.postprocessors.lower_text,
    metric_fns=[t5.evaluation.metrics.accuracy],
)
# Register a mixture holding only the translation task, after removing any
# earlier registration under the same mixture name.
t5.data.MixtureRegistry.remove('translation_bahasa')
t5.data.MixtureRegistry.add(
    'translation_bahasa', ['translation_dataset'], default_rate=1.0
)

<seqio.dataset_providers.Mixture at 0x7f414172f5b0>

In [10]:
# Pull examples through the registered task to sanity-check preprocessing and
# tokenization. 'train' is the only split the task registers; the dataset
# function never reads the split value, so the examples are unaffected.
translation_task = t5.data.TaskRegistry.get("translation_dataset")
ds = translation_task.get_dataset(
    split='train',
    sequence_length={"inputs": 512, "targets": 512},
)
r = tfds.as_numpy(ds)

2022-07-05 14:05:23.312228: E tensorflow/stream_executor/cuda/cuda_driver.cc:271] failed call to cuInit: CUDA_ERROR_NO_DEVICE: no CUDA-capable device is detected
2022-07-05 14:05:23.312255: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:169] retrieving CUDA diagnostic information for host: huseincomel-desktop
2022-07-05 14:05:23.312258: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:176] hostname: huseincomel-desktop
2022-07-05 14:05:23.312301: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:200] libcuda reported version is: Not found: was unable to find libcuda.so DSO loaded into this program
2022-07-05 14:05:23.312319: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:204] kernel reported version is: 470.129.6
2022-07-05 14:05:23.312488: I tensorflow/core/platform/cpu_feature_guard.cc:142] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F 

In [11]:
# Show the first tokenized example, using the public iterator protocol rather
# than the iterable's private factory attribute.
next(iter(r))

2022-07-05 14:05:23.797967: I tensorflow/compiler/mlir/mlir_graph_optimization_pass.cc:185] None of the MLIR Optimization Passes are enabled (registered 2)


{'inputs_pretokenized': b'terjemah Inggeris ke Melayu: i like u',
 'inputs': array([   13, 26087,  2040,    55,  1550,    31,    13,    91,   164,
           13,   354,     1], dtype=int32),
 'targets_pretokenized': b'saya suka awak',
 'targets': array([   67,  1259, 12367,     1], dtype=int32)}

In [12]:
# Hyperparameters for the MtfModel constructed in the next cell.
model_parallelism = 1
train_batch_size = 2
keep_checkpoint_max = 5

In [13]:
# Directory handed to MtfModel as its `model_dir`.
BASE_DIR = 'out'
model = t5.models.MtfModel(
    model_dir=BASE_DIR,
    # No TPU is configured for this run.
    tpu=None,
    tpu_topology=None,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={'inputs': 512, 'targets': 512},
    # A plain float rather than a schedule function.
    learning_rate_schedule=0.0005,
    save_checkpoints_steps=5,
    # Use the notebook-level setting instead of repeating the literal, in line
    # with model_parallelism and train_batch_size.
    keep_checkpoint_max=keep_checkpoint_max,
    iterations_per_loop=100,
)

In [14]:
# First run only: uncomment the two commands below to download and unpack the
# pretrained checkpoint archive.
# !wget https://f000.backblazeb2.com/file/malaya-model/pretrained/t5-tiny-social-media-2021-11-15.tar.gz
# !tar -zxf t5-tiny-social-media-2021-11-15.tar.gz
# List the contents of the checkpoint directory.
!ls t5-tiny-social-media

checkpoint				model.ckpt-1000000.index
model.ckpt-1000000.data-00000-of-00002	model.ckpt-1000000.meta
model.ckpt-1000000.data-00001-of-00002	operative_config.gin


In [15]:
# Settings passed to model.finetune(): the directory holding the pretrained
# checkpoint, and the value given as `finetune_steps`.
MODEL_DIR = 't5-tiny-social-media'
FINETUNE_STEPS = 50000

In [16]:
# Overwrite the 'checkpoint' index file inside MODEL_DIR so that it names the
# checkpoint by a bare relative prefix.
# NOTE(review): presumably the index shipped in the archive pointed somewhere
# that does not exist on this machine -- confirm.
checkpoint_index = os.path.join(MODEL_DIR, 'checkpoint')
with open(checkpoint_index, 'w') as index_file:
    index_file.write('model_checkpoint_path: "model.ckpt-1000000"')

# Sanity check: display the checkpoint prefix TensorFlow resolves in MODEL_DIR.
tf.train.latest_checkpoint(MODEL_DIR)

't5-tiny-social-media/model.ckpt-1000000'

In [None]:
# Fine-tune on the registered 'translation_dataset' task, taking the
# pretrained model from MODEL_DIR and passing FINETUNE_STEPS as the step count.
model.finetune(
    mixture_or_task_name='translation_dataset',
    pretrained_model_dir=MODEL_DIR,
    finetune_steps=FINETUNE_STEPS,
)

INFO:tensorflow:Using config: {'_model_dir': 'out', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100, num_shards=None, num_cores_per_replica=1, per_host_input_for_training=4, tpu_job_na

INFO:tensorflow:Using config: {'_model_dir': 'out', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': 5, '_save_checkpoints_secs': None, '_session_config': allow_soft_placement: true
graph_options {
  rewrite_options {
    meta_optimizer_iterations: ONE
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': '', '_evaluation_master': '', '_is_chief': True, '_num_ps_replicas': 0, '_num_worker_replicas': 1, '_tpu_config': TPUConfig(iterations_per_loop=100, num_shards=None, num_cores_per_replica=1, per_host_input_for_training=4, tpu_job_na

INFO:tensorflow:_TPUContext: eval_on_tpu True


INFO:tensorflow:_TPUContext: eval_on_tpu True






Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.


Instructions for updating:
Use Variable.read_value. Variables in 2.X are initialized automatically both in eager and graph (inside tf.defun) contexts.
