### Creation of the environment

In [None]:
# Colab magic selecting the TensorFlow 2.x runtime.
%tensorflow_version 2.x
!pip3 install --upgrade pip
# Disabled alternative: install the released t5 package from PyPI.
#!pip install -qU t5
# NOTE(review): t5 is installed from the head of the GitHub repository with no
# pinned commit, so re-running this cell later may pull a different version.
!pip3 install git+https://github.com/google-research/text-to-text-transfer-transformer.git #extra_id_x support

import functools
import os
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5

# Set the base directory (Google Cloud Storage bucket).
BASE_DIR = "gs://bucket_code_completion" 

# Fail fast if the bucket was left empty or as the bare "gs://" prefix.
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
# When True, the following code connects to the Colab TPU and sets up GCS
# credentials, and adjusts the logging configuration.
ON_CLOUD = True


if ON_CLOUD:
  # Colab/GCS helpers are imported only when running on the cloud.
  import tensorflow_gcs_config
  from google.colab import auth
  TPU_TOPOLOGY = "2x2"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise a regular Exception subclass (not a bare BaseException) so that
    # ordinary `except Exception` handlers see it, and chain the resolver
    # error to keep the root cause in the traceback.
    raise RuntimeError('ERROR: Not connected to a TPU runtime; please see the previous cell in this notebook for instructions!') from err
  # Set credentials for GCS reading/writing from Colab and TPU.
  auth.authenticate_user()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()

# Disable TF2 behavior in the tf.compat.v1 API for the rest of the notebook.
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Stop the TensorFlow logger from propagating records to the root logger
  # (presumably to avoid duplicated lines in the Colab output - TODO confirm)
  # and set the root logger level to INFO.
  tf.get_logger().propagate = False
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily set the TensorFlow logging verbosity.

  Args:
    level: verbosity passed to `tf.logging.set_verbosity` while the body of
      the `with` statement runs.

  Yields:
    None. The verbosity that was active on entry is restored on exit, also
    when the body of the `with` statement raises.
  """
  og_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Without `finally`, an exception in the with-body would leave the
    # temporary verbosity in place for the rest of the session.
    tf.logging.set_verbosity(og_level)

Collecting pip
[?25l  Downloading https://files.pythonhosted.org/packages/de/47/58b9f3e6f611dfd17fb8bd9ed3e6f93b7ee662fb85bdfee3565e8979ddf7/pip-21.0-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 6.9MB/s 
[?25hInstalling collected packages: pip
  Found existing installation: pip 19.3.1
    Uninstalling pip-19.3.1:
      Successfully uninstalled pip-19.3.1
Successfully installed pip-21.0
Collecting git+https://github.com/google-research/text-to-text-transfer-transformer.git
  Cloning https://github.com/google-research/text-to-text-transfer-transformer.git to /tmp/pip-req-build-paaxxk36
  Running command git clone -q https://github.com/google-research/text-to-text-transfer-transformer.git /tmp/pip-req-build-paaxxk36
Collecting mesh-tensorflow[transformer]>=0.1.13
  Downloading mesh_tensorflow-0.1.18-py3-none-any.whl (361 kB)
[K     |████████████████████████████████| 361 kB 6.7 MB/s 
Collecting rouge-score
  Downloading rouge_score-0.0.4-py2.py3-none-any.whl

Instructions for updating:
non-resource variables are not supported in the long term


### Path to the TSV file
This variable maps each split to the path of its TSV file in the bucket. Make sure to insert the correct paths.

In [None]:
# Split name -> TSV file on the bucket. Both splits read the same file here.
nq_tsv_path = dict(
    train='gs://bucket_code_completion/T5_extension/data/code.tsv',
    validation='gs://bucket_code_completion/T5_extension/data/code.tsv',
)

### Preprocessing of the dataset
In this step we preprocess the dataset.  
You have to change the paths to the vocabulary files (*vocab_model_path* and *vocab_path*).


In [None]:
from t5.data import postprocessors as t5_postprocessors
from t5.seqio import Feature,SentencePieceVocabulary


# Set the paths of the SentencePiece model and vocab files.
# NOTE(review): only vocab_model_path is read by the visible cells; vocab_path
# is not referenced in the code shown here.
vocab_model_path = 'gs://bucket_code_completion/T5_extension/code.model'
vocab_path = 'gs://bucket_code_completion/T5_extension/code.vocab'


# Short aliases for the t5 task registry and the TFDS-backed task class.
TaskRegistry = t5.data.TaskRegistry
TfdsTask = t5.data.TfdsTask


def get_default_vocabulary():
  """Return a SentencePiece vocabulary built from `vocab_model_path`.

  The vocabulary is created with 100 extra ids on top of the model's own
  pieces.
  """
  return SentencePieceVocabulary(vocab_model_path, extra_ids=100)

# Feature specs for the task: both sides use the default vocabulary and get an
# EOS token appended; only "inputs" is marked as not required.
DEFAULT_OUTPUT_FEATURES = dict(
    inputs=Feature(
        vocabulary=get_default_vocabulary(),
        add_eos=True,
        required=False,
    ),
    targets=Feature(
        vocabulary=get_default_vocabulary(),
        add_eos=True,
    ),
)

In [None]:
def nq_dataset_fn(split, shuffle_files=True):
  """Load one split of the TSV corpus as a dataset of raw string pairs.

  Args:
    split: key of `nq_tsv_path` selecting the file to read.
    shuffle_files: ignored, because each split consists of a single file
      (presumably kept for the signature t5 expects from a dataset_fn).

  Returns:
    A dataset whose examples are dicts with the keys "input" and "output",
    holding the first and second tab-separated column of each line.
  """
  # We only have one file for each split.
  del shuffle_files

  # Load lines from the text file as examples.
  ds = tf.data.TextLineDataset(nq_tsv_path[split])
  # `record_defaults` supplies the default VALUE of each column (its dtype
  # fixes the column type). Empty strings keep an empty field empty; the
  # previous ["string", "string"] silently replaced empty fields with the
  # literal text "string".
  ds = ds.map(
      functools.partial(tf.io.decode_csv, record_defaults=["", ""],
                        field_delim="\t", use_quote_delim=False),
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

  # Name the two parsed columns.
  ds = ds.map(lambda *ex: dict(zip(["input", "output"], ex)))
  return ds

# Sanity check: print the first five raw examples of the train split.
print("A few raw train examples...")
for ex in tfds.as_numpy(nq_dataset_fn("train").take(5)):
  print(ex)

A few raw train examples...
{'input': b'@Deprecated public String encrypt(long.<extra_id_0> numbers) { return<extra_id_1>(numbers); }', 'output': b'<extra_id_0>..<extra_id_1> encode<extra_id_2>'}
{'input': b'@Deprecated public long[] decrypt(String hash<extra_id_0> { return decode(hash);<extra_id_1>', 'output': b'<extra_id_0>)<extra_id_1> }<extra_id_2>'}
{'input': b'@<extra_id_0> public String<extra_id_1>(String hexa) { return encodeHex(hexa); }', 'output': b'<extra_id_0>Deprecated<extra_id_1> encryptHex<extra_id_2>'}
{'input': b'<extra_id_0>Deprecated public String decryptHex(String hash) {<extra_id_1> decodeHex(hash); }', 'output': b'@<extra_id_0> return<extra_id_1>'}
{'input': b'public String encode<extra_id_0>long... numbers)<extra_id_1> if (<extra_id_2>.length == 0) { return ""; } for (<extra_id_3> number : numbers<extra_id_4> (number < 0) { return ""; } if (number <extra_id_5> MA \xe2\x81\x87 <extra_id_6>NUMBER) { throw<extra_id_7> IllegalArgument<extra_id_8>number can not be gre

In [None]:
def preprocessing(ds):
  """Rename the raw columns to the 'inputs' / 'targets' feature names.

  Args:
    ds: dataset whose examples are dicts with the fields 'input' and
      'output'.

  Returns:
    The result of `ds.map`: examples as dicts with the keys 'inputs' and
    'targets'.
  """

  def to_inputs_and_targets(ex):
    # Each join receives a single-element list, so the text is passed through
    # unchanged.
    source_text = tf.strings.join([ex['input']], separator=' ')
    target_text = tf.strings.join([ex['output']], separator=' ')
    return dict(inputs=source_text, targets=target_text)

  return ds.map(
      to_inputs_and_targets,
      num_parallel_calls=tf.data.experimental.AUTOTUNE)

In [None]:
# Create a new training task.
# The task name is removed from the registry before it is added again
# (presumably so this cell can be re-run without a duplicate-name error).
t5.data.TaskRegistry.remove('pretraining')
t5.data.TaskRegistry.add(
    "pretraining",
    # Loads the raw 'input'/'output' examples for a split.
    dataset_fn=nq_dataset_fn,
    splits=["train", "validation"],
    # Renames the raw columns to 'inputs'/'targets'.
    text_preprocessor=[preprocessing],
    output_features = DEFAULT_OUTPUT_FEATURES,
    metric_fns=[t5.evaluation.metrics.accuracy],
)

<t5.data.dataset_providers.FunctionTask at 0x7f6d9a3031d0>

In [None]:
# Sanity check: fetch the registered task, build its train split with the
# sequence lengths given below, and print the first five examples.
nq_task = t5.data.TaskRegistry.get("pretraining")
ds = nq_task.get_dataset(split="train", sequence_length={"inputs": 256, "targets": 256})
print("A few preprocessed training examples...")
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


A few preprocessed training examples...
{'inputs_pretokenized': b'@Override public int<extra_id_0>getRowCount<extra_id_1> { return ucm == null \xe2\x81\x87 ucm.certs == null \xe2\x81\x87 0 : <extra_id_2>m<extra_id_3>cert<extra_id_4>.size(); }', 'inputs': array([   19,    27,    12,    35,    25,  4837,    15,   111,    15,
         148,    29,  7507,    25,  4837,    15,   111,  1236,    29,
           7,    14,     3,  6020,    87,    40,    30,     3,     2,
           3,  6020,    87,     4,  6299,    22,    40,    30,     3,
           2,   157,    58, 32097,    87,    25,  4837,    15,   111,
          15,     2,    29,  6299,    25,  4837,    15,   111,    15,
           2,    29,     4,   134,    18,     6,     1], dtype=int32), 'targets_pretokenized': b'<extra_id_0> <extra_id_1>()<extra_id_2>uc<extra_id_3>.<extra_id_4>s<extra_id_5>', 'targets': array([32099, 32098,    16,    25,  4837,    15,   111,    15,  7899,
        6020,    25,  4837,    15,   111,    15,     2,    29,   

### Pretraining of the model
You can pretrain the model by running the following two cells.  
Please set the correct paths for the variables *MODEL_DIR* (the directory in which the pretrained model is saved) and *PATH_GIN_FILE* (the gin configuration file for the pre-training).

In [None]:
from mesh_tensorflow.transformer.learning_rate_schedules import learning_rate_schedule_noam

# See https://github.com/google-research/text-to-text-transfer-transformer if you want to scale up the model.
MODEL_SIZE = "small"  

# Directory passed to the model as model_dir; it is created below if needed.
MODEL_DIR = 'gs://bucket_code_completion/T5_extension/pretrained_with_masking'


# Per-size settings, unpacked in the order
# (model_parallelism, train_batch_size, keep_checkpoint_max).
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]


tf.io.gfile.makedirs(MODEL_DIR)

# NOTE(review): TPU_ADDRESS and TPU_TOPOLOGY are only assigned inside the
# `if ON_CLOUD:` setup code, so this cell requires ON_CLOUD to be True.
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    # Same lengths as used when previewing the preprocessed dataset.
    sequence_length={"inputs": 256, "targets": 256},
    learning_rate_schedule = learning_rate_schedule_noam,
    save_checkpoints_steps=5000,
    # The per-size checkpoint limit is applied only when running on the cloud.
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None
)

In [None]:
import gin

# Gin file holding the configuration for the pre-training.
PATH_GIN_FILE = 'gs://bucket_code_completion/T5_extension/pretrain_config/operative_config.gin'
# Number of steps passed to model.train.
TRAIN_STEPS = 200000

# Parse the gin file and run the training while the gin config is unlocked.
with gin.unlock_config():
    gin.parse_config_file(PATH_GIN_FILE)
    model.train("pretraining", steps=TRAIN_STEPS)

INFO:root:system_path_file_exists:gs://bucket_comment_completion/Matteo/pretrain_config/operative_config.gin
ERROR:root:Path not found: gs://bucket_comment_completion/Matteo/pretrain_config/operative_config.gin


INFO:tensorflow:Using config: {'_model_dir': 'gs://bucket_comment_completion/Matteo/pretrained_with_masking', '_tf_random_seed': None, '_save_summary_steps': 100, '_save_checkpoints_steps': None, '_save_checkpoints_secs': None, '_session_config': graph_options {
  rewrite_options {
    disable_meta_optimizer: true
  }
}
cluster_def {
  job {
    name: "worker"
    tasks {
      key: 0
      value: "10.108.201.82:8470"
    }
  }
}
, '_keep_checkpoint_max': 5, '_keep_checkpoint_every_n_hours': 10000, '_log_step_count_steps': None, '_train_distribute': None, '_device_fn': None, '_protocol': None, '_eval_distribute': None, '_experimental_distribute': None, '_experimental_max_worker_delay_secs': None, '_session_creation_timeout_secs': 7200, '_checkpoint_save_graph_def': True, '_service': None, '_cluster_spec': ClusterSpec({'worker': ['10.108.201.82:8470']}), '_task_type': 'worker', '_task_id': 0, '_global_id_in_cluster': 0, '_master': 'grpc://10.108.201.82:8470', '_evaluation_master': 'grpc

  _tokenize, num_parallel_calls=tf.data.experimental.AUTOTUNE)


[1;30;43mStreaming output truncated to the last 5000 lines.[0m
INFO:tensorflow:global_step/sec: 3.80291
INFO:tensorflow:examples/sec: 973.544
INFO:tensorflow:Enqueue next (100) batch(es) of data to infeed.
INFO:tensorflow:Dequeue next (100) batch(es) of data from outfeed.
INFO:tensorflow:loss = 0.0099487305, step = 110600 (26.295 sec)
INFO:tensorflow:global_step/sec: 3.80302
INFO:tensorflow:examples/sec: 973.574
INFO:tensorflow:Enqueue next (100) batch(es) of data to infeed.
INFO:tensorflow:Dequeue next (100) batch(es) of data from outfeed.
INFO:tensorflow:Outfeed finished for iteration (646, 14)
INFO:tensorflow:loss = 0.01574707, step = 110700 (26.294 sec)
INFO:tensorflow:global_step/sec: 3.80307
INFO:tensorflow:examples/sec: 973.586
INFO:tensorflow:Enqueue next (100) batch(es) of data to infeed.
INFO:tensorflow:Dequeue next (100) batch(es) of data from outfeed.
INFO:tensorflow:loss = 0.012023926, step = 110800 (26.295 sec)
INFO:tensorflow:global_step/sec: 3.80302
INFO:tensorflow:ex