In [None]:
%%bash
# Clone the code-t5 repository on the first run only, then bring the checkout
# up to date with origin/master.
if [ ! -d "/content/code-t5" ]; then
  git clone 'https://github.com/bzz/code-t5.git'
fi
cd code-t5/
git pull origin master

In [None]:
# Announce the dependency installation step.
print("Installing dependencies...")
# Colab magic that selects the TensorFlow 2.x runtime.
%tensorflow_version 2.x
# Quietly (-q) install the requirements file (-r) shipped in the cloned repo.
!pip install -qr code-t5/requirements-train.txt

In [None]:
import os

# Root location under which data and models are stored. The `#@param`
# annotations expose these values as Colab form fields.
BASE_DIR = "gs://t5-codex" #@param { type: "string" }
# Reject an empty value or a bare "gs://" prefix.
if not BASE_DIR or BASE_DIR == "gs://":
  raise ValueError("You must enter a BASE_DIR.")
DATA_DIR = os.path.join(BASE_DIR, "data")
MODELS_DIR = os.path.join(BASE_DIR, "models")

# Model size name; also used as the sub-directory of MODELS_DIR.
MODEL_SIZE = "large" #@param["small", "base", "large", "3B", "11B"]
MODEL_DIR = os.path.join(MODELS_DIR, MODEL_SIZE)

# Gates the Colab/TPU/GCS-specific setup performed in later cells.
ON_CLOUD = True

# Number of training steps passed to the training calls below.
TRAIN_STEPS = 30000 #@param {type: "integer"}


In [None]:
import functools
import time
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

import tensorflow.compat.v1 as tf
import tensorflow_datasets as tfds

import t5
import t5.models
import seqio


# Cloud-only setup: detect the TPU, authenticate the Colab user and make the
# resulting credentials available for GCS access.
# Defines TPU_TOPOLOGY and TPU_ADDRESS, which later cells read.
if ON_CLOUD:
  print("Setting up GCS access...")
  import tensorflow_gcs_config
  from google.colab import auth
  # Set credentials for GCS reading/writing from Colab and TPU.
  TPU_TOPOLOGY = "v2-8"
  try:
    tpu = tf.distribute.cluster_resolver.TPUClusterResolver()  # TPU detection
    TPU_ADDRESS = tpu.get_master()
    print('Running on TPU:', TPU_ADDRESS)
  except ValueError as err:
    # Raise an ordinary RuntimeError rather than BaseException, so generic
    # `except Exception` handlers and tooling treat it as a normal error.
    # Chaining keeps the resolver's original message in the traceback.
    raise RuntimeError('ERROR: Not connected to a TPU runtime') from err
  auth.authenticate_user()
  tf.enable_eager_execution()
  tf.config.experimental_connect_to_host(TPU_ADDRESS)
  tensorflow_gcs_config.configure_gcs_from_colab_auth()


# Turn off TF2 behaviors for the rest of the notebook (`tf` here is
# tensorflow.compat.v1).
tf.disable_v2_behavior()

# Improve logging.
from contextlib import contextmanager
import logging as py_logging

if ON_CLOUD:
  # Stop TF log records from propagating to ancestor loggers, and set the
  # root Python logger to INFO.
  tf.get_logger().propagate = False
  py_logging.root.setLevel('INFO')

@contextmanager
def tf_verbosity_level(level):
  """Temporarily set the TF logging verbosity inside a `with` block.

  Args:
    level: verbosity passed to `tf.logging.set_verbosity` for the duration
      of the block.

  Yields:
    None. The verbosity that was active on entry is restored on exit, even
    when the body of the `with` block raises.
  """
  log_level = tf.logging.get_verbosity()
  tf.logging.set_verbosity(level)
  try:
    yield
  finally:
    # Without try/finally an exception in the body would leave the
    # temporary verbosity in place for the rest of the session.
    tf.logging.set_verbosity(log_level)

In [None]:
import sys

# Put the cloned repository at the front of the module search path.
sys.path[:0] = ['/content/code-t5']

In [None]:
# Display the names of the tasks currently registered with seqio.
# NOTE(review): this runs before `import codeT5.tasks` in the next cell, so
# presumably the project's own tasks are not listed yet — confirm intent.
seqio.TaskRegistry.names()

In [None]:
# Imported for its side effects; presumably this registers the project's
# tasks with seqio — verify in codeT5/tasks.py.
import codeT5.tasks

# NOTE(review): the variable is named `py5k_lm` but fetches the
# "py_50stars_2019" task — confirm which task is intended.
py5k_lm = seqio.TaskRegistry.get("py_50stars_2019")
# Validation split with the given per-feature sequence lengths.
ds = py5k_lm.get_dataset(split="validation", sequence_length={"inputs": 128, "targets": 32})
print("A few preprocessed validation examples...")
# Print the first five examples, converted to numpy.
for ex in tfds.as_numpy(ds.take(5)):
  print(ex)

In [None]:
# Public GCS path for T5 pre-trained model checkpoints
BASE_PRETRAINED_DIR = "gs://t5-data/pretrained_models"
# Checkpoint directory for the selected MODEL_SIZE.
# NOTE(review): PRETRAINED_DIR is not referenced by any later cell visible
# here — confirm whether it is still needed.
PRETRAINED_DIR = os.path.join(BASE_PRETRAINED_DIR, MODEL_SIZE)


In [None]:
# Gin config files and extra bindings handed to the parser below
# (no extra bindings are supplied).
gin_file=["models/shared-prefix_lm.gin"]
gin_param=None

# Warn about or reject model sizes that do not fit the Colab resources.
if ON_CLOUD and MODEL_SIZE == "3B":
  tf.logging.warning(
      "The `3B` model is too large to use with the 5GB GCS free tier. "
      "Make sure you have at least 25GB on GCS before continuing."
  )
elif ON_CLOUD and MODEL_SIZE == "11B":
  raise ValueError(
      "The `11B` parameter is too large to fine-tune on the `v2-8` TPU "
      "provided by Colab. Please comment out this Error if you're running "
      "on a larger TPU."
  )

# Set parallelism and batch size to fit on v2-8 TPU (if possible).
# Limit number of checkpoints to fit within 5GB (if possible).
# Tuple layout: (model_parallelism, train_batch_size, keep_checkpoint_max).
model_parallelism, train_batch_size, keep_checkpoint_max = {
    "small": (1, 256, 16),
    "base": (2, 128, 8),
    "large": (8, 64, 4),
    "3B": (8, 16, 1),
    "11B": (8, 16, 1)}[MODEL_SIZE]

# Import the project package so its resources can be located below.
import importlib
importlib.import_module('codeT5')

import gin
import pkg_resources
from t5.models import mesh_transformer


# Let gin resolve config file names relative to the package's "gin" directory.
gin.add_config_file_search_path(
    pkg_resources.resource_filename("codeT5", "gin"))

# Passed as `skip_unknown` so these deprecated references do not abort parsing.
skip_unknown=mesh_transformer.DEPRECATED_GIN_REFERENCES
gin.parse_config_files_and_bindings(
    gin_file, gin_param,
    skip_unknown=skip_unknown
)
  
# We must override this binding explicitly since it is set to a deprecated
# function or class in many existing configs.
gin.bind_parameter("run.vocabulary", mesh_transformer.get_vocabulary())
gin.finalize()


In [None]:
# Make sure the model directory exists before the model is constructed.
tf.io.gfile.makedirs(MODEL_DIR)
# The models from our paper are based on the Mesh Tensorflow Transformer.
# NOTE(review): TPU_ADDRESS and TPU_TOPOLOGY are only assigned when ON_CLOUD
# is true, so this cell raises NameError otherwise.
model = t5.models.MtfModel(
    model_dir=MODEL_DIR,
    tpu=TPU_ADDRESS,
    tpu_topology=TPU_TOPOLOGY,
    model_parallelism=model_parallelism,
    batch_size=train_batch_size,
    sequence_length={"inputs": 512, "targets": 512},
    learning_rate_schedule=0.003,
    save_checkpoints_steps=5000,
    # The checkpoint limit is applied only on cloud runs.
    keep_checkpoint_max=keep_checkpoint_max if ON_CLOUD else None,
    iterations_per_loop=100,
)

In [None]:
pip install -U tensorboard-plugin-profile "cloud-tpu-profiler>=2.3.0"

In [None]:
if ON_CLOUD:
  %reload_ext tensorboard
%tensorboard --logdir="$MODEL_DIR"

In [None]:
# NOTE(review): hard-coded PID left over from an earlier session (presumably
# the TensorBoard process — confirm). On a fresh runtime PID 924 may belong
# to an unrelated process; look the PID up before running this.
!kill 924

In [None]:
# Address noted from an earlier session (presumably a TPU endpoint — confirm).
# Kept as a comment: as bare cell content it is not valid Python and made
# "Run all" stop here.
# 10.126.165.74:8466

In [None]:
# Train the model built above on the "py5k_prefix_lm" mixture/task for
# TRAIN_STEPS steps.
model.train(
    mixture_or_task_name="py5k_prefix_lm",
    steps=TRAIN_STEPS
)

In [None]:
 # Show the current working directory; later `!cd code-t5` commands are relative to it.
 !pwd

# Cache

Cache dataset (depends on Apache Beam)

In [None]:
!pip install apache-beam[gcp] python-snappy

In [None]:
# works only with TextLineDataSource
!cd code-t5 && python -m seqio.scripts.cache_tasks_main \
 --tasks=py_50stars_top5k_2019 \
 --module_import=codeT5.tasks \
 --output_cache_dir='gs://t5-codex/cache' \
 --alsologtostderr

# Train

## Uncached

In [None]:
# no cache, v2-8, model_parallelism = 2
!echo "$TPU_TOPOLOGY"
!cd code-t5/ && python -m t5.models.mesh_transformer_main  \
  --tpu="$TPU_ADDRESS" \
  --model_dir="$MODEL_DIR" \
  --t5_tfds_data_dir="$DATA_DIR" \
  --module_import="codeT5.tasks" \
  --gin_location_prefix="codeT5/gin/" \
  --gin_file="models/shared-prefix_lm.gin" \
  --gin_param="utils.tpu_mesh_shape.model_parallelism = 2" \
  --batch_size=128 \ # default tokens_per_replica=2048, 2048*8/512 = 32 seq/batch is overriden by tokens_per_batch = 65556
  --gin_param="run.train_steps = $TRAIN_STEPS" \
  --gin_param="run.keep_checkpoint_max = 8" \
  --gin_param="utils.tpu_mesh_shape.tpu_topology = '$TPU_TOPOLOGY'" \
  --gin_param="MIXTURE_NAME = 'py5k_prefix_lm'"

# models/shared-prefix_lm.gin
#  --gcp_project="${PROJECT}" \
#  --tpu_zone="${ZONE}" \


In [None]:
# no cache, v2-8, model_parallelism = 1
# Trains the "py5k_prefix_lm" mixture from the command line.
# NOTE(review): train_steps is hard-coded to 16000 here, while the other
# training cells pass $TRAIN_STEPS — confirm this is intentional.
!cd code-t5/ && python -m t5.models.mesh_transformer_main  \
  --tpu="$TPU_ADDRESS" \
  --model_dir="$MODEL_DIR" \
  --t5_tfds_data_dir="$DATA_DIR" \
  --module_import="codeT5.tasks" \
  --gin_location_prefix="codeT5/gin/" \
  --gin_file="models/shared-prefix_lm.gin" \
  --gin_param="utils.tpu_mesh_shape.model_parallelism = 1" \
  --gin_param="run.train_steps = 16000" \
  --gin_param="run.keep_checkpoint_max = 8" \
  --gin_param="utils.tpu_mesh_shape.tpu_topology = '$TPU_TOPOLOGY'" \
  --gin_param="MIXTURE_NAME = 'py5k_prefix_lm'"


## Cached

### Base

Train a 2xBERT-base, 220M-parameter model (Total size: 138M) on the dataset of the top 5k repos with >50 stars (~400M tokens).

In [None]:
#cache, v2-8, model_parallelism = 1
!cd code-t5/ && python -m t5.models.mesh_transformer_main  \
  --tpu="$TPU_ADDRESS" \
  --model_dir="$MODEL_DIR" \
  --t5_tfds_data_dir="$DATA_DIR" \
  --module_import="codeT5.tasks" \
  --additional_task_cache_dirs='$BASE_DIR/cache' \
  --gin_location_prefix="codeT5/gin/" \
  --gin_file="models/shared-prefix_lm.gin" \
  --gin_param="utils.tpu_mesh_shape.model_parallelism = 1" \
  --gin_param="run.train_steps = $TRAIN_STEPS" \
  --gin_param="run.keep_checkpoint_max = 8" \
  --gin_param="utils.tpu_mesh_shape.tpu_topology = '$TPU_TOPOLOGY'" \
  --gin_param="MIXTURE_NAME = 'py_50stars_top5k_2019'" \
  --gin_param="mesh_train_dataset_fn.use_cached = True"


In [None]:
# cache, v2-8, model_parallelism = 2
!cd code-t5/ && python -m t5.models.mesh_transformer_main  \
  --tpu="$TPU_ADDRESS" \
  --model_dir="$MODEL_DIR" \
  --t5_tfds_data_dir="$DATA_DIR" \
  --module_import="codeT5.tasks" \
  --additional_task_cache_dirs='$BASE_DIR/cache' \
  --gin_location_prefix="codeT5/gin/" \
  --gin_file="models/shared-prefix_lm.gin" \
  --gin_param="utils.tpu_mesh_shape.model_parallelism = 2" \
  --gin_param="run.train_steps = $TRAIN_STEPS" \
  --gin_param="run.keep_checkpoint_max = 8" \
  --gin_param="utils.tpu_mesh_shape.tpu_topology = '$TPU_TOPOLOGY'" \
  --gin_param="MIXTURE_NAME = 'py_50stars_top5k_2019'" \
  --gin_param="mesh_train_dataset_fn.use_cached = True"

# utils.run batch_size = tokens_per_replica=2048, 2048*8/512 = 32 seq/batch, wich is overriden by tokens_per_batch = 65556



10.4.42.106:8466

### Large

Train a larger model, 2xBERT-large with 770M parameters (Total size: 436M), on a bigger dataset (2.1B tokens).

In [None]:
# cache, v2-8, model_parallelism = 2
!cd code-t5/ && python -m t5.models.mesh_transformer_main  \
  --tpu="$TPU_ADDRESS" \
  --model_dir="$MODEL_DIR" \
  --t5_tfds_data_dir="$DATA_DIR" \
  --module_import="codeT5.tasks" \
  --additional_task_cache_dirs='$BASE_DIR/cache' \
  --gin_location_prefix="codeT5/gin/" \
  --gin_file="models/shared-prefix_lm.gin" \
  --gin_file="models/bi_bert_large.gin" \
  --gin_param="run.train_steps = $TRAIN_STEPS" \
  --gin_param="run.keep_checkpoint_max = 8" \
  --gin_param="utils.tpu_mesh_shape.tpu_topology = '$TPU_TOPOLOGY'" \
  --gin_param="MIXTURE_NAME = 'py_50stars_2019'" \
  --gin_param="mesh_train_dataset_fn.use_cached = True"


# Evaluate

In [None]:
# Evaluate the "py5k_prefix_lm" mixture on the validation split using the
# eval.gin and beam_search.gin configs.
# eval_checkpoint_step = -1 presumably selects the latest checkpoint — confirm
# against the t5 documentation.
!cd code-t5/ && python -m t5.models.mesh_transformer_main  \
  --tpu="$TPU_ADDRESS" \
  --model_dir="$MODEL_DIR" \
  --t5_tfds_data_dir="$DATA_DIR" \
  --module_import="codeT5.tasks" \
  --gin_location_prefix="codeT5/gin/" \
  --gin_file="models/shared-prefix_lm.gin" \
  --gin_file="eval.gin" \
  --gin_file="beam_search.gin" \
  --gin_param="utils.tpu_mesh_shape.tpu_topology = '$TPU_TOPOLOGY'" \
  --gin_param="split = 'validation'" \
  --gin_param="eval_checkpoint_step = -1" \
  --gin_param="MIXTURE_NAME = 'py5k_prefix_lm'"


In [None]:
!cd code-t5/ && python -m t5.models.mesh_transformer_main  \
  --tpu="$TPU_ADDRESS" \
  --model_dir="$MODEL_DIR" \
  --t5_tfds_data_dir="$DATA_DIR" \
  --module_import="codeT5.tasks" \
  --gin_location_prefix="codeT5/gin/" \
  --gin_file="models/shared-prefix_lm.gin" \
  --gin_file="models/bi_bert_large.gin" \
  --gin_file="perplexity_eval.gin" \
  --gin_file="beam_search.gin" \
  --gin_param="utils.tpu_mesh_shape.tpu_topology = '$TPU_TOPOLOGY'" \
  --gin_param="split = 'validation'" \
  --gin_param="eval_checkpoint_step = -1" \
  --gin_param="MIXTURE_NAME = 'py_50stars_2019'" \
  --additional_task_cache_dirs='$BASE_DIR/cache' \
  --gin_param="mesh_eval_dataset_fn.use_cached = True"



# Export


In [None]:
# GCP project and zone passed to the export command (--gcp_project / --tpu_zone).
PROJECT="data-analytics-experiments"
# Zone names have the form <region>-<letter>; "europe-west4a" (missing the
# hyphen) is not a valid zone identifier.
ZONE="europe-west4-a"
# Exported SavedModels are written beneath the model directory.
EXPORT_DIR=os.path.join(MODEL_DIR, "export")

In [None]:
# Export the model in MODEL_DIR to EXPORT_DIR via the model API
# ("export_predict" mode).
# NOTE(review): unlike the training cells, no --tpu address or gin files are
# passed here — confirm the defaults are what is wanted.
!cd code-t5/ && python -m t5.models.mesh_transformer_main \
  --gcp_project="$PROJECT" \
  --tpu_zone="$ZONE" \
  --model_dir="$MODEL_DIR" \
  --module_import="codeT5.tasks" \
  --use_model_api \
  --mode="export_predict" \
  --export_dir="$EXPORT_DIR"

# Predict

In [None]:
# Colab magic that selects the TensorFlow 2.x runtime.
%tensorflow_version 2.x
# tensorflow-text is imported by the next cell to run the exported model.
!pip install tensorflow-text
# Authenticate the Colab user (presumably needed to read the export from GCS).
from google.colab import auth
auth.authenticate_user()

In [None]:
import tensorflow as tf
import tensorflow_text  # Required to run exported model.

# Choose the entry of EXPORT_DIR whose name is the maximum under string
# comparison (presumably the newest timestamped export — confirm naming).
saved_model_path = os.path.join(EXPORT_DIR, max(tf.io.gfile.listdir(EXPORT_DIR)))

def load_predict_fn(model_path):
  """Load the SavedModel at `model_path` and return a prediction callable.

  The loading strategy depends on whether TensorFlow is executing eagerly.

  Args:
    model_path: location of the exported SavedModel.

  Returns:
    A one-argument callable that takes the model inputs and returns the
    "outputs" entry of the "serving_default" signature.
  """
  if not tf.executing_eagerly():
    print("Loading SavedModel in tf 1.x graph mode.")
    tf.compat.v1.reset_default_graph()
    sess = tf.compat.v1.Session()
    meta_graph_def = tf.compat.v1.saved_model.load(sess, ["serve"], model_path)
    signature_def = meta_graph_def.signature_def["serving_default"]
    print("Input name: " + str(signature_def.inputs))

    def predict_in_session(x):
      # Feed the inputs by tensor name and fetch the output tensor by name.
      return sess.run(
          fetches=signature_def.outputs["outputs"].name,
          feed_dict={signature_def.inputs["inputs"].name: x},
      )

    return predict_in_session

  print("Loading SavedModel in eager mode.")
  imported = tf.saved_model.load(model_path, ["serve"])

  def predict_eagerly(x):
    # Call the serving signature on a constant tensor and convert to numpy.
    serving_fn = imported.signatures['serving_default']
    return serving_fn(tf.constant(x))['outputs'].numpy()

  return predict_eagerly

# Build the prediction callable for the selected export; `answer` below uses it.
predict_fn = load_predict_fn(saved_model_path)

In [None]:
def answer(question):
  """Return the model's prediction for a single prompt, decoded as UTF-8.

  Args:
    question: one input string; it is sent to `predict_fn` as a batch of one.

  Returns:
    The first prediction, decoded from bytes to `str`.
  """
  predictions = predict_fn([question])
  first_prediction = predictions[0]
  return first_prediction.decode('utf-8')

# Print the model's completion for a few sample prompts.
# NOTE(review): the "Ċ" in the second prompt is presumably a tokenizer-level
# newline marker — confirm it is intended rather than a literal "\n".
for question in ["password = ",
                  "def __main__():Ċ  ",
                  "import",
                  "a"]:
    print(answer(question))

In [None]:
import tensorflow as tf

# Disable device-placement logging.
tf.debugging.set_log_device_placement(False)
# Report how many GPUs TensorFlow can see.
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

In [None]:
# List the GPUs visible to the NVIDIA driver.
!nvidia-smi -L

# Docker image

In [None]:
%%bash
# Build a TensorFlow Serving Docker image with the exported SavedModel baked in.
# The commands are shell, not Python, so the cell is run through %%bash.
export MODEL_NAME="py5k_prefix_lm"
export SAVED_MODEL_PATH="${PWD}/mtf-model-export"

sudo systemctl start docker

# The export is a directory, so it must be copied recursively (-r). Creating
# the destination first makes gsutil place the timestamped version directory
# inside it.
mkdir -p "$SAVED_MODEL_PATH"
gsutil cp -r 'gs://t5-codex/models/large/export/1630574205' "$SAVED_MODEL_PATH"

# Download the TensorFlow Serving Docker image and repo:
docker pull tensorflow/serving:nightly

# First, run a serving image as a daemon:
docker run -d --name serving_base tensorflow/serving:nightly

# Next, copy the `SavedModel` to the container's model folder:
docker cp "$SAVED_MODEL_PATH" "serving_base:/models/$MODEL_NAME"

# Now, commit the container that's serving the model:
docker commit --change "ENV MODEL_NAME $MODEL_NAME" serving_base "$MODEL_NAME"

# Finally, save the image to a tar file:
docker save "$MODEL_NAME" -o "$MODEL_NAME.tar"

# stop `serving_base`:
docker kill serving_base

In [None]:
%%bash
# Smoke-test the serving image: start it, send one REST predict request, stop it.
# This cell runs in its own bash process, so MODEL_NAME is defined here
# rather than inherited from another cell.
MODEL_NAME="py5k_prefix_lm"

# Run detached (-d) instead of backgrounding with `&`, so the cell is not
# left attached to the container's output.
docker run -d --rm -p 8501:8501 --name "${MODEL_NAME}-server" "${MODEL_NAME}"

# The server needs a moment to load the model; retry instead of failing on
# the first refused connection.
curl --retry 10 --retry-delay 3 --retry-connrefused \
    -d '{"inputs": ["import tensorflow "]}' \
    -X POST "http://localhost:8501/v1/models/${MODEL_NAME}:predict"

docker stop "${MODEL_NAME}-server"

In [None]:
# 18.04 LTS https://docs.docker.com/engine/install/ubuntu/

!sudo apt-get update
!sudo apt-get install \
    apt-transport-https \
    ca-certificates \
    curl \
    gnupg \
    lsb-release

!curl -fsSL https://download.docker.com/linux/ubuntu/gpg | sudo gpg --dearmor -o /usr/share/keyrings/docker-archive-keyring.gpg
!echo \
  "deb [arch=amd64 signed-by=/usr/share/keyrings/docker-archive-keyring.gpg] https://download.docker.com/linux/ubuntu \
  $(lsb_release -cs) stable" | sudo tee /etc/apt/sources.list.d/docker.list > /dev/null

!sudo apt-get update
!sudo apt-get install docker-ce docker-ce-cli containerd.io

In [None]:
# Stop the Docker daemon.
# NOTE(review): the next cell runs a container, which needs the daemon
# running — confirm whether `start` was intended here.
!sudo service docker stop


In [None]:
# Check the Docker installation by running the hello-world image.
!sudo docker run hello-world
