This notebook uses [GPT-Neo](https://github.com/EleutherAI/gpt-neo) by [EleutherAI](https://www.eleuther.ai) to fine-tune the model and predict a batch of instances.

# Product Description Generation

If a new batch is being generated: 

1. Make sure you have prepared the dataset with the "prepare" notebook

2. Make sure the fine tuned model is uploaded to the bucket


When `gcloud init` (run in the next cell) prompts you, choose the following options:
1. re-initialize this configuration [1]
2. the google account with the cloud storage [1]
3. gpt project [10]
4. No [n]

In [None]:
# Authenticate this Colab runtime with a Google account.
from google.colab import auth
auth.authenticate_user()
#!gcloud auth login
# Interactive gcloud setup; answer its prompts with the options listed above.
!gcloud init

Mount the Google Drive that contains the Excel files; the generated descriptions will also be stored there.


In [None]:
# Mount Google Drive at /content/drive; later cells read the prompts from and
# write the generated text to /content/drive/MyDrive/dataset/.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import os
# Colab magic selecting the TensorFlow 2.x runtime.
%tensorflow_version 2.x
# Fetch the GPT-Neo code and make the repository the working directory; later
# cells use repo-relative paths (configs/, pretrained/) and import its modules.
!git clone https://github.com/EleutherAI/gpt-neo
%cd gpt-neo
!pip3 install -q -r requirements.txt
# Placeholders: pretrained_model is assigned further down before it is used;
# dataset is only consulted when no dataset name is set explicitly.
pretrained_model = None
dataset = None


In [None]:
# Pin tensorflow-gcs-config and tensorflow-text to fixed versions
# (presumably the ones matching the Colab TPU runtime -- TODO confirm).
!pip install -U tensorflow-gcs-config==2.1.3
!pip install -q t5 tensorflow-text==2.3

In [None]:
# Cloud Storage bucket of this project; the bare "gs://<bucket>" base used for
# the model path is derived from this value further down.
path_to_cloud_bucket = 'gs://test-gpt-j/' 

# Configs
Dataset configs

In [None]:
%%writefile configs/dataset_configs/prod_desc_gpt_j.json

{
  "path": "gs://test-gpt-j/datasets/prod_desc_gpt_j_*.tfrecords",
  "eval_path": "",
  "n_vocab": 50256,
  "tokenizer_is_pretrained": true,
  "tokenizer_path": "gpt2",
  "eos_id": 50256,
  "padding_id": 50257
}


Model configs

In [None]:
%%writefile configs/GPT3_XL.json

{
    "n_head": 16,
    "n_vocab": 50257,
    "embed_dropout": 0,
    "lr": 0.0002,
    "lr_decay": "cosine",
    "warmup_steps": 3000,
    "beta1": 0.9,
    "beta2": 0.95,
    "epsilon": 1e-8,
    "opt_name": "adam",
    "weight_decay": 0,
    "train_batch_size": 256,
    "attn_dropout": 0,
    "train_steps": 600000,
    "eval_steps": 0,
    "predict_steps": 1,
    "res_dropout": 0,
    "eval_batch_size": 4,
    "predict_batch_size": 1,
    "iterations": 100,
    "n_embd": 2048,
    "datasets": [["prod_desc_gpt_j", null, null, null]],
    "model": "GPT",
    "model_path": "gs://test-gpt-j/",
    "n_ctx": 2048,
    "n_layer": 24,
    "scale_by_depth": true,
    "scale_by_in": false,
    "attention_types" :  [[["global", "local"],12]],
    "mesh_shape": "x:4,y:2",
    "layout": "intermediate_expanded:x,heads:x,vocab:n_vocab,memory_length:y,embd:y",
    "activation_function": "gelu",
    "recompute_grad": true,
    "gradient_clipping": 1.0,
    "tokens_per_mb_per_replica": 2048,
    "precision": "bfloat16"
}

# Fine-tuned model

In [None]:
bucket_base = "gs://" + path_to_cloud_bucket.replace('gs://', '').split('/')[0]
pretrained_model = 'GPT3_XL'
!mkdir pretrained
!gsutil -m cp gs://test-gpt-j/GPT3_XL/config.json pretrained
path_to_local_weights = f"/content/gpt-neo/pretrained/"

In [None]:
import json
from pprint import pprint

# Settings merged into the model config that is rewritten below.
path_to_model = ""  # leave empty to use <bucket_base>/<pretrained_model>
batch_size = 8
dset = "prod_desc_gpt_j"  # dataset name; leave empty to fall back to `dataset`
mesh_shape = "x:4,y:2"
train_steps = 1000
steps_per_checkpoint = 500
# Added to train_steps to form the config's "train_steps"; presumably the step
# count at which the released checkpoint stops -- TODO confirm.
start_step = 400000 if pretrained_model == "GPT3_2-7B" else 362000

if path_to_model == "":
  path_to_model = f'{bucket_base.strip("/")}/{pretrained_model}'
print(f'MODEL PATH: {path_to_model}\n')

# Fallbacks apply only when no dataset name was given explicitly. The None
# check comes first: None also differs from "Sampling_Only", so testing that
# first would assign None to dset and never reach the "pile" default.
if dset == "":
  if dataset is None:
    dset = "pile"
  elif dataset != "Sampling_Only":
    dset = dataset

def pad_to_multiple_of(n, mult):
  """
  Round n up to the next multiple of mult.

  n is returned unchanged whenever n % mult is not positive, e.g. when n is
  already a multiple of mult.
  """
  remainder = n % mult
  return n + mult - remainder if remainder > 0 else n

# Load the downloaded model config and show it before any change.
with open(f'{path_to_local_weights}config.json', 'r') as f:
  data = json.load(f)
pprint(data)

# An explicit dataset name wins; otherwise keep the datasets listed in the
# downloaded config.
if dset != "":
  dset_val = [[dset, None, None, None]]
else:
  dset_val = data["datasets"]

# Values from the settings cell that override the downloaded config.
mods = dict(
    mesh_shape=mesh_shape,
    layout="intermediate_expanded:x,heads:x,memory_length:y,embd:y",
    model_path=path_to_model,
    datasets=dset_val,
    train_steps=start_step + train_steps,
    eval_steps=0,
    train_batch_size=batch_size,
    predict_batch_size=batch_size,
)
data.update(mods)
print('\n--->\n')
pprint(data)

# Write the merged config to configs/<pretrained_model>.json.
with open(f'configs/{pretrained_model}.json', 'w') as outfile:
  json.dump(data, outfile, indent=2)

### Sample from your model

Once the pretrained model (fine tuned) is in the bucket, sample from it.

In [None]:
%cd ..
!mkdir drive/MyDrive/dataset/gen/
%cd gpt-neo

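Copy the test set to gpt-neo/test/. Note: the final cell calls `infer_all` on `/content/drive/MyDrive/dataset/test/`, so the prompt files must be available there (or adjust that path).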

In [None]:
from data.encoders import encode
from functools import partial
import mesh_tensorflow as mtf
import tensorflow.compat.v1 as tf
from tensorflow.python.tpu import tpu_config, tpu_estimator
from tensorflow_estimator.python.estimator import estimator as estimator_lib
from utils import save_config, expand_attention_types_params, yes_or_no, remove_gs_or_filepath, setup_logging, \
    check_dataset
from inputs import sequential_input, mlm_sample_text, generic_text
from export import export_model
from model_fns import model_fn
from data.encoders import fetch_encoder
from configs import fetch_model_params
from tasks import task_descriptors
import argparse
import json
import numpy as np
import gc
import sys

In [None]:
def pred_input(params, enc=None,
               path_to_prompt=""):
    """Build the single-element prediction dataset for one prompt.

    The prompt text is read from ``path_to_prompt`` (a built-in sample text is
    used when the path is empty) and encoded with ``encode(enc, text)``. The
    token sequence is then brought to exactly ``params["n_ctx"]`` tokens:
    longer prompts keep only their last ``n_ctx`` tokens, shorter ones are
    padded at the end with ``params["padding_id"]``.

    Args:
        params: mapping providing "n_ctx", "padding_id" and "batch_size".
        enc: encoder object handed through to ``encode``.
        path_to_prompt: path of a text file with the prompt; "" selects the
            built-in sample prompt.

    Returns:
        The dataset created by ``tf.data.Dataset.from_tensors`` from the
        tokens broadcast to ``[batch_size, n_ctx]`` (features only, no labels).
    """
    if path_to_prompt == "":
        text = "In a shocking finding, scientists discovered a herd of unicorns living in a remote, " \
               "previously unexplored valley, in the Andes Mountains. Even more surprising to the " \
               "researchers was the fact that the unicorns spoke perfect English."
    else:
        # Context manager so the prompt file is closed right after reading;
        # this function runs once per prompt file in a long batch.
        with open(path_to_prompt, "r") as prompt_file:
            text = prompt_file.read()
    tokens = encode(enc, text)

    n_ctx = params["n_ctx"]
    if len(tokens) > n_ctx:
        # Too long: keep the trailing n_ctx tokens.
        tokens = tokens[len(tokens) - n_ctx:]
    if len(tokens) < n_ctx:
        tokens = tf.pad(tokens, [[0, n_ctx - len(tokens)]], constant_values=params["padding_id"])

    t = tf.broadcast_to(tokens, [params["batch_size"], n_ctx])
    dataset = tf.data.Dataset.from_tensors(t)

    # Drop the local references and collect eagerly to keep memory low across
    # the many calls of a batch run.
    del t
    del tokens
    gc.collect()
    return dataset

In [None]:
def handle_pred_output(predictions, enc, params, out_name="test"):
    """Decode the first prediction and write its text to ``out_name``.

    The "outputs" token ids of the first item of ``predictions`` are cut at
    the first ``params['eos_id']`` and then at the first
    ``params['padding_id']``, decoded with ``enc.decode`` and written to the
    file. Later predictions are ignored. The file is opened before
    ``predictions`` is consumed, so it is created even if nothing is written.

    Args:
        predictions: iterable of mappings with an "outputs" array of token ids.
        enc: object with a ``decode(token_ids)`` method.
        params: mapping providing 'eos_id' and 'padding_id'.
        out_name: path of the file to write.
    """
    def _cut_before(token_ids, marker):
        # np.argmax yields 0 both when marker is absent and when it sits at
        # position 0; in either case the sequence is left untouched.
        first_hit = np.argmax(token_ids == marker)
        return token_ids[:first_hit] if first_hit > 0 else token_ids

    with tf.gfile.Open(out_name, "w") as out_file:
        for prediction in predictions:
            # remove eos + padding ids from output
            token_ids = _cut_before(prediction["outputs"], params['eos_id'])
            token_ids = _cut_before(token_ids, params['padding_id'])
            out_file.write(enc.decode(token_ids))
            # only the first prediction is used
            break

    return


In [None]:
def infer(path,name):
    """Generate text for one prompt file and store it in the Drive output folder.

    Loads the parameters of the globally selected ``pretrained_model``, builds
    a TPUEstimator configured for prediction, feeds it the prompt stored at
    ``path`` through ``pred_input`` and hands the predictions to
    ``handle_pred_output``, which writes the decoded text to
    ``/content/drive/MyDrive/dataset/gen/<name>``.

    Args:
        path: path of the text file that holds the prompt.
        name: file name of the generated text inside the output folder.

    Side effects:
        Saves the model config under ``params["model_path"]``. The Keras
        session and the default TF graph are reset on exit, also when
        prediction raises, so that a following call starts from a clean state.
    """
    tf.disable_v2_behavior()

    steps_per_checkpoint = 500

    # Read params of the globally selected model
    params = fetch_model_params(pretrained_model)

    # Fetch encoder per params; the same encoder encodes the prompt and
    # decodes the prediction
    encoder = fetch_encoder(params)
    pred_input_fn = partial(pred_input, path_to_prompt=path, enc=encoder)

    # Save config to logdir for experiment management
    save_config(params, params["model_path"])

    # Add to params: auto_layout, auto_layout_and_mesh_shape, use_tpu, num_cores
    mesh_shape = mtf.convert_to_shape(params["mesh_shape"])
    params["num_cores"] = mesh_shape.size
    params["auto_layout"] = True
    params["auto_layout_and_mesh_shape"] = True
    params["use_tpu"] = True
    params["gpu_ids"] = None
    params["steps_per_checkpoint"] = steps_per_checkpoint
    # Expand attention types param
    params["attention_types"] = expand_attention_types_params(params["attention_types"])
    assert len(params["attention_types"]) == params["n_layer"]  # Assert that the length of expanded list = num layers
    params["predict_batch_size"] = params.get("predict_batch_size", 1)  # Default to 1
    params["predict"] = True
    params['model'] = params.get("model", "GPT") # Default model selection to GPT since it's the only option for now
    params["export"] = False
    # Set sampling parameters
    params["sampling_use_entmax"] = False

    # Sample quality of MoE models suffers when using the faster sampling method, so default to slow_sampling if
    # moe layers are present
    params["slow_sampling"] = params["moe_layers"] is not None

    # Initialise any eval tasks named in the params (their init_fn receives params)
    eval_tasks = params.get("eval_tasks", [])
    for t in eval_tasks:
        assert t in task_descriptors, f"Eval task '{t}' is not known"
        task_descriptors[t]["init_fn"](params)

    # Set up TPUs and Estimator
    tpu_cluster_resolver = tf.distribute.cluster_resolver.TPUClusterResolver() if params["use_tpu"] else None

    config = tpu_config.RunConfig(
        cluster=tpu_cluster_resolver,
        model_dir=params["model_path"],
        save_checkpoints_steps=None,  # Disable the default saver
        save_checkpoints_secs=None,  # Disable the default saver
        log_step_count_steps=params["iterations"],
        save_summary_steps=params["iterations"],
        tpu_config=tpu_config.TPUConfig(
            num_shards=mesh_shape.size,
            iterations_per_loop=params["iterations"],
            num_cores_per_replica=1,
            per_host_input_for_training=tpu_config.InputPipelineConfig.BROADCAST))

    estimator = tpu_estimator.TPUEstimator(
        use_tpu=params["use_tpu"],
        model_fn=model_fn,
        config=config,
        train_batch_size=params["train_batch_size"],
        eval_batch_size=params["train_batch_size"],
        predict_batch_size=params["predict_batch_size"],
        params=params)

    predictions = None
    try:
        predictions = estimator.predict(input_fn=pred_input_fn)
        out = "/content/drive/MyDrive/dataset/gen/"+name
        handle_pred_output(predictions, encoder, params, out_name=out)
    finally:
        # Drop every reference to the estimator state before collecting, then
        # reset session and graph; done in `finally` so a failed prediction
        # does not leave stale graph state behind.
        del predictions
        del estimator
        del pred_input_fn
        del encoder
        del mesh_shape
        gc.collect()
        tf.keras.backend.clear_session()
        tf.reset_default_graph()

    return


In [None]:
def infer_all(dir, checkpoint_path="/content/drive/MyDrive/dataset/checkpoint.txt"):
  """Run ``infer`` on every not-yet-processed prompt file in ``dir``.

  File names already listed in the checkpoint file are skipped. After each
  successful ``infer`` call the file name is appended to the checkpoint file,
  so an interrupted batch resumes where it stopped.

  Args:
      dir: folder with the prompt files (sub-folders are ignored); works with
          or without a trailing slash. The name shadows the builtin but is
          kept for compatibility with existing callers.
      checkpoint_path: text file with one processed file name per line; it is
          created on first write and may be missing on the first run.
  """
  # A missing checkpoint file simply means nothing has been generated yet.
  if os.path.isfile(checkpoint_path):
    with open(checkpoint_path, "r") as f:
      generated = set(f.read().split('\n'))
  else:
    generated = set()

  # Sorted so the processing order is the same on every run.
  to_be_gen = [
      path for path in sorted(os.listdir(dir))
      if os.path.isfile(os.path.join(dir, path)) and path not in generated
  ]

  for path in to_be_gen:
    infer(os.path.join(dir, path), path)
    # Record progress right after each file so a crash loses at most one result.
    with open(checkpoint_path, "a") as f:
      f.write(f"{path}\n")
  return

In [None]:
import time

# Run the whole batch from the Drive test folder and report the wall-clock time.
start = time.time()
infer_all("/content/drive/MyDrive/dataset/test/")
elapsed = time.time() - start
print(f"All done in {elapsed}s")

## Warning
The results will be deleted from the drive upon running the model on another dataset.