# Prompt Tuning and Inference
Credit to corolla-johnson and the mkultra library for much of the prompt tuning code.

This should run on Google Colab. Functionality cannot be guaranteed when run locally.


In [None]:
#@title Colab setup
import torch
colab = 'google.colab' in str(get_ipython())
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

if colab:
    !nvidia-smi
    gpu_type = torch.cuda.get_device_name(0)

# Setup for Colab only
if colab:
    !pip install transformers==4.21.0
    !pip install git+https://github.com/corolla-johnson/mkultra.git#egg=mkultra --log PIP_LOG
    !pip install accelerate nvidia-ml-py3
    !pip install gdown
    !pip install datasets
    !pip install tqdm

# If on Colab, mount Google Drive first
if colab:
    from google.colab import drive
    drive.mount('/content/drive')


In [None]:
#@title Load tokenizer
from transformers import GPT2TokenizerFast
import torch
# Load the pretrained "gpt2" fast tokenizer. Later cells call
# tokenizer.encode(...) on the initial prompt and on the training text.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")

Downloading vocab.json:   0%|          | 0.00/0.99M [00:00<?, ?B/s]

Downloading merges.txt:   0%|          | 0.00/446k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.29M [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

In [None]:

#@title Training parameters

# Text whose tokens provide the initial value of the soft prompt.
# Any number of tokens works; make sure it's the number you intend.
initial_prompt = "A social media reddit comment from a user that expresses and showcases a great and overwhelming feeling of amusement:"

initial_prompt_ids = tokenizer.encode(initial_prompt)
print(f"Initial prompt length: {len(initial_prompt_ids)} tokens")

# Name of the soft prompt project.
sp_name = 'amusement'

# Name and family of the model being tuned.
# (The name becomes part of the project directory and checkpoint file names.)
model_name = 'gpt2-large'
model_type = 'gpt2'

# Model directory or Hugging Face model name to load.
# 'gpt2-large' downloads the GPT-2 Large checkpoint from Hugging Face as a
# baseline (the 1.5B-parameter GPT-2 checkpoint is published as 'gpt2-xl').
model_dir = 'gpt2-large'

# Path to the text file used for training.
# A file uploaded to Google Drive also works, e.g.
# text_path = "/content/drive/MyDrive/amusement.txt"
text_path = "amusement.txt"

# Project directory; the training-related files are stored here.
project_dir = f"/soft_prompts/{sp_name}-{model_name}/"

# Length of each training block, in tokens.
block_size = 200

# Save a checkpoint every this many steps.
checkpoint_interval = 20

# Run an evaluation every this many steps.
eval_interval = 5

# Number of blocks used for each evaluation.
eval_blocks = 4

# Adafactor hyperparameters
optimizer_params = dict(
    lr=1e-3,                # Fixed learning rate, consider 1e-4 to 1e-3
    beta1=0.0,              # 1st momentum
    decay_rate=-0.3,        # 2nd momentum decay schedule (lower is slower)
    weight_decay=1e-6,      # Weight decay
    scale_parameter=False,  # Update scaling, recommend False
    relative_step=False,    # Built-in LR scheduler, recommend False
)

# LR scheduler parameters
scheduler_params = dict(
    num_warmup_steps=10,
    num_cycles=30,
    num_training_steps=3000,
)

# Gradient accumulation: forward/backward passes per optimizer step.
base_acc_steps = 16
# Number of steps after which the accumulation count doubles (0 = constant).
acc_doubling_rate = 0
# Stop once this many evaluations pass without improvement (0 = never stop).
plateau_steps = 200

In [None]:
#@title Load model

from mkultra.tuning import GPT2PromptTuningLM

# Load the model only once per session: re-running this cell keeps the
# `model` that already exists instead of loading another copy.
if 'model' not in globals():
    if model_type == 'gpt2':
        # Half-precision weights, placed on the CUDA device.
        model = GPT2PromptTuningLM.from_pretrained(model_dir).half().to("cuda")
    else:
        # Raising a plain string is itself a TypeError in Python 3, which
        # would hide this message; raise a real exception instead.
        raise ValueError(f"Invalid model type: {model_type!r}")

In [None]:
#@title Initialize project
#@markdown This will load the latest checkpoint if the project directory already exists.

from mkultra.soft_prompt import SoftPrompt
from transformers import Adafactor
import os


def filename_for_checkpoint(step):
    """Return the checkpoint file name used for training step *step*."""
    return f"{sp_name}-{model_name}-step-{step}.json"


loaded_sp = None
project_files = None

# Create the project directory, or reuse it when it is already there.
try:
    os.makedirs(project_dir)
except FileExistsError:
    print(f"Found project directory at {project_dir}")
else:
    print(f"Created project directory at {project_dir}")

# Checkpoints are the files whose name carries a "-step-" marker.
project_files = os.listdir(project_dir)
checkpoint_files = [entry for entry in project_files if '-step-' in entry]

if checkpoint_files:
    # The step number sits between the last "-step-" (6 characters) and the
    # trailing ".json" (5 characters).
    highest_step = max(
        int(entry[entry.rfind('-step-') + 6:-5]) for entry in checkpoint_files
    )
    latest_checkpoint = os.path.join(project_dir, filename_for_checkpoint(highest_step))
    loaded_sp = SoftPrompt.from_file(latest_checkpoint)
    print(f"Loading latest checkpoint: {highest_step}")
else:
    print("No checkpoints found")

In [None]:
#@title Process dataset
#@markdown This will load an existing set
#@markdown of tokens if present in the project directory.

import json
import math
import os

text_tokenized = None
tokens_path = os.path.join(project_dir,"tokens.json")

# Reuse a cached tokenization from the project directory when there is one.
try:
    with open(tokens_path, 'r', encoding='utf-8') as file:
        text_tokenized = json.load(file)
    print("Loaded existing tokens.json file")
except FileNotFoundError:
    print("No tokens.json exists, creating it...")

# Otherwise tokenize the training text and cache the result.
if text_tokenized is None:
    with open(text_path, 'r', encoding='utf-8') as file:
        text = file.read()
    text_tokenized = tokenizer.encode(text)

    # 'w' rather than 'x': this branch is also reached when an existing
    # cache file holds JSON null, and exclusive creation would fail then.
    with open(tokens_path, 'w', encoding='utf-8') as file:
        json.dump(text_tokenized, file)

text_length = len(text_tokenized)
num_blocks = math.ceil(text_length/block_size)

print(f"Length of text: {len(text_tokenized)} tokens")
print(f"Number of blocks: {num_blocks}, each {block_size} tokens")

# Partition tokens into consecutive blocks of block_size tokens; the last
# block holds whatever remains and may be shorter.
blocks = [
    text_tokenized[start:start + block_size]
    for start in range(0, text_length, block_size)
]

block_order_path = os.path.join(project_dir, "block_order.json")

# Reuse a saved block order when there is one.
block_order = None
try:
    with open(block_order_path, 'r', encoding='utf-8') as file:
        block_order = json.load(file)
    print("Loaded existing block_order.json file")
except FileNotFoundError:
    print("No block_order.json exists, creating it...")

# A saved order is only usable when it is a permutation of the current block
# indices. It stops being one when the number of blocks changes (for example
# after editing block_size), and the training cell indexes `blocks` with it.
if block_order is not None:
    order_is_valid = (
        isinstance(block_order, list)
        and len(block_order) == num_blocks
        and set(block_order) == set(range(num_blocks))
    )
    if not order_is_valid:
        print("Saved block_order.json does not match the current blocks, recreating it...")
        block_order = None

if block_order is None:
    block_order = [*range(num_blocks)]

    with open(block_order_path, 'w', encoding='utf-8') as file:
        json.dump(block_order, file)

In [None]:
#@title Initialize soft prompt in model
#@markdown If a checkpoint is present in the project folder, it is used.
if loaded_sp is not None:
    # Resume from the checkpoint: restore the prompt together with the step
    # counter and evaluation loss recorded in its metadata.
    model.set_soft_prompt(loaded_sp)
    checkpoint_meta = loaded_sp._metadata
    sp_step = checkpoint_meta['step']
    eval_loss = checkpoint_meta['loss']
else:
    # Fresh start: build the soft prompt from the initial prompt text.
    initial_sp = SoftPrompt.from_string(initial_prompt, model, tokenizer)
    print(f"Initial prompt length: {len(initial_sp)}")
    model.set_soft_prompt(initial_sp)

    # Step counter starts at zero; 100 is a placeholder loss that stands in
    # until the first evaluation runs.
    sp_step, eval_loss = 0, 100

In [None]:
# Configure number of steps to train for.
# One step is (acc_steps) forward passes.
# The value is read from scheduler_params, so the training loop and the LR
# scheduler are configured with the same step count.
num_training_steps = scheduler_params['num_training_steps']

In [None]:
from transformers import AdamW, Adafactor
import transformers

# Only the soft-prompt parameters are handed to the optimizer.
optimizer_params.update(params=[model.get_soft_params()])
optimizer = Adafactor(**optimizer_params)
# Record the step the soft prompt is currently at (0 for a fresh run).
optimizer.state['step'] = sp_step

# Cosine schedule with hard restarts and warmup, driving the optimizer above.
scheduler_params.update(optimizer=optimizer)
scheduler = transformers.get_cosine_with_hard_restarts_schedule_with_warmup(
    **scheduler_params
)

In [None]:
#@title Train the soft prompt

from tqdm.notebook import tqdm
import random
import torch
import math

# Start from an empty CUDA cache and make sure the model is on the device.
torch.cuda.empty_cache()
model.to(device)

# Evaluation losses are appended to this CSV during training.
loss_log_path = os.path.join(project_dir, "loss_log.csv")

# Plateau-detection state.
best_eval = float('inf')
evals_since_last_improvement = 0

optimizer.state['step'] = sp_step
bar = tqdm(total=num_training_steps)

# Fix eval order: seeding before the shuffle makes the evaluation blocks the
# same on every run. (The seed applies to the shared `random` module state.)
random.seed(1234)
eval_order = list(range(num_blocks))
random.shuffle(eval_order)

# Function for gradient accumulation scheduling
def get_acc_steps(sp_step):
    """Return the number of accumulation passes to run for step *sp_step*.

    When ``acc_doubling_rate`` is 0 the count is always ``base_acc_steps``.
    Otherwise it starts at ``base_acc_steps`` and doubles every
    ``acc_doubling_rate`` steps, passed through ``round()``.
    """
    if acc_doubling_rate == 0:
        return base_acc_steps

    doublings = sp_step / acc_doubling_rate
    return round(base_acc_steps * math.pow(2, doublings))

# Main training loop. Each iteration is one optimizer step: `acc_steps`
# forward/backward passes over training blocks, one optimizer/scheduler
# update, then (on the configured intervals) evaluation, plateau detection
# and checkpointing.
for session_step in range(num_training_steps):
      model.train()

      # Number of forward/backward passes accumulated into this step.
      acc_steps = get_acc_steps(sp_step)

      for i in range(acc_steps):
          # Position within the current pass over the blocks; wraps at num_blocks.
          idx = (sp_step*acc_steps + i) % num_blocks

          # Shuffle blocks every epoch, and write the new order to disk so the
          # dataset cell can load it again.
          if idx == 0:
              random.shuffle(block_order)
              with open(block_order_path, 'w', encoding='utf-8') as file:
                  json.dump(block_order, file)

          block = blocks[block_order[idx]]

          # One block becomes a batch of size 1 on the training device.
          input_ids = torch.LongTensor(block).unsqueeze(0).to(device).detach()

          # Forward pass with the block as both input and labels; backward()
          # adds this block's gradients to those already accumulated.
          outputs = model(input_ids=input_ids, labels=input_ids)
          loss = outputs.loss
          loss.backward()

          # Abort training on a NaN loss by raising KeyboardInterrupt.
          instant_loss = loss.item()
          if math.isnan(instant_loss):
              torch.cuda.empty_cache()
              raise KeyboardInterrupt

          # Discard tensor that was moved to GPU
          del input_ids
          torch.cuda.empty_cache()

      # Apply the accumulated gradients, advance the LR schedule, then clear
      # the gradients for the next step. `lr` is read before scheduler.step(),
      # so the progress bar shows the rate used for this update.
      optimizer.step()
      lr = optimizer.param_groups[0]["lr"]
      scheduler.step()
      optimizer.zero_grad()

      # instant_loss is the loss of the last accumulation pass; a NaN there
      # already raised inside the loop above, so this is a second guard.
      if math.isnan(instant_loss):
          torch.cuda.empty_cache()
          raise KeyboardInterrupt

      # Evaluate model and plot loss
      if sp_step%eval_interval == 0:
          model.eval()
          torch.cuda.empty_cache()
          eval_loss = 0

          # Average the loss over the first `eval_blocks` entries of the fixed
          # eval_order. These are drawn from the same `blocks` list used for
          # training, and eval_order has num_blocks entries, so eval_blocks
          # must not exceed num_blocks.
          with torch.no_grad():
              for eval_step in range(eval_blocks):
                  block = blocks[eval_order[eval_step]]
                  input_ids = torch.LongTensor(block).unsqueeze(0).to(device).detach()
                  eval_loss += model(input_ids=input_ids, labels=input_ids).loss.item()

                  # Discard tensor that was moved to GPU
                  del input_ids
                  torch.cuda.empty_cache()

          eval_loss /= eval_blocks

          # Append one "step,eval_loss" row to the loss log.
          with open(loss_log_path, 'a', encoding='utf-8') as file:
              file.write(f"{sp_step},{eval_loss}\n")

          # Stop if loss has plateaued. The counter advances once per
          # evaluation (not per step); leaving the loop here still reaches
          # the final checkpoint save below it.
          if plateau_steps != 0:
              if eval_loss < best_eval:
                  best_eval = eval_loss
                  evals_since_last_improvement = 0
              else:
                  evals_since_last_improvement += 1
              if evals_since_last_improvement > plateau_steps:
                  print(f"No improvement for {plateau_steps} evals")
                  break

      # Save checkpoint every so often; the metadata stores the step and the
      # most recent evaluation loss.
      if sp_step%checkpoint_interval == 0:
          sp = SoftPrompt.from_tuning_model(model,
              {"name" : sp_name + f"-step-{sp_step}",
               "step"  : sp_step,
               "loss"  : eval_loss})
          sp.to_file( os.path.join( project_dir,filename_for_checkpoint(sp_step) ) )

      # Progress display, then advance the step counter.
      bar.set_postfix({
          "Model Step" : sp_step,
          "Eval Loss"  : "{el:.5f}".format(el=eval_loss),
          "Acc Steps"  : acc_steps,
          "LR"         : lr
      })
      bar.update(1)
      sp_step += 1

# Save a checkpoint once done
sp = SoftPrompt.from_tuning_model(model,
    {"name"  : sp_name + f"-step-{sp_step}",
     "step"  : sp_step,
     "loss"  : eval_loss})
sp.to_file( os.path.join( project_dir,filename_for_checkpoint(sp_step) ) )

In [None]:
#@title Flush memory after interrupting training
# An interrupted training cell can leave its last batch tensor bound to
# `input_ids`. Drop that reference if it exists, then release cached CUDA
# memory.
try:
    del input_ids
except NameError:
    # Nothing was left behind; there is nothing to delete.
    pass
torch.cuda.empty_cache()

In [None]:
#@title Plot evaluation loss
import matplotlib.pyplot as plt
import matplotlib.cbook as cbook
import numpy as np
import os
loss_log_path = os.path.join(project_dir,"loss_log.csv")

# Read the "step,eval_loss" rows appended by the training cell straight from
# the log path. ndmin=2 keeps the result two-dimensional even when the log
# holds a single row, so the column indexing below always works.
array = np.loadtxt(loss_log_path, delimiter=",", ndmin=2)

# Column 0 is the training step, column 1 the evaluation loss.
fig = plt.figure()
plt.plot(array[:, 0], array[:, 1])

In [None]:
#@title Generate text w/ a soft prompt
model.eval()

# Restore soft prompt from desired checkpoint, insert its path here.
SOFT_PROMPT_PATH = ""

sp = SoftPrompt.from_file(SOFT_PROMPT_PATH)
model.set_soft_prompt(sp)
# Do not call model.initialize_soft_prompt(n_tokens=1) here: that call is what
# the "w/out a soft prompt" cell uses to purge the soft prompt, so running it
# at this point would discard the checkpoint that was just loaded.

# Set topic prompts
prompts = ["I feel", "This technology", "The President", "Legally"]

# For each prompt, generate a number of entries.
for prompt in prompts:
    call = tokenizer(prompt, return_tensors="pt").input_ids.cuda()
    prompt_len = call.shape[-1]
    for _ in range(10):
        # Sample a continuation of 15 to 45 tokens beyond the prompt.
        # NOTE(review): `tfs` does not appear among the sampling arguments
        # documented for the stock Hugging Face `generate`; confirm it has
        # an effect with the installed transformers version.
        basic_output = model.generate(
            input_ids=call,
            do_sample=True,
            min_length=prompt_len + 15,
            max_length=prompt_len + 45,
            temperature=1.0,
            tfs=0.9,
            repetition_penalty=3.0,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

        output = tokenizer.decode(basic_output[0])
        print(output)

In [None]:
#@title Generate text w/out a soft prompt

model.eval()

# Purge soft prompt, to generate without it.
model.initialize_soft_prompt(n_tokens=1)

test = "I"

call = tokenizer(test, return_tensors="pt").input_ids.cuda()
prompt_len = call.shape[-1]

# Sampling settings: a continuation of 15 to 45 tokens beyond the prompt.
generation_kwargs = dict(
    input_ids=call,
    do_sample=True,
    min_length=prompt_len + 15,
    max_length=prompt_len + 45,
    temperature=1.0,
    tfs=0.9,
    repetition_penalty=3.0,
    pad_token_id=tokenizer.eos_token_id,
    eos_token_id=tokenizer.eos_token_id,
)
basic_output = model.generate(**generation_kwargs)
print(tokenizer.decode(basic_output[0]))