In [1]:
from transformers import (
    GPT2TokenizerFast,
    AdamW,
    get_scheduler
)
import torch

from model import GPT2PromptTuningLM


# Training

In [2]:
class Config:
    """Hyperparameters for the prompt-tuning demo in this notebook.

    All values are plain class attributes; the notebook reads them through
    the module-level `args` instance created right after the class.
    """

    # ---- Prompt-tuning -------------------------------------------------
    # Number of prompt tokens.
    n_prompt_tokens = 20
    # True  -> the soft prompt is initialized from vocab.
    # False -> set `random_range` instead to initialize by randomization.
    init_from_vocab = True
    # random_range = 0.5

    # ---- Optimization --------------------------------------------------
    # Option names follow run_clm_no_trainer.py in transformers:
    # https://github.com/huggingface/transformers/blob/master/examples/pytorch/language-modeling/run_clm_no_trainer.py
    # NOTE(review): the previous comment described these as that script's
    # default values; confirm `learning_rate` / `weight_decay` against it.
    lr_scheduler_type = "linear"
    num_warmup_steps = 0
    learning_rate = 0.01
    weight_decay = 0.01
    num_train_epochs = 3
    # Must stay below `num_train_epochs`: a class body is evaluated top to
    # bottom, and this line reads the name defined just above.
    max_train_steps = num_train_epochs


args = Config()

In [3]:
# Load the pretrained "gpt2" tokenizer.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Initialize GPT2LM with soft prompt
# `n_tokens` and `initialize_from_vocab` come from the Config cell; how they are
# consumed is defined by GPT2PromptTuningLM in the local `model` module.
model = GPT2PromptTuningLM.from_pretrained(
    "gpt2",
    n_tokens=args.n_prompt_tokens,
    initialize_from_vocab=args.init_from_vocab
)

Initializing soft prompt...


In [4]:
# Display the soft prompt weights before training, as a baseline to compare
# against the values shown after the optimizer step.
model.soft_prompt.weight

Parameter containing:
tensor([[-0.1101, -0.0393,  0.0331,  ..., -0.1364,  0.0151,  0.0453],
        [ 0.0403, -0.0486,  0.0462,  ...,  0.0861,  0.0025,  0.0432],
        [-0.1275,  0.0479,  0.1841,  ...,  0.0899, -0.1297, -0.0879],
        ...,
        [-0.0762, -0.0821,  0.1313,  ..., -0.0374,  0.2339, -0.0797],
        [-0.0913, -0.0297,  0.1686,  ..., -0.1519,  0.1601,  0.0403],
        [-0.1132, -0.0435,  0.0922,  ..., -0.0442,  0.2205, -0.0069]],
       requires_grad=True)

In [5]:
# Prepare dataset
# A single sentence is tokenized and used as the only training example;
# return_tensors="pt" requests PyTorch tensors from the tokenizer.
inputs = tokenizer("Hello, my dog is cute", return_tensors="pt")
# Show the encoded batch (`input_ids` and `attention_mask`).
print(inputs)

{'input_ids': tensor([[15496,    11,   616,  3290,   318, 13779]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1]])}


In [6]:
# Only the soft prompt's weights are updated for prompt-tuning: the optimizer
# receives nothing but `soft_prompt.weight`, so no other parameter is stepped.
# NOTE(review): this cell does not itself set `requires_grad=False` on the LM
# weights; whether GPT2PromptTuningLM freezes them is not visible here — confirm.
optimizer_grouped_parameters = [
    {
        "params": [p for n, p in model.named_parameters() if n == "soft_prompt.weight"],
        "weight_decay": args.weight_decay,
    }
]
# Use PyTorch's own AdamW. The `AdamW` re-exported by transformers is deprecated
# in favour of `torch.optim.AdamW`; weight decay is still set explicitly through
# the parameter group above, so the library default is not relied on.
optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=args.learning_rate)
# Learning-rate schedule driven by the Config values; it only takes effect when
# `lr_scheduler.step()` is called after each optimizer step.
lr_scheduler = get_scheduler(
    name=args.lr_scheduler_type,
    optimizer=optimizer,
    num_warmup_steps=args.num_warmup_steps,
    num_training_steps=args.max_train_steps,
)

In [7]:
# Switch the model to training mode before the forward pass.
model.train()
# The input ids are passed again as `labels`, so the returned output carries a loss.
outputs = model(**inputs, labels=inputs["input_ids"])
loss = outputs.loss
print(f"loss: {loss}")

loss: 5.7707366943359375


In [8]:
# Back-propagate the loss and apply one optimizer update to the soft prompt.
loss.backward()
optimizer.step()
# Advance the learning-rate schedule. It is created together with the optimizer
# but was never stepped, so the configured schedule had no effect.
lr_scheduler.step()
# Clear the gradients of the optimized parameters so that a later backward pass
# does not accumulate onto this step's gradients.
optimizer.zero_grad()

In [9]:
# Display the soft prompt weights again, after the optimizer step.
model.soft_prompt.weight
# Confirmed the weights were changed (compare with the values displayed before training).

Parameter containing:
tensor([[-0.1001, -0.0293,  0.0431,  ..., -0.1463,  0.0151,  0.0553],
        [ 0.0304, -0.0386,  0.0363,  ...,  0.0761,  0.0125,  0.0432],
        [-0.1176,  0.0579,  0.1742,  ...,  0.0800, -0.1226, -0.0779],
        ...,
        [-0.0662, -0.0914,  0.1412,  ..., -0.0275,  0.2239, -0.0797],
        [-0.0814, -0.0395,  0.1586,  ..., -0.1420,  0.1506,  0.0307],
        [-0.1032, -0.0535,  0.0922,  ..., -0.0542,  0.2115,  0.0031]],
       requires_grad=True)

In [10]:
# Save the trained soft prompt into the current working directory.
save_dir_path = "."
model.save_soft_prompt(save_dir_path)
# Once this is done, `soft_prompt.model` is in that directory
# (the inference section loads it from "./soft_prompt.model").

# Inference
In the inference phase, you must pass the input ids to the model through `model.forward()`; as a result, `model.generate()` cannot be used. Once you have `next_token_logits` as shown below, you will need to write additional code for your decoding method.

In [11]:
# Reload the "gpt2" tokenizer for the inference section.
tokenizer = GPT2TokenizerFast.from_pretrained("gpt2")
# Load the model
# The soft prompt saved during training is restored through `soft_prompt_path`.
model = GPT2PromptTuningLM.from_pretrained(
    "gpt2",
    soft_prompt_path="./soft_prompt.model"
)
# Switch to evaluation mode; as the last expression of the cell, the returned
# model is also displayed.
model.eval()

Set soft prompt! (n_tokens: 20)


GPT2PromptTuningLM(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
       

In [12]:
# Encode the prompt text into a tensor of token ids (return_tensors='pt').
input_ids = tokenizer.encode('I enjoy walking with my cute dog', return_tensors='pt')
# Display the encoded ids.
input_ids

tensor([[   40,  2883,  6155,   351,   616, 13779,  3290]])

In [14]:
# Inference only: run the forward pass without autograd so no graph is recorded.
# Calling the module itself (instead of `model.forward`) still executes `forward`
# and additionally runs any hooks registered on the module.
with torch.no_grad():
    outputs = model(input_ids=input_ids)
# Take batch item 0, last sequence position, all entries of the final axis.
# NOTE(review): assumes `outputs[0]` holds logits shaped (batch, seq, vocab) — confirm.
next_token_logits = outputs[0][0, -1, :]
...