In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer
import time

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Pick the compute device once: the GPU when PyTorch reports CUDA as usable,
# otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

Using device: cuda


In [8]:
# --- Benchmark settings -----------------------------------------------------
model_name = "gpt2-medium"
prompt = "Hello, I am testing"
# Number of tokens generated after the prompt on each run.
# NOTE(review): with so few new tokens the cached and uncached runs may be hard
# to tell apart; consider a larger value when comparing timings — TODO confirm.
max_new_tokens = 10

# --- Tokenizer and model ----------------------------------------------------
# Both are loaded from the same checkpoint name.
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)
model = model.to(device)
model.eval()  # evaluation mode: this notebook only generates, it never trains

# --- Prompt tensors ---------------------------------------------------------
# Tokenize once as PyTorch tensors and move them to the same device as the model.
inputs = tokenizer(prompt, return_tensors="pt")
inputs = inputs.to(device)

def generate_and_time(inputs, use_cache):
    """Generate a continuation of the prompt and measure how long it takes.

    Reads the module-level ``model``, ``tokenizer`` and ``max_new_tokens``.

    Args:
        inputs: Mapping of keyword arguments unpacked into ``model.generate``
            (here, the tokenizer output for the prompt).
        use_cache: Forwarded to ``model.generate`` to turn the key/value cache
            on or off for this run.

    Returns:
        tuple[str, float]: the decoded text of the first returned sequence
        (special tokens stripped) and the elapsed wall-clock time in seconds.
    """
    # The notebook falls back to CPU when CUDA is missing; CUDA-only calls such
    # as torch.cuda.synchronize() must be skipped in that case.
    cuda_available = torch.cuda.is_available()

    if cuda_available:
        torch.cuda.empty_cache()
        # Drain any queued GPU work so it is not charged to this measurement.
        torch.cuda.synchronize()

    # perf_counter is monotonic and high-resolution, unlike time.time().
    start = time.perf_counter()
    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            use_cache=use_cache,
            # Same value generate() falls back to on its own; passing it
            # explicitly avoids the "Setting `pad_token_id`" notice on each call.
            pad_token_id=tokenizer.eos_token_id,
        )
    if cuda_available:
        # CUDA kernels run asynchronously: wait for them before stopping the timer.
        torch.cuda.synchronize()
    elapsed = time.perf_counter() - start

    return tokenizer.decode(outputs[0], skip_special_tokens=True), elapsed

# Warmup: run generation once and discard the result so the timed runs below
# are not the first call into the model.
_, _ = generate_and_time(inputs=inputs, use_cache=True)

# Timed runs: same prompt and settings, only `use_cache` differs.
gen_text_cache, time_cache = generate_and_time(inputs=inputs, use_cache=True)
gen_text_nocache, time_nocache = generate_and_time(inputs=inputs, use_cache=False)

# Report both runs; each item is printed on its own line.
print(
    "\n===== With KV Cache =====",
    gen_text_cache,
    f"Time taken (with cache): {time_cache:.4f} sec",
    "\n===== Without KV Cache =====",
    gen_text_nocache,
    f"Time taken (without cache): {time_nocache:.4f} sec",
    sep="\n",
)

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.



===== With KV Cache =====
Hello, I am testing the new version of the game. I have a
Time taken (with cache): 0.2640 sec

===== Without KV Cache =====
Hello, I am testing the new version of the game. I have a
Time taken (without cache): 0.2682 sec
