# 07: Compare RoPE and KV Cache in Decoder-Only Models
This notebook demonstrates the performance and output differences between:
- ✅ No RoPE, No KV Cache
- ✅ RoPE Only
- ✅ KV Cache Only
- ✅ Both RoPE + KV Cache

We will use a fixed prompt and measure:
- Generation time
- Token quality (manually)
- Optionally perplexity (future)

In [None]:
!pip install torch transformers

In [1]:
import sys
import os
sys.path.append(os.path.abspath(".."))

In [2]:
import torch
import time
from transformers import AutoTokenizer
from models.decoder_only import GPTStyleDecoder
from models.rotary_embeddings import build_rope_cache, RotaryEmbedding
from models.kv_cache import KVCache

## Load tokenizer and define prompt

In [3]:
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.add_special_tokens({'pad_token': '[PAD]'})
prompt = "The quick brown fox"
input_ids = tokenizer(prompt, return_tensors="pt")["input_ids"]
device = "cuda" if torch.cuda.is_available() else "cpu"

## Define generation function (with options for RoPE and KV)

In [4]:
def generate_with_options(model, input_ids, max_new_tokens=32, use_rope=False, use_kv=False):
    model.eval()
    input_ids = input_ids.to(device)
    generated = input_ids.clone()

    sin, cos = None, None
    if use_rope:
        sin, cos = build_rope_cache(max_new_tokens + input_ids.shape[1], model.rope_dims(), device)

    cache = None
    if use_kv:
        cache = KVCache(max_batch_size=1, max_seq_len=max_new_tokens + input_ids.shape[1], num_heads=model.num_heads, head_dim=model.embed_dim // model.num_heads, device=device)

    with torch.no_grad():
        for _ in range(max_new_tokens):
            logits = model(generated, sin=sin, cos=cos, kv_cache=cache) if (use_rope or use_kv) else model(generated)
            next_token = torch.argmax(logits[:, -1, :], dim=-1, keepdim=True)
            generated = torch.cat((generated, next_token), dim=1)
    return tokenizer.decode(generated[0], skip_special_tokens=True)

## Run comparisons

In [9]:
results = {}
configs = [
    (False, False),
    (True, False),
    (False, True),
    (True, True)
]

for use_rope, use_kv in configs:
    name = f"ROPE={'✅' if use_rope else '❌'}, KV={'✅' if use_kv else '❌'}"
    model = GPTStyleDecoder(
                            vocab_size=tokenizer.vocab_size,
                            embed_dim=768,
                            depth=6,
                            heads=12,
                            ff_dim=2048,
                            max_len=128
                            ).to(device)
    if use_rope:
        model.load_state_dict(torch.load("gpt_decoder_rope_trained.pt", map_location=device))
    else:
        model.load_state_dict(torch.load("gpt_decoder_trained.pt", map_location=device))
    model.eval()
    start = time.time()
    output = generate_with_options(model, input_ids, use_rope=use_rope, use_kv=use_kv)
    elapsed = time.time() - start
    results[name] = (output, elapsed)
    #print(f"{name} → time: {elapsed:.3f}s\n\n{output}\n{'-'*80}")
    print(f"{name} → time: {elapsed:.3f}s\n{'-'*80}")

ROPE=❌, KV=❌ → time: 1.019s
--------------------------------------------------------------------------------
ROPE=✅, KV=❌ → time: 0.986s
--------------------------------------------------------------------------------
ROPE=❌, KV=✅ → time: 0.986s
--------------------------------------------------------------------------------
ROPE=✅, KV=✅ → time: 0.962s
--------------------------------------------------------------------------------
