# 02: Text Generation with Decoder-Only LLM
This notebook demonstrates how to load a trained GPT-style model and generate text using temperature, top-k, and top-p sampling.

In [None]:
# Install dependencies
!pip install torch transformers

In [1]:
import sys
import os

# Make the parent directory importable so the `models` package resolves.
# The membership check keeps this cell idempotent: re-running it does not
# pile up duplicate entries in sys.path.
REPO_ROOT = os.path.abspath("..")
if REPO_ROOT not in sys.path:
    sys.path.append(REPO_ROOT)

In [3]:
import torch
from transformers import AutoTokenizer
from models.decoder_only import GPTStyleDecoder
from models.utils import sample_logits
from models.rotary_embeddings import build_rope_cache

## Load tokenizer and model

In [4]:
# Tokenizer: the pretrained "gpt2" tokenizer; its vocab size sizes the model below.
tokenizer = AutoTokenizer.from_pretrained("gpt2")
device = "cuda" if torch.cuda.is_available() else "cpu"

# Architecture hyperparameters. These must describe the same architecture the
# checkpoint was trained with, otherwise load_state_dict will reject the weights.
model = GPTStyleDecoder(
    vocab_size=tokenizer.vocab_size,
    embed_dim=768,
    depth=6,
    heads=12,
    ff_dim=2048,
    max_len=128,
).to(device)

# weights_only=True restricts unpickling to tensors and plain containers, so a
# tampered checkpoint file cannot execute arbitrary code while being loaded.
state_dict = torch.load("gpt_decoder_trained.pt", map_location=device, weights_only=True)
model.load_state_dict(state_dict)

# Switch to inference mode (disables dropout); as the last expression this also
# displays the module structure.
model.eval()

GPTStyleDecoder(
  (token_embed): Embedding(50257, 768)
  (pos_embed): GPTPositionalEncoding()
  (blocks): ModuleList(
    (0-5): 6 x DecoderBlock(
      (self_attn): CausalSelfAttention(
        (attn): MultiheadAttention(
          (out_proj): NonDynamicallyQuantizableLinear(in_features=768, out_features=768, bias=True)
        )
        (dropout): Dropout(p=0.1, inplace=False)
        (ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      )
      (ff): Sequential(
        (0): Linear(in_features=768, out_features=2048, bias=True)
        (1): GELU(approximate='none')
        (2): Linear(in_features=2048, out_features=768, bias=True)
      )
      (ln): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
  )
  (norm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

## Define generation function

In [5]:
def generate(model, tokenizer, prompt, max_new_tokens=50, temperature=1.0, top_k=50, top_p=0.95, max_context=None):
    """Autoregressively sample a continuation of ``prompt`` and return the decoded text.

    Args:
        model: Callable that maps a token-id tensor to logits; the result is
            indexed as ``logits[:, -1, :]`` to obtain the next-token scores.
            If it exposes ``parameters()``-style weights, the prompt is moved
            to the device of its first parameter.
        tokenizer: Called as ``tokenizer(prompt, return_tensors="pt")`` and
            used for ``decode(..., skip_special_tokens=True)``.
        prompt: Text to continue.
        max_new_tokens: Number of tokens to append. Values <= 0 append nothing.
        temperature: Forwarded unchanged to ``sample_logits``.
        top_k: Forwarded unchanged to ``sample_logits``.
        top_p: Forwarded unchanged to ``sample_logits``.
        max_context: Optional cap on how many trailing tokens are fed to the
            model at each step. ``None`` (default) feeds the whole sequence,
            which matches the previous behaviour. Set this to the model's
            maximum sequence length when prompt + new tokens may exceed it
            (assumes the model cannot handle longer inputs — confirm against
            the model's positional encoding).

    Returns:
        The decoded text of the first sequence in the batch (prompt plus the
        sampled tokens).

    Raises:
        ValueError: If ``max_context`` is given and is smaller than 1.
    """
    if max_context is not None and max_context < 1:
        raise ValueError("max_context must be a positive integer or None")

    generated = tokenizer(prompt, return_tensors="pt")["input_ids"]

    # Follow the model's own device instead of relying on a notebook-level
    # global, so the function works wherever the model has been placed.
    first_param = next(model.parameters(), None)
    if first_param is not None:
        generated = generated.to(first_param.device)

    # One no_grad context for the whole loop; no gradients are needed here.
    with torch.no_grad():
        for _ in range(max_new_tokens):
            # Only the trailing window is fed to the model when a cap is set;
            # the full sequence is still kept for decoding.
            context = generated if max_context is None else generated[:, -max_context:]
            logits = model(context)
            next_token = sample_logits(logits[:, -1, :], temperature, top_k, top_p)
            # sample_logits yields one id per batch row; add a sequence axis
            # so it can be appended along dim=1.
            generated = torch.cat((generated, next_token.unsqueeze(1)), dim=1)

    return tokenizer.decode(generated[0], skip_special_tokens=True)

## Try different prompts

In [6]:
# Seed the RNG so that re-running this cell reproduces the same sample
# (assumes sample_logits draws from torch's global generator — confirm in models/utils).
SEED = 42
torch.manual_seed(SEED)

prompt = "Once upon a time"
output = generate(model, tokenizer, prompt, max_new_tokens=60, temperature=0.9, top_k=50, top_p=0.95)
print(f"Prompt: {prompt}\n\nGenerated:\n{output}")

Prompt: Once upon a time

Generated:
Once upon a time of a word.
O bloody-dises, a bawd!
O God! O death! a son, father!

FRIAR LAURENCE:
I know not, when it is;
Hence is my child and so long'd
Shall,
