<a href="https://colab.research.google.com/github/mgp87/Jupyter_Notebooks_Collection/blob/main/GPTFast.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Install the pinned GPTFast release; -qqq and --progress-bar off keep pip quiet.
!pip install -qqq gptfast==0.2.1 --progress-bar off
# Pin NumPy to 1.26.3 (-U lets pip replace whatever version is already present).
!pip install -U -qqq numpy==1.26.3 --progress-bar off

In [None]:
import os

import torch
from GPTFast.Core import gpt_fast
from GPTFast.Helpers import timed
from transformers import AutoTokenizer

torch._dynamo.reset()
os.environ["TOKENIZERS_PARALLELISM"] = "false"

device = "cuda" if torch.cuda.is_available() else "cpu"


def argmax_variation(
    self, probabilities: torch.Tensor, temperature: float = 1, k: int = 5
):
    """Pick one index uniformly at random from the ``k`` highest-scoring entries.

    Candidates are ranked along the last dimension of ``probabilities``. A
    single random position in ``[0, k)`` is drawn and used to look up one of
    the top-k indices.

    Args:
        self: Not used by this function. Presumably present so the function
            can be bound as a method in the same way as ``argmax`` -- confirm
            against the GPTFast sampling-function contract.
        probabilities: Tensor of scores with at least one dimension.
        temperature: Divisor applied to ``probabilities`` before ranking.
            NOTE(review): dividing by a positive constant does not change
            which entries rank in the top-k, so for ordinary positive values
            this parameter has no effect on the result.
        k: Number of top candidates to choose among. Clamped to the size of
            the last dimension.

    Returns:
        An integer tensor with ``probabilities.dim() + 1`` dimensions, each of
        size 1, holding the chosen index into the last dimension. When the
        input has leading dimensions larger than 1, only the first position
        along each of them is sampled.
    """
    scaled_probabilities = probabilities / temperature

    # Never ask topk for more candidates than the last dimension holds.
    k = min(k, probabilities.size(-1))

    top_k_indices = torch.topk(scaled_probabilities, k, dim=-1).indices

    # Draw the random position directly on the input's device instead of
    # creating it on the CPU and copying it over on every call.
    random_indices = torch.randint(
        0, k, (1,) * probabilities.dim(), device=probabilities.device
    )

    # gather() already returns a tensor on the same device as its input, so
    # no further device transfer is needed.
    sampled_token = top_k_indices.gather(-1, random_indices)

    return sampled_token.unsqueeze(0)


def argmax(self, probabilities):
    """Greedy selection: return the index of the largest entry.

    Args:
        self: Not used by this function.
        probabilities: Tensor of scores; the maximum is taken along the last
            dimension.

    Returns:
        A tensor of shape ``(1, 1)`` holding the winning index. The reshape
        only succeeds when the reduction leaves exactly one element, i.e.
        every leading dimension of ``probabilities`` has size 1.
    """
    return probabilities.argmax(dim=-1).view(1, 1)


# Benchmark knobs: number of timed generation runs, and the max_tokens value
# handed to generate().
N_ITERS = 10
MAX_TOKENS = 50

# Hugging Face model identifiers: the main model and the draft model that
# are both passed to gpt_fast().
model_name, draft_model_name = "gpt2-xl", "gpt2"

# Prompt that every run starts from.
initial_string = "Write me a short story."

# The tokenizer is loaded for the main model; the encoded prompt is moved to
# the selected device once, up front.
tokenizer = AutoTokenizer.from_pretrained(model_name)
input_tokens = tokenizer.encode(
    initial_string,
    return_tensors="pt",
).to(device)

# Configuration handed to gpt_fast() as ``cache_config``.
# NOTE(review): the strings look like references into the Hugging Face GPT-2
# implementation (the ``transformer.h`` block list, each block's ``attn``
# module, the ``layer_past`` argument, the ``_attn`` method) -- confirm the
# exact meaning of each key against the GPTFast documentation.
cache_config = dict(
    model_config=dict(
        path_to_blocks=["transformer", "h"],
        child_ref_in_parent_forward=["transformer", "block"],
    ),
    block_config=dict(
        path_to_attn=["attn"],
        child_ref_in_parent_forward=["attn"],
    ),
    attn_config=dict(
        cache_update_config=dict(
            kv_cache_condition="if layer_past is not None",
            key_name="key",
            value_name="value",
        ),
        causal_mask_config=dict(
            causal_mask_application="conditional",
            causal_mask_method="_attn",
            causal_mask_condition="not self.is_cross_attention",
        ),
    ),
    # Import statements supplied as source text, in this order.
    imports=[
        "import torch",
        "import transformers",
        "from transformers import *",
        "from torch import *",
        "from typing import *",
        "import types",
        "from transformers.modeling_outputs import *",
        "from torch import nn",
    ],
)

# Build the GPTFast model around the main checkpoint, using the greedy
# ``argmax`` function defined in this file as the sampling function.
# NOTE(review): the draft model is presumably used for speculative decoding
# (generate() is later called with ``speculate_k``) -- confirm against the
# GPTFast documentation.
gpt_fast_model = gpt_fast(
    model_name,
    draft_model_name=draft_model_name,
    cache_config=cache_config,
    sample_function=argmax,
    max_length=60,
)
# Move the model to the same device as the encoded prompt.
gpt_fast_model.to(device)

# Run generation N_ITERS times, recording and printing the duration that
# timed() reports for each run.
fast_compile_times = []
with torch.no_grad():  # gradients are not needed for generation
    for i in range(N_ITERS):
        res, compile_time = timed(
            lambda: gpt_fast_model.generate(
                cur_tokens=input_tokens,
                max_tokens=MAX_TOKENS,
                speculate_k=6,
            )
        )
        fast_compile_times.append(compile_time)
        print(f"gpt fast eval time {i}: {compile_time}")
print("~" * 10)

# Decode and show the first sequence from the last run's output.
print(tokenizer.decode(res[0]))

loading configuration file config.json from cache at /root/.cache/huggingface/hub/models--gpt2-xl/snapshots/15ea56dee5df4983c59b2538573817e1667135e2/config.json
Model config GPT2Config {
  "_name_or_path": "gpt2-xl",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1600,
  "n_head": 25,
  "n_inner": null,
  "n_layer": 48,
  "n_positions": 1024,
  "output_past": true,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
 

gpt fast eval time 0: 2.40134716796875
gpt fast eval time 1: 2.846116943359375
gpt fast eval time 2: 2.33012841796875
gpt fast eval time 3: 2.32789990234375
gpt fast eval time 4: 2.350912841796875
gpt fast eval time 5: 2.40123291015625
gpt fast eval time 6: 2.797267333984375
gpt fast eval time 7: 2.3549501953125
gpt fast eval time 8: 2.338822265625
gpt fast eval time 9: 2.33497216796875
~~~~~~~~~~
Write me a short story.

I'm not sure if you've heard of it, but there's a story about a guy who's been living in a house for a while and he's been having a hard time finding a job. He's been living in a house
