<a href="https://colab.research.google.com/github/nrghosh/ML-Projects/blob/master/specdec.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install torch transformers


Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (23.7 MB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (823 kB)
Collecting nvidia-cuda-cupti-cu12==12.1.105 (from torch)
  Using cached nvidia_cuda_cupti_cu12-12.1.105-py3-none-manylinux1_x86_64.whl (14.1 MB)
Collecting nvidia-cudnn-cu12==8.9.2.26 (from torch)
  Using cached nvidia_cudnn_cu12-8.9.2.26-py3-none-manylinux1_x86_64.whl (731.7 MB)
Collecting nvidia-cublas-cu12==12.1.3.1 (from torch)
  Using cached nvidia_cublas_cu12-12.1.3.1-py3-none-manylinux1_x86_64.whl (410.6 MB)
Collecting nvidia-cufft-cu12==11.0.2.54 (from torch)
  Using cached nvidia_cufft_cu12-11.0.2.54-py3-none-manylinux1_x86_64.whl (121.6 MB)
Collecting nvidia-curand-cu12==10.3.2.106 (from torch)
  Using cached nvidia_curand_cu12-10.3.2.106-py3-none-manylinux1_x86_64.whl (56.5 MB)
Collectin

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load model and tokenizer.
# "gpt2" is a small stand-in; swap in the Hub id of a larger causal LM
# (e.g. a Llama 3 checkpoint) when one is available.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# All generation in this notebook samples (do_sample=True). Seed torch's global RNG
# so that a fresh "Restart & Run All" produces the same generations.
torch.manual_seed(0)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [13]:
def draft_model_decode(model, tokenizer, input_text, num_tokens=5, num_drafts=3):
    """Sample several short continuations and merge them with a per-position vote.

    The prompt is tokenized once, then ``model.generate`` is called ``num_drafts``
    times with sampling enabled. The newly generated token ids of each draft are
    lined up position by position and the most frequent id at each position is kept.

    Args:
        model: Object exposing a Hugging Face style ``generate`` method.
        tokenizer: Object exposing ``encode``, ``decode`` and ``eos_token_id``.
        input_text: Prompt to continue.
        num_tokens: Maximum number of new tokens per draft.
        num_drafts: Number of independent samples to draw (must be >= 1).

    Returns:
        The decoded text of the voted token sequence (special tokens skipped).

    Raises:
        ValueError: If ``num_drafts`` is less than 1.
    """
    if num_drafts < 1:
        raise ValueError("num_drafts must be at least 1")

    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    # EOS doubles as the padding id, both for generate() and for the padding below.
    pad_token_id = tokenizer.eos_token_id
    attention_mask = torch.ones_like(input_ids)
    prompt_len = input_ids.shape[1]

    draft_outputs = []
    with torch.no_grad():
        for _ in range(num_drafts):
            outputs = model.generate(
                input_ids,
                max_new_tokens=num_tokens,
                do_sample=True,
                pad_token_id=pad_token_id,
                attention_mask=attention_mask,
            )
            # Keep only the newly generated ids (drop the echoed prompt).
            draft_outputs.append(outputs[0, prompt_len:])

    # generate() may return fewer than num_tokens new ids (e.g. when EOS is sampled),
    # so drafts can differ in length. Pad them to a common length before stacking;
    # a plain torch.stack would fail on ragged drafts.
    stacked = torch.nn.utils.rnn.pad_sequence(
        draft_outputs, batch_first=True, padding_value=pad_token_id
    )

    # Per-position majority vote across drafts. Because the vote is taken column by
    # column, the result can mix tokens from different drafts.
    final_tokens = torch.mode(stacked, dim=0)[0]
    return tokenizer.decode(final_tokens, skip_special_tokens=True)


In [14]:
def medusa_head_decode(model, tokenizer, input_text, num_tokens=5, num_heads=3):
    """Approximate multi-head decoding by sampling ``num_heads`` continuations.

    NOTE(review): no separate Medusa decoding heads are used here. Each "head" is
    an independent sampled ``model.generate`` call on the same model, i.e. the same
    repeated-sampling-and-vote scheme as ``draft_model_decode``.

    Args:
        model: Object exposing a Hugging Face style ``generate`` method.
        tokenizer: Object exposing ``encode``, ``decode`` and ``eos_token_id``.
        input_text: Prompt to continue.
        num_tokens: Maximum number of new tokens per head.
        num_heads: Number of independent samples to draw (must be >= 1).

    Returns:
        The decoded text of the per-position voted token sequence (special tokens
        skipped).

    Raises:
        ValueError: If ``num_heads`` is less than 1.
    """
    if num_heads < 1:
        raise ValueError("num_heads must be at least 1")

    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    # EOS doubles as the padding id, both for generate() and for the padding below.
    pad_token_id = tokenizer.eos_token_id
    attention_mask = torch.ones_like(input_ids)
    prompt_len = input_ids.shape[1]

    head_outputs = []
    with torch.no_grad():
        for _ in range(num_heads):
            outputs = model.generate(
                input_ids,
                max_new_tokens=num_tokens,
                do_sample=True,
                pad_token_id=pad_token_id,
                attention_mask=attention_mask,
            )
            # Keep only the newly generated ids (drop the echoed prompt).
            head_outputs.append(outputs[0, prompt_len:])

    # generate() may return fewer than num_tokens new ids (e.g. when EOS is sampled),
    # so the per-head outputs can differ in length. Pad to a common length before
    # stacking; a plain torch.stack would fail on ragged inputs.
    stacked = torch.nn.utils.rnn.pad_sequence(
        head_outputs, batch_first=True, padding_value=pad_token_id
    )

    # Per-position majority vote across heads; the result can mix tokens that came
    # from different samples.
    final_tokens = torch.mode(stacked, dim=0)[0]
    return tokenizer.decode(final_tokens, skip_special_tokens=True)


In [16]:
import time

def benchmark(model, tokenizer, input_text, decoding_fn, num_tokens=5, num_runs=10):
    """Time a decoding function and report average latency and throughput.

    Each run performs two independent measurements:

    * **TTFT**: wall-clock seconds for a direct ``model.generate`` call limited to
      one new token. This call does not go through ``decoding_fn``, so the figure
      reflects the model itself rather than the decoding strategy.
    * **Throughput**: ``num_tokens`` divided by the wall-clock seconds of one
      ``decoding_fn(model, tokenizer, input_text, num_tokens)`` call. The requested
      token count is used, not the number of tokens actually produced.

    Args:
        model: Object exposing a Hugging Face style ``generate`` method.
        tokenizer: Object exposing ``encode`` and ``eos_token_id``.
        input_text: Prompt used for every run.
        decoding_fn: Callable invoked as ``decoding_fn(model, tokenizer, input_text,
            num_tokens)``; its return value is ignored.
        num_tokens: Token budget passed to ``decoding_fn``.
        num_runs: Number of repetitions to average over (must be >= 1).

    Returns:
        dict with keys ``"avg_ttft"`` (seconds) and ``"avg_throughput"``
        (tokens per second).

    Raises:
        ValueError: If ``num_runs`` is less than 1.
    """
    if num_runs < 1:
        raise ValueError("num_runs must be at least 1")

    # The prompt encoding is the same for every run and sits outside both timed
    # regions, so compute it once.
    input_ids = tokenizer.encode(input_text, return_tensors='pt')
    pad_token_id = tokenizer.eos_token_id
    attention_mask = torch.ones_like(input_ids)

    ttft_list = []
    throughput_list = []

    for _ in range(num_runs):
        # Measure time-to-first-token (TTFT). perf_counter is the monotonic,
        # high-resolution clock intended for measuring short intervals.
        start_time = time.perf_counter()
        with torch.no_grad():
            model.generate(
                input_ids,
                max_new_tokens=1,  # Generate the first token only
                do_sample=True,
                pad_token_id=pad_token_id,
                attention_mask=attention_mask,
            )
        ttft_list.append(time.perf_counter() - start_time)

        # Measure throughput of the decoding strategy under test.
        start_time = time.perf_counter()
        decoding_fn(model, tokenizer, input_text, num_tokens)
        total_time = time.perf_counter() - start_time
        # Guard against a zero reading on coarse clocks instead of dividing by zero.
        throughput_list.append(
            num_tokens / total_time if total_time > 0 else float("inf")
        )

    return {
        "avg_ttft": sum(ttft_list) / num_runs,
        "avg_throughput": sum(throughput_list) / num_runs,
    }

# Example usage: time both decoding functions on the same prompt and token budget.
input_text, num_tokens = "Once upon a time", 10

print("Benchmarking Draft Model Approach")
draft_benchmark = benchmark(
    model,
    tokenizer,
    input_text,
    decoding_fn=draft_model_decode,
    num_tokens=num_tokens,
)
print(draft_benchmark)

print("Benchmarking Medusa Head Approach")
medusa_benchmark = benchmark(
    model,
    tokenizer,
    input_text,
    decoding_fn=medusa_head_decode,
    num_tokens=num_tokens,
)
print(medusa_benchmark)


Benchmarking Draft Model Approach
{'avg_ttft': 0.14508390426635742, 'avg_throughput': 4.9944237963127405}
Benchmarking Medusa Head Approach
{'avg_ttft': 0.10131804943084717, 'avg_throughput': 5.330000156787532}
