In [11]:
import time
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import torch

def test_gpt2_inference_metrics(model_name="gpt2", prompt="Hello, how are you?", max_tokens=50):
    """
    Test inference metrics for GPT-2 model.

    Args:
        model_name (str): Name of the GPT-2 model to load.
        prompt (str): Input text to feed into the model.
        max_tokens (int): Maximum number of tokens to generate.

    Returns:
        dict: Metrics including latency, TTFT, time per token, and throughput.
    """
    # Load model and tokenizer
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.eval()

    if torch.cuda.is_available():
        model = model.to("cuda")

    # Tokenize input
    input_ids = tokenizer(prompt, return_tensors="pt").input_ids
    if torch.cuda.is_available():
        input_ids = input_ids.to("cuda")

    # Initialize variables
    start_time = time.perf_counter()
    generated_tokens = []

    # Generate tokens and record times
    latency = None
    ttft = None
    token_times = []

    for i in range(max_tokens):
        # Measure time for the first token
        if i == 0:
            token_start_time = time.perf_counter()
            output = model.generate(input_ids, max_new_tokens=1, do_sample=False)
            token_end_time = time.perf_counter()
            if latency is None:
                latency = token_end_time - start_time
                ttft = token_end_time - token_start_time
        else:
            # Generate one token at a time
            token_start_time = time.perf_counter()
            output = model.generate(input_ids, max_new_tokens=1, do_sample=False)
            token_end_time = time.perf_counter()

        # Record time per token
        token_times.append(token_end_time - token_start_time)

        # Update input_ids with newly generated token
        input_ids = torch.cat((input_ids, output[:, -1:]), dim=1)
        generated_tokens.append(output[:, -1:].item())

    # Final time
    end_time = time.perf_counter()

    # Metrics calculation
    total_time = end_time - start_time
    time_per_token = sum(token_times) / len(token_times)
    throughput = len(generated_tokens) / total_time

    return {
        "latency": latency,
        "ttft": ttft,
        "time_per_token": time_per_token,
        "throughput": throughput,
    }

# Example usage
if __name__ == "__main__":
    metrics = test_gpt2_inference_metrics(prompt="Once upon a time, in a faraway land,", max_tokens=10)
    print(metrics)


The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
The attention mask and the pad token id were not set. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generati

{'latency': 0.005711685982532799, 'ttft': 0.005710346973501146, 'time_per_token': 0.003279856679728255, 'throughput': 302.61582927922353}


In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel
import random
import torch
import numpy as np

def set_seed(seed=42):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed) 

def generate_code_fix(prompt, model_name="gpt2-xl", max_tokens=20, seed=42):
    """
    Generate the next tokens using GPT-2 XL given a code prompt.

    Args:
        prompt (str): The input prompt to provide to the model.
        model_name (str): The name of the GPT-2 model to use (default is "gpt2-xl").
        max_tokens (int): The number of tokens to generate (default is 100).

    Returns:
        str: The generated text from the model.
    """
    set_seed(seed)
    # Load the tokenizer and model
    tokenizer = GPT2Tokenizer.from_pretrained(model_name)
    model = GPT2LMHeadModel.from_pretrained(model_name)
    model.eval()

    # Set pad_token if not already set
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token

    # Move model to GPU if available
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    model.to(device)

    # Tokenize the input prompt and create the attention mask
    inputs = tokenizer(prompt, return_tensors="pt", padding=True, truncation=True)
    input_ids = inputs["input_ids"].to(device)
    attention_mask = inputs["attention_mask"].to(device)  # Explicit attention mask

    # Generate the next tokens
    output_ids = model.generate(
        input_ids,
        attention_mask=attention_mask,  # Pass attention mask
        max_new_tokens=len(input_ids[0]) + max_tokens,  # Maximum number of tokens to generate
        # do_sample = True,
        pad_token_id=tokenizer.pad_token_id  # Handle padding explicitly
    )

    # Decode the generated tokens to text
    generated_text = tokenizer.decode(output_ids[0], skip_special_tokens=True)

    return generated_text

# code = """def bitcount(n):
#     count = 0
#     while n:
#         n ^= n - 1
#         count += 1
#     return count"""

# prompt = f"Given the following code is incorrect:\n{code}\nCorrected code:"
# result = generate_code_fix(prompt, seed=56)
# print("Generated Text:")
# print(result)

base = 'Dadaji lost his mother and took to living with his maternal uncle Narayan Dhurmaji .'
phrase = 'In the meanwhile ,'
prompt = f"Insert this phrase of '{phrase}' in to this sentence of '{base}'. The inserted sentence is:"
result = generate_code_fix(prompt, seed=56)
print("Generated Text:")
print(result)


Generated Text:
Insert this phrase of 'In the meanwhile,' in to this sentence of 'Dadaji lost his mother and took to living with his maternal uncle Narayan Dhurmaji.'. The inserted sentence is: 'In the meanwhile, Narayan Dhurmaji lost his mother and took to living with his maternal uncle Narayan Dhurmaji.'

The word 'in' is not a part of the sentence. It is inserted by the writer.

The word 'in' is not a


In [26]:
code = """def bitcount(n):
    count = 0
    while n:
        n ^= n - 1
        count += 1
    return count"""

prompt = f"Given this bugged code: \n {code}. \n Fix the code above: "
result = generate_code_fix(prompt)
print("Generated Text:")
print(result)

Generated Text:
Given this bugged code: 
 def bitcount(n):
    count = 0
    while n:
        n ^= n - 1
        count += 1
    return count. 
 Fix the code above: 
def bitcount(n):
      n = int(n)
      n ^= n - 1
      count = 0
      while n:
      n ^= n - 1
      count += 1
      return count. 
The first line of the above program prints "0" to the console. The second line prints "1" to the console. What the program doesn't know is


In [7]:
doc = """An American woman died aboard a cruise ship that docked at Rio de Janeiro on Tuesday, the same ship on which 86 passengers previously fell ill, according to the state-run Brazilian news agency, Agencia Brasil. The American tourist died aboard the MS Veendam, owned by cruise operator Holland America. Federal Police told Agencia Brasil that forensic doctors were investigating her death. The ship's doctors told police that the woman was elderly and suffered from diabetes and hypertension, according the agency. The other passengers came down with diarrhea prior to her death during an earlier part of the trip, the ship's doctors said. The Veendam left New York 36 days ago for a South America tour"""

prompt = f"Extract sentences as summary: \n {doc}. \nExtractive summarization: "

result = generate_code_fix(prompt)
print("Generated Text:")
print(result)

Generated Text:
Extract sentences as summary: 
 An American woman died aboard a cruise ship that docked at Rio de Janeiro on Tuesday, the same ship on which 86 passengers previously fell ill, according to the state-run Brazilian news agency, Agencia Brasil. The American tourist died aboard the MS Veendam, owned by cruise operator Holland America. Federal Police told Agencia Brasil that forensic doctors were investigating her death. The ship's doctors told police that the woman was elderly and suffered from diabetes and hypertension, according the agency. The other passengers came down with diarrhea prior to her death during an earlier part of the trip, the ship's doctors said. The Veendam left New York 36 days ago for a South America tour. 
Extractive summarization: 
The Veendam, a cruise ship owned by Holland America, docked at Rio de Janeiro on Tuesday. The ship's doctors told police that the woman was elderly and suffered from diabetes and hypertension, according to Agencia Brasil. 