In [1]:
import os
# os.environ["CUDA_VISIBLE_DEVICES"] = "3"  # Define GPU id, remove if you want to use all GPUs available
import torch
from tqdm import tqdm
import time
from contextlib import contextmanager
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer

@contextmanager
def timed(wall_times, key):
    start = time.time()
    torch.cuda.synchronize()
    yield
    torch.cuda.synchronize()
    end = time.time()
    elapsed_time = end - start
    wall_times[key].append(elapsed_time)

def forward(input_ids, model, tokenizer, max_steps=512):
    wall_times = {'init': []}

    print("Initializing...")  # Debug statement
    with timed(wall_times, 'init'):
        input_len = input_ids.shape[1]
    
    new_token = 0
    output_ids = input_ids

    for idx in range(max_steps):
        print(f"Step {idx + 1}/{max_steps}...")  # Debug statement
        with torch.no_grad():
            outputs = model(output_ids)
            logits = outputs.logits[:, -1, :]
        
        next_token = torch.argmax(logits, dim=-1)
        output_ids = torch.cat((output_ids, next_token.unsqueeze(0)), dim=1)

        if tokenizer.eos_token_id in output_ids[0, input_len:].tolist():
            break

    print("Finished processing.")  # Debug statement
    return output_ids, new_token, idx, wall_times

# Load the base Vicuna model
model_name = 'lmsys/vicuna-7b-v1.3'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set up the prompt
temperature = 0.0
posterior_threshold = 0.09
posterior_alpha = 0.3
prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Hi, could you share a tale about a charming llama that grows Medusa-like hair and starts its own coffee shop? ASSISTANT:"

# After loading the model
device = model.device  # Get the device of the model

# Ensure input_ids is on the same device as the model
input_ids = tokenizer([prompt]).input_ids
input_tensor = torch.as_tensor(input_ids).to(device)  # Move to the correct device

with torch.inference_mode():
    output_ids, new_token, idx, wall_time = forward(
        input_tensor,
        model,
        tokenizer,
    )

    output_ids = output_ids[0][len(input_ids[0]):]
    print("Output length:", output_ids.size(-1))
    print("Tokens/second:", idx / wall_time['init'][0])


  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.12s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Initializing...
Step 1/512...
Step 2/512...
Step 3/512...
Step 4/512...
Step 5/512...
Step 6/512...
Step 7/512...
Step 8/512...
Step 9/512...
Step 10/512...
Step 11/512...
Step 12/512...
Step 13/512...
Step 14/512...
Step 15/512...
Step 16/512...
Step 17/512...
Step 18/512...
Step 19/512...
Step 20/512...
Step 21/512...


KeyboardInterrupt: 

In [2]:
import os
import torch
from tqdm import tqdm
import time
from contextlib import contextmanager
from transformers import AutoModelForCausalLM, AutoTokenizer

@contextmanager
def timed():
    start = time.time()
    yield
    end = time.time()
    elapsed_time = end - start
    return elapsed_time

def forward(input_ids, model, tokenizer, max_steps=512):
    new_token = 0
    output_ids = input_ids

    for idx in range(max_steps):
        print(f"Step {idx + 1}/{max_steps}...")  # Debug statement
        with torch.no_grad():
            outputs = model(output_ids)
            logits = outputs.logits[:, -1, :]
        
        next_token = torch.argmax(logits, dim=-1)
        output_ids = torch.cat((output_ids, next_token.unsqueeze(0)), dim=1)

        if tokenizer.eos_token_id in output_ids[0, input_ids.shape[1]:].tolist():
            break

    print("Finished processing.")  # Debug statement
    return output_ids, new_token, idx

# Load the base Vicuna model
model_name = 'lmsys/vicuna-7b-v1.3'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set up the prompt
prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Hi, could you share a tale about a charming llama that grows Medusa-like hair and starts its own coffee shop? ASSISTANT:"

# After loading the model
device = model.device  # Get the device of the model

# Ensure input_ids is on the same device as the model
input_ids = tokenizer([prompt]).input_ids
input_tensor = torch.as_tensor(input_ids).to(device)  # Move to the correct device

# Measure the total processing time
with torch.inference_mode():
    processing_time = timed()
    output_ids, new_token, idx = forward(
        input_tensor,
        model,
        tokenizer,
    )
    elapsed_time = processing_time.__enter__()
    processing_time.__exit__(None, None, None)

    output_ids = output_ids[0][len(input_ids[0]):]
    print("Output length:", output_ids.size(-1))
    tokens_per_second = idx / elapsed_time if elapsed_time > 0 else 0
    print("Tokens/second:", tokens_per_second)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

: 

In [7]:
import os
import torch
from tqdm import tqdm
import time
from contextlib import contextmanager
from transformers import AutoModelForCausalLM, AutoTokenizer

@contextmanager
def timed():
    start = time.time()
    yield
    end = time.time()
    elapsed_time = end - start
    return elapsed_time

def forward(input_ids, model, tokenizer, max_steps=512):
    new_token = 0
    output_ids = input_ids

    for idx in range(max_steps):
        print(f"Step {idx + 1}/{max_steps}...")  # Debug statement
        with torch.no_grad():
            outputs = model(output_ids)
            logits = outputs.logits[:, -1, :]
        
        next_token = torch.argmax(logits, dim=-1)
        output_ids = torch.cat((output_ids, next_token.unsqueeze(0)), dim=1)

        if tokenizer.eos_token_id in output_ids[0, input_ids.shape[1]:].tolist():
            break

    print("Finished processing.")  # Debug statement
    return output_ids, new_token, idx

# Load the base Vicuna model
model_name = 'lmsys/vicuna-7b-v1.3'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set up the prompt
prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Hi, could you share a tale about a charming llama that grows Medusa-like hair and starts its own coffee shop? ASSISTANT:"

# After loading the model
device = model.device  # Get the device of the model

# Ensure input_ids is on the same device as the model
input_ids = tokenizer([prompt]).input_ids
input_tensor = torch.as_tensor(input_ids).to(device)  # Move to the correct device

# Measure the total processing time
with torch.inference_mode():
    processing_time = timed()
    output_ids, new_token, idx = forward(
        input_tensor,
        model,
        
        tokenizer,
    )
    elapsed_time = processing_time.__enter__()
    processing_time.__exit__(None, None, None)

    output_ids = output_ids[0][len(input_ids[0]):]
    print("Output length:", output_ids.size(-1))
    tokens_per_second = idx / elapsed_time if elapsed_time > 0 else 0
    print("Tokens/second:", tokens_per_second)


Loading checkpoint shards: 100%|██████████| 2/2 [00:09<00:00,  4.58s/it]


Step 1/512...


KeyboardInterrupt: 

In [2]:
import torch
import time
from transformers import AutoModelForCausalLM, AutoTokenizer

def forward(input_ids, model, tokenizer, max_steps=512):
    new_token = 0
    output_ids = input_ids.clone()  # Ensure we're modifying a copy

    for idx in range(max_steps):
        print(f"Step {idx + 1}/{max_steps}...")  # Debug statement
        with torch.no_grad():
            outputs = model(output_ids)
            logits = outputs.logits[:, -1, :]
        
        next_token = torch.argmax(logits, dim=-1)  # Greedy decoding
        output_ids = torch.cat((output_ids, next_token.unsqueeze(0)), dim=1)

        if tokenizer.eos_token_id in output_ids[0, input_ids.shape[1]:].tolist():
            break

    print("Finished processing.")  # Debug statement
    return output_ids, idx

# Load the base Vicuna model
model_name = 'lmsys/vicuna-7b-v1.3'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set up the prompt
prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Hi, could you share a tale about a charming llama that grows Medusa-like hair and starts its own coffee shop? ASSISTANT:"

# Ensure input_ids is on the same device as the model
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(model.device)  # Move to the correct device

# Measure the total processing time
start_time = time.time()
with torch.inference_mode():
    output_ids, idx = forward(input_ids, model, tokenizer)

elapsed_time = time.time() - start_time

output_ids = output_ids[0][len(input_ids[0]):]
print("Output length:", output_ids.size(-1))
tokens_per_second = idx / elapsed_time if elapsed_time > 0 else 0
print("Tokens/second:", tokens_per_second)


Loading checkpoint shards: 100%|██████████| 2/2 [00:05<00:00,  2.73s/it]
Some parameters are on the meta device because they were offloaded to the cpu.


Step 1/512...
Step 2/512...
Step 3/512...
Step 4/512...
Step 5/512...
Step 6/512...
Step 7/512...
Step 8/512...
Step 9/512...
Step 10/512...
Step 11/512...
Step 12/512...
Step 13/512...
Step 14/512...
Step 15/512...
Step 16/512...
Step 17/512...
Step 18/512...
Step 19/512...
Step 20/512...
Step 21/512...
Step 22/512...
Step 23/512...
Step 24/512...
Step 25/512...
Step 26/512...
Step 27/512...
Step 28/512...
Step 29/512...
Step 30/512...
Step 31/512...
Step 32/512...
Step 33/512...
Step 34/512...
Step 35/512...
Step 36/512...
Step 37/512...
Step 38/512...
Step 39/512...
Step 40/512...
Step 41/512...
Step 42/512...
Step 43/512...
Step 44/512...
Step 45/512...
Step 46/512...
Step 47/512...
Step 48/512...
Step 49/512...
Step 50/512...
Step 51/512...
Step 52/512...
Step 53/512...
Step 54/512...
Step 55/512...
Step 56/512...
Step 57/512...
Step 58/512...
Step 59/512...
Step 60/512...
Step 61/512...
Step 62/512...
Step 63/512...
Step 64/512...
Step 65/512...
Step 66/512...
Step 67/512...
Step

In [1]:
import torch
import time
import numpy as np
from transformers import AutoModelForCausalLM, AutoTokenizer

def forward(input_ids, model, tokenizer, max_steps=512):
    new_token = 0
    output_ids = input_ids.clone()  # Ensure we're modifying a copy
    wall_times = {'init': [], 'medusa': [], 'tree': [], 'posterior': [], 'update': []}

    for idx in range(max_steps):
        print(f"Step {idx + 1}/{max_steps}...")  # Debug statement
        step_start = time.time()
        
        with torch.no_grad():
            outputs = model(output_ids)
            logits = outputs.logits[:, -1, :]
        
        next_token = torch.argmax(logits, dim=-1)  # Greedy decoding
        output_ids = torch.cat((output_ids, next_token.unsqueeze(0)), dim=1)

        step_time = time.time() - step_start
        wall_times['init'].append(step_time)  # Example: replace with actual timing for specific parts

        if tokenizer.eos_token_id in output_ids[0, input_ids.shape[1]:].tolist():
            break

    print("Finished processing.")  # Debug statement
    return output_ids, new_token, idx, wall_times

# Load the base Vicuna model
model_name = 'lmsys/vicuna-7b-v1.3'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set up the prompt
prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Hi, could you share a tale about a charming llama that grows Medusa-like hair and starts its own coffee shop? ASSISTANT:"

# Ensure input_ids is on the same device as the model
input_ids = tokenizer(prompt, return_tensors='pt').input_ids.to(model.device)  # Move to the correct device

# Measure the total processing time
start_time = time.time()
with torch.inference_mode():
    output_ids, new_token, idx, wall_times = forward(input_ids, model, tokenizer)

elapsed_time = time.time() - start_time

# Decode the output
output = tokenizer.decode(output_ids[0], skip_special_tokens=True)

# Print output and timing information
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

# Calculate wall times
time_init = np.sum(wall_times['init'])
# Placeholder for other timing metrics; replace with actual timing data if available
time_medusa = 0  
time_tree = 0  
time_posterior = 0  
time_update = 0  
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

# Print timing results
print('=' * max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-' * max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total if time_total > 0 else 0, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total if time_total > 0 else 0, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total if time_total > 0 else 0, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total if time_total > 0 else 0, max_length))
print('-' * max_length)
print(format_string("Tokens/second: ", new_token / elapsed_time if elapsed_time > 0 else 0, max_length))
print('=' * max_length)

# Print the decoded output
print("Decoded output:", output)


  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.08s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Step 1/512...
Step 2/512...
Step 3/512...
Step 4/512...
Step 5/512...
Step 6/512...
Step 7/512...
Step 8/512...
Step 9/512...
Step 10/512...
Step 11/512...
Step 12/512...
Step 13/512...
Step 14/512...
Step 15/512...
Step 16/512...
Step 17/512...
Step 18/512...
Step 19/512...
Step 20/512...
Step 21/512...
Step 22/512...
Step 23/512...
Step 24/512...
Step 25/512...
Step 26/512...
Step 27/512...
Step 28/512...
Step 29/512...
Step 30/512...
Step 31/512...
Step 32/512...
Step 33/512...
Step 34/512...
Step 35/512...
Step 36/512...
Step 37/512...
Step 38/512...
Step 39/512...
Step 40/512...
Step 41/512...
Step 42/512...
Step 43/512...
Step 44/512...
Step 45/512...
Step 46/512...
Step 47/512...
Step 48/512...
Step 49/512...
Step 50/512...
Step 51/512...
Step 52/512...
Step 53/512...
Step 54/512...
Step 55/512...
Step 56/512...
Step 57/512...
Step 58/512...
Step 59/512...
Step 60/512...
Step 61/512...
Step 62/512...
Step 63/512...
Step 64/512...
Step 65/512...
Step 66/512...
Step 67/512...
Step

In [2]:
def forward(input_ids, model, tokenizer, max_steps=512):
    new_token = 0
    output_ids = input_ids.clone()  # Ensure we're modifying a copy
    wall_times = {'init': [], 'medusa': [], 'tree': [], 'posterior': [], 'update': []}

    for idx in range(max_steps):
        print(f"Step {idx + 1}/{max_steps}...")  # Debug statement
        step_start = time.time()
        
        with torch.no_grad():
            outputs = model(output_ids)
            logits = outputs.logits[:, -1, :]
        
        next_token = torch.argmax(logits, dim=-1)  # Greedy decoding
        output_ids = torch.cat((output_ids, next_token.unsqueeze(0)), dim=1)
        
        # Increment new_token for each generated token
        new_token += 1

        step_time = time.time() - step_start
        wall_times['init'].append(step_time)  # Example: replace with actual timing for specific parts

        if tokenizer.eos_token_id in output_ids[0, input_ids.shape[1]:].tolist():
            break

    print("Finished processing.")  # Debug statement
    return output_ids, new_token, idx, wall_times

# After calling forward
tokens_per_second = new_token / elapsed_time if new_token > 0 and elapsed_time > 0 else 0


In [1]:
import os
import torch
import numpy as np
from tqdm import tqdm
import time
from contextlib import contextmanager
from transformers import AutoModelForCausalLM, AutoTokenizer

@contextmanager
def timed():
    start = time.time()
    yield
    end = time.time()
    elapsed_time = end - start
    return elapsed_time

def forward(input_ids, model, tokenizer, max_steps=512):
    new_token = 0
    output_ids = input_ids.clone()  # Make a copy to avoid modifying the original

    for idx in range(max_steps):
        print(f"Step {idx + 1}/{max_steps}...")  # Debug statement
        with torch.no_grad():
            outputs = model(output_ids)
            logits = outputs.logits[:, -1, :]
        
        next_token = torch.argmax(logits, dim=-1)
        output_ids = torch.cat((output_ids, next_token.unsqueeze(0)), dim=1)

        # Increment new_token for each generated token
        new_token += 1

        if tokenizer.eos_token_id in output_ids[0, input_ids.shape[1]:].tolist():
            break

    print("Finished processing.")  # Debug statement
    return output_ids, new_token, idx

# Load the base Vicuna model
model_name = 'lmsys/vicuna-7b-v1.3'
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto"
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Set up the prompt
prompt = "A chat between a curious user and an artificial intelligence assistant. The assistant gives helpful, detailed, and polite answers to the user's questions. USER: Hi, could you share a tale about a charming llama that grows Medusa-like hair and starts its own coffee shop? ASSISTANT:"

# Ensure input_ids is on the same device as the model
input_ids = tokenizer([prompt]).input_ids
input_tensor = torch.as_tensor(input_ids).to(model.device)  # Move to the correct device

# Measure the total processing time
with torch.inference_mode():
    processing_time = timed()
    output_ids, new_token, idx = forward(
        input_tensor,
        model,
        tokenizer,
    )
    elapsed_time = processing_time.__enter__()
    processing_time.__exit__(None, None, None)

    output_ids = output_ids[0][len(input_ids[0]):]
    print("Output length:", output_ids.size(-1))
    tokens_per_second = new_token / elapsed_time if new_token > 0 and elapsed_time > 0 else 0
    print("Tokens/second:", tokens_per_second)

# Decode the output
output = tokenizer.decode(
    output_ids,
    spaces_between_special_tokens=False,
)
print("Decoded output:", output)

# Optional: Print wall times if you want to measure specific parts
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

# Example wall times (replace with actual values if needed)
time_init = 236.753
time_medusa = 0.0
time_tree = 0.0
time_posterior = 0.0
time_update = 0.0
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('=' * max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-' * max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-' * max_length)
print(format_string("Tokens/second: ", tokens_per_second, max_length))
print('=' * max_length)


  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(
  return torch.load(checkpoint_file, map_location=map_location)
Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.01s/it]
Some parameters are on the meta device because they were offloaded to the cpu.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


Step 1/512...
Step 2/512...
Step 3/512...
Step 4/512...
Step 5/512...
Step 6/512...
Step 7/512...
Step 8/512...
Step 9/512...
Step 10/512...
Step 11/512...
Step 12/512...
Step 13/512...
Step 14/512...
Step 15/512...
Step 16/512...
Step 17/512...
Step 18/512...
Step 19/512...
Step 20/512...
Step 21/512...
Step 22/512...
Step 23/512...
Step 24/512...
Step 25/512...
Step 26/512...
Step 27/512...
Step 28/512...
Step 29/512...
Step 30/512...
Step 31/512...
Step 32/512...
Step 33/512...
Step 34/512...
Step 35/512...
Step 36/512...
Step 37/512...
Step 38/512...
Step 39/512...
Step 40/512...
Step 41/512...
Step 42/512...
Step 43/512...
Step 44/512...
Step 45/512...
Step 46/512...
Step 47/512...
Step 48/512...
Step 49/512...
Step 50/512...
Step 51/512...
Step 52/512...
Step 53/512...
Step 54/512...
Step 55/512...
Step 56/512...
Step 57/512...
Step 58/512...
Step 59/512...
Step 60/512...
Step 61/512...
Step 62/512...
Step 63/512...
Step 64/512...
Step 65/512...
Step 66/512...
Step 67/512...
Step

TypeError: '>' not supported between instances of 'NoneType' and 'int'