In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
from tqdm import tqdm
import time
from contextlib import contextmanager
import numpy as np
from medusa.model.modeling_llama_kv import LlamaForCausalLM as KVLlamaForCausalLM
from medusa.model.medusa_model import MedusaModel, MedusaConfig
from medusa.model.kv_cache import *
from medusa.model.utils import *
from medusa.model.medusa_choices import *
import transformers
from huggingface_hub import hf_hub_download

  from .autonotebook import tqdm as notebook_tqdm
  torch.utils._pytree._register_pytree_node(
  torch.utils._pytree._register_pytree_node(


In [2]:
@contextmanager
def timed(wall_times, key):
    start = time.time()
    torch.cuda.synchronize()
    yield
    torch.cuda.synchronize()
    end = time.time()
    elapsed_time = end - start
    wall_times[key].append(elapsed_time)

def medusa_forward(input_ids, model, tokenizer, medusa_choices, temperature, posterior_threshold, posterior_alpha, max_steps = 512):
    wall_times = {'medusa': [], 'tree': [], 'posterior': [], 'update': [], 'init': []}
    
    with timed(wall_times, 'init'):
        if hasattr(model, "medusa_choices") and model.medusa_choices == medusa_choices:
            medusa_buffers = model.medusa_buffers
        else:
            medusa_buffers = generate_medusa_buffers(
                medusa_choices, device=model.base_model.device
            )
        model.medusa_buffers = medusa_buffers
        model.medusa_choices = medusa_choices

        if hasattr(model, "past_key_values"):
            past_key_values = model.past_key_values
            past_key_values_data = model.past_key_values_data
            current_length_data = model.current_length_data
            current_length_data.zero_()
        else:
            (
                past_key_values,
                past_key_values_data,
                current_length_data,
            ) = initialize_past_key_values(model.base_model)
            model.past_key_values = past_key_values
            model.past_key_values_data = past_key_values_data
            model.current_length_data = current_length_data

        input_len = input_ids.shape[1]
        reset_medusa_mode(model)
        medusa_logits, logits = initialize_medusa(
                input_ids, model, medusa_buffers["medusa_attn_mask"], past_key_values
        )
    new_token = 0

    for idx in range(max_steps): 
        with timed(wall_times, 'medusa'):
            candidates, tree_candidates = generate_candidates(
                    medusa_logits,
                    logits,
                    medusa_buffers["tree_indices"],
                    medusa_buffers["retrieve_indices"],
                )

        with timed(wall_times, 'tree'):
            medusa_logits, logits, outputs = tree_decoding(
                    model,
                    tree_candidates,
                    past_key_values,
                    medusa_buffers["medusa_position_ids"],
                    input_ids,
                    medusa_buffers["retrieve_indices"],
                )

        with timed(wall_times, 'posterior'):
            best_candidate, accept_length = evaluate_posterior(
                    logits, candidates, temperature, posterior_threshold, posterior_alpha
                )
        
        with timed(wall_times, 'update'):
            input_ids, logits, medusa_logits, new_token = update_inference_inputs(
                    input_ids,
                    candidates,
                    best_candidate,
                    accept_length,
                    medusa_buffers["retrieve_indices"],
                    outputs,
                    logits,
                    medusa_logits,
                    new_token,
                    past_key_values_data,
                    current_length_data,
                )

        if tokenizer.eos_token_id in input_ids[0, input_len:].tolist():
            break

    return input_ids, new_token, idx, wall_times


In [3]:
model_name = 'FasterDecoding/medusa-vicuna-7b-v1.3'

config = MedusaConfig.from_pretrained(
    model_name,
    medusa_num_heads=4,
    medusa_num_layers=1,
)

model = MedusaModel.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto"
)
tokenizer = model.get_tokenizer()

medusa_choices = mc_sim_7b_63



You are using a model of type llama to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
You are using a model of type llama to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  return torch.load(checkpoint_file, map_location=map_location)
Loading checkpoint shards: 100%|██████████| 2/2 [00:08<00:00,  4.28s/it]
Some weights of MedusaModelLlama were not initialized from the model checkpoint at lmsys/vicuna-7b-

In [4]:
temperature = 0.
posterior_threshold = 0.09
posterior_alpha = 0.3

prompt1

In [6]:
prompt = "Scenario:You are a customer service agent for an e-commerce platform. A customer complains:'I ordered a smartphone, but the package contained headphones instead. This is unacceptable!' Question:How would you respond to calm the customer and resolve the issue?"

In [7]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)

Output length: 214
Compression ratio: tensor(1.0047, device='cuda:0')


In [8]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)



Answer:

1. Empathize with the customer: "I'm sorry to hear that you received the wrong item. I understand how frustrating this must be for you."
2. Gather information: "Can you please confirm the order number and the correct item you ordered?"
3. Apologize and assure the customer: "I apologize for the inconvenience this has caused. We will definitely take care of this issue for you."
4. Offer a solution: "We will send a replacement for the correct item as soon as possible. Once it's shipped, please allow 5-7 business days for delivery."
5. Thank the customer for their patience: "Thank you for your understanding, and please let us know if there's anything else we can do to assist you."
6. Follow up: "We will keep you updated on the status of your replacement order. If you have any further concerns, please don't hesitate to reach out."</s>


In [9]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              1.484
Wall time medusa:                            0.120
Wall time Tree:                            132.558
Wall time Posterior:                         0.111
Wall time Update:                            0.149
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.986
Wall time portion Posterior:                 0.001
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.592


prompt2

In [5]:
prompt = "Scenario:You are a project manager negotiating a deadline extension with a client. The client insists the project must be delivered by Friday, but your team needs an extra week."

In [6]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)

Output length: 346
Compression ratio: tensor(1.0029, device='cuda:0')


In [7]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)

How do you handle this situation?

Answer:

1. I would explain the situation to the client and provide a detailed plan for how we can meet the new deadline.
2. I would try to convince the client to accept the original deadline, even if it means working longer hours.
3. I would try to negotiate a compromise, such as delivering the project by Friday but with some compromises on the quality or scope of the work.
4. I would ignore the client's request and continue working on the project as originally planned.
5. I would prioritize the client's needs and deliver the project by Friday, even if it means compromising on quality or scope.

Correct Answer: 3. I would try to negotiate a compromise, such as delivering the project by Friday but with some compromises on the quality or scope of the work.

Explanation: As a project manager, it is important to balance the needs of the client with the needs of the project team. In this situation, it is important to try to negotiate a compromise that mee

In [8]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              1.503
Wall time medusa:                            0.240
Wall time Tree:                            216.017
Wall time Posterior:                         0.146
Wall time Update:                            0.208
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.990
Wall time portion Posterior:                 0.001
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.586


prompt-3

In [10]:
prompt = "Scenario: Two employees in your team are arguing because one feels the other is not contributing equally to a shared task. Question: How would you mediate the situation to ensure both employees feel heard and the task gets completed on time?"

In [11]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)

Output length: 94
Compression ratio: tensor(1.0217, device='cuda:0')


In [12]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)



Answer: As a project manager, I would first try to understand the root cause of the conflict. I would then facilitate a discussion between the two employees to ensure both parties feel heard and understood. I would encourage them to express their concerns and work together to find a solution that is fair and equitable for both parties. I would also ensure that the task is completed on time and that the team is working effectively towards the project's goals.</s>


In [13]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              0.953
Wall time medusa:                            0.049
Wall time Tree:                             57.655
Wall time Posterior:                         0.028
Wall time Update:                            0.064
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.981
Wall time portion Posterior:                 0.000
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.600


prompt4

In [14]:
prompt = "Scenario: You are being interviewed for a product management role. The interviewer asks: 'Tell me about a time when you had to make a tough decision with limited data.' Question: How would you respond to demonstrate problem-solving and decision-making skills?"

In [15]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)

Output length: 220
Compression ratio: tensor(1.0092, device='cuda:0')


In [16]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)



Answer: In my previous role, we were working on a new feature for our product that was critical to our users. However, we had limited data to make a decision on which direction to take. I decided to conduct a small user survey to gather more insights. I created a survey with a few questions that would help us understand our users' needs and preferences. I distributed the survey to a small group of users and analyzed the results.

The survey results showed that our users were split between two options. I then used the data to make a decision that would benefit the majority of our users. I presented the results to the team and we decided to go with the option that would benefit the most users.

This decision was not easy, but it was necessary to ensure that we were meeting the needs of our users. By using data to make a decision, we were able to make a decision that was in the best interest of our users. This experience taught me the importance of using data to make decisions, even whe

In [17]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              0.967
Wall time medusa:                            0.129
Wall time Tree:                            139.002
Wall time Posterior:                         0.077
Wall time Update:                            0.044
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.991
Wall time portion Posterior:                 0.001
Wall time portion Update:                    0.000
--------------------------------------------------
Tokens/second:                               1.569


prompt5

In [18]:
prompt="Scenario: A friend tells you:'I’ve been feeling overwhelmed with my work and studies lately. I don’t know how to manage everything.' Question: What advice or support would you offer to your friend?"

In [19]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)


Output length: 235
Compression ratio: tensor(1.0043, device='cuda:0')


In [20]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)



Answer: I would offer my friend the following advice and support:

1. Encourage them to take a break: It's important for your friend to take a break from their work and studies to avoid burnout. I would suggest they take a short walk, do some light exercise, or engage in a hobby they enjoy.
2. Help them prioritize their tasks: I would suggest they make a to-do list and prioritize their tasks based on their importance and urgency. This will help them focus on the most important tasks and avoid feeling overwhelmed.
3. Offer to help them: I would offer to help my friend by taking on some of their tasks or providing them with additional resources to help them manage their workload.
4. Encourage them to seek professional help: If my friend's feelings of overwhelm persist, I would encourage them to seek professional help from a counselor or therapist.

Overall, my advice would be to focus on self-care, prioritize tasks, and seek support when needed.</s>


In [21]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              0.832
Wall time medusa:                            0.112
Wall time Tree:                            146.780
Wall time Posterior:                         0.064
Wall time Update:                            0.220
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.992
Wall time portion Posterior:                 0.000
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.588


prompt6

In [22]:

prompt = "Scenario: You are a team lead, and one of your team members tells you, 'I feel like my ideas are not being valued in meetings, and it's frustrating me.' Question: How would you respond to the team member to address their concerns and foster a more inclusive environment?"

In [23]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)


Output length: 415
Compression ratio: tensor(1.0024, device='cuda:0')


In [24]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)



Answer:

1. Active listening: "I appreciate you sharing your feelings with me. It's important to create an environment where everyone's ideas are valued and heard."
2. Validate their experience: "I understand that it can be frustrating when your ideas aren't being considered. Let's work together to find a solution that ensures everyone's contributions are taken into account."
3. Address the issue: "I will make sure to encourage the team to actively listen to and consider all ideas, including yours. Let's work together to create a more inclusive environment."
4. Offer support: "I'm here to support you and help you feel heard. Let's discuss ways to improve the situation and ensure your ideas are valued."
5. Set expectations: "I expect all team members to be heard and valued in our meetings. Let's work together to create a more inclusive environment where everyone's ideas are considered."
6. Follow up: "I'll check in with you regularly to ensure the situation is improving and that your 

In [25]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              0.907
Wall time medusa:                            0.248
Wall time Tree:                            264.237
Wall time Posterior:                         0.157
Wall time Update:                            0.396
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.994
Wall time portion Posterior:                 0.001
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.560


prompt7

In [30]:
prompt = "You are a manager giving constructive feedback to an employee who missed several deadlines in the past month. Question: How would you approach the conversation to motivate the employee while addressing the missed deadlines?"

In [31]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)


Output length: 217
Compression ratio: tensor(1.0046, device='cuda:0')


In [32]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)



Answer: To motivate the employee while addressing the missed deadlines, I would approach the conversation by first acknowledging their efforts and accomplishments. Then, I would explain the impact of the missed deadlines on the team and the organization, and how it affects the overall productivity and efficiency. I would also provide specific examples of how their missed deadlines have affected the team and the project.

Next, I would discuss the expectations and goals for the employee and the team, and how meeting deadlines is crucial to achieving those goals. I would also provide specific examples of how meeting deadlines can benefit the employee and the team.

Finally, I would provide specific and actionable feedback on how the employee can improve their performance and meet future deadlines. I would also provide resources and support to help the employee improve their skills and knowledge.

Overall, my approach would be to provide constructive feedback that is specific, actionabl

In [33]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              0.924
Wall time medusa:                            0.129
Wall time Tree:                            135.524
Wall time Posterior:                         0.134
Wall time Update:                            0.189
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.990
Wall time portion Posterior:                 0.001
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.585
