In [9]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"
import torch
from tqdm import tqdm
import time
from contextlib import contextmanager
import numpy as np
from medusa.model.modeling_llama_kv import LlamaForCausalLM as KVLlamaForCausalLM
from medusa.model.medusa_model import MedusaModel, MedusaConfig
from medusa.model.kv_cache import *
from medusa.model.utils import *
from medusa.model.medusa_choices import *
import transformers
from huggingface_hub import hf_hub_download

In [10]:
@contextmanager
def timed(wall_times, key):
    start = time.time()
    torch.cuda.synchronize()
    yield
    torch.cuda.synchronize()
    end = time.time()
    elapsed_time = end - start
    wall_times[key].append(elapsed_time)

def medusa_forward(input_ids, model, tokenizer, medusa_choices, temperature, posterior_threshold, posterior_alpha, max_steps = 512):
    wall_times = {'medusa': [], 'tree': [], 'posterior': [], 'update': [], 'init': []}
    
    with timed(wall_times, 'init'):
        if hasattr(model, "medusa_choices") and model.medusa_choices == medusa_choices:
            medusa_buffers = model.medusa_buffers
        else:
            medusa_buffers = generate_medusa_buffers(
                medusa_choices, device=model.base_model.device
            )
        model.medusa_buffers = medusa_buffers
        model.medusa_choices = medusa_choices

        if hasattr(model, "past_key_values"):
            past_key_values = model.past_key_values
            past_key_values_data = model.past_key_values_data
            current_length_data = model.current_length_data
            current_length_data.zero_()
        else:
            (
                past_key_values,
                past_key_values_data,
                current_length_data,
            ) = initialize_past_key_values(model.base_model)
            model.past_key_values = past_key_values
            model.past_key_values_data = past_key_values_data
            model.current_length_data = current_length_data

        input_len = input_ids.shape[1]
        reset_medusa_mode(model)
        medusa_logits, logits = initialize_medusa(
                input_ids, model, medusa_buffers["medusa_attn_mask"], past_key_values
        )
    new_token = 0

    for idx in range(max_steps): 
        with timed(wall_times, 'medusa'):
            candidates, tree_candidates = generate_candidates(
                    medusa_logits,
                    logits,
                    medusa_buffers["tree_indices"],
                    medusa_buffers["retrieve_indices"],
                )

        with timed(wall_times, 'tree'):
            medusa_logits, logits, outputs = tree_decoding(
                    model,
                    tree_candidates,
                    past_key_values,
                    medusa_buffers["medusa_position_ids"],
                    input_ids,
                    medusa_buffers["retrieve_indices"],
                )

        with timed(wall_times, 'posterior'):
            best_candidate, accept_length = evaluate_posterior(
                    logits, candidates, temperature, posterior_threshold, posterior_alpha
                )
        
        with timed(wall_times, 'update'):
            input_ids, logits, medusa_logits, new_token = update_inference_inputs(
                    input_ids,
                    candidates,
                    best_candidate,
                    accept_length,
                    medusa_buffers["retrieve_indices"],
                    outputs,
                    logits,
                    medusa_logits,
                    new_token,
                    past_key_values_data,
                    current_length_data,
                )

        if tokenizer.eos_token_id in input_ids[0, input_len:].tolist():
            break

    return input_ids, new_token, idx, wall_times


In [11]:
model_name = 'FasterDecoding/medusa-vicuna-7b-v1.3'

config = MedusaConfig.from_pretrained(
    model_name,
    medusa_num_heads=4,
    medusa_num_layers=1,
)

model = MedusaModel.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    low_cpu_mem_usage=True,
    device_map="auto"
)
tokenizer = model.get_tokenizer()

medusa_choices = mc_sim_7b_63



You are using a model of type llama to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
You are using a model of type llama to instantiate a model of type . This is not supported for all configurations of models and can yield errors.
You are using the default legacy behaviour of the <class 'transformers.models.llama.tokenization_llama.LlamaTokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thouroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565
  return torch.load(checkpoint_file, map_location=map_location)
Loading checkpoint shards: 100%|██████████| 2/2 [00:12<00:00,  6.38s/it]
Some weights of MedusaModelLlama were not initialized from the model checkpoint at lmsys/vicuna-7b-

In [12]:
temperature = 0.
posterior_threshold = 0.09
posterior_alpha = 0.3

prompt1

In [21]:
prompt = "Describe a bustling city street during the morning rush hour."

In [22]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)

Output length: 164
Compression ratio: tensor(1.0061, device='cuda:0')


In [23]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)


The bustling city street during the morning rush hour is a chaotic and noisy place. The air is thick with the smell of exhaust fumes and the sound of honking cars fills the air. People are rushing to and fro, hurrying to get to work on time. The street is crowded with people, cars, buses, and taxis, all jostling for space. The noise is deafening, with the sound of car horns, shouting, and the clanging of construction equipment. The street is a blur of movement, with people walking quickly and cars zooming by at high speeds. Despite the chaos, there is a sense of energy and purpose in the air, as people work towards their goals for the day.</s>


In [24]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              1.004
Wall time medusa:                            0.090
Wall time Tree:                            101.641
Wall time Posterior:                         0.096
Wall time Update:                            0.112
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.987
Wall time portion Posterior:                 0.001
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.593


prompt2

In [17]:
prompt = "Write a short story about someone who discovers a hidden talent by accident."

In [18]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)

Output length: 512
Compression ratio: tensor(1.0020, device='cuda:0')


In [19]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)


Write a short story about a person who discovers a hidden talent by accident.
Write a short story about a person who discovers a hidden talent by accident.
Write a short story about a person who discovers a hidden talent by accident.
Write a short story about a person who discovers a hidden talent by accident.
Write a short story about a person who discovers a hidden talent by accident.
Write a short story about a person who discovers a hidden talent by accident.
Write a short story about a person who discovers a hidden talent by accident.
Write a short story about a person who discovers a hidden talent by accident.
Write a short story about a person who discovers a hidden talent by accident.
Write a short story about a person who discovers a hidden talent by accident.
Write a short story about a person who discovers a hidden talent by accident.
Write a short story about a person who discovers a hidden talent by accident.
Write a short story about a person who discovers a hidden talen

In [20]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              1.042
Wall time medusa:                            0.274
Wall time Tree:                            311.708
Wall time Posterior:                         0.175
Wall time Update:                            0.422
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.994
Wall time portion Posterior:                 0.001
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.633


prompt-3

In [29]:
prompt = "Describe a quiet beach at sunrise, with the sounds of waves and seagulls."

In [30]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)

Output length: 261
Compression ratio: tensor(1.0077, device='cuda:0')


In [31]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)



The beach is a quiet place, with a gentle slope that leads down to the water. The sand is soft and warm underfoot, and the air is filled with the scent of salt and seaweed. The sun is just beginning to rise, casting a warm golden light over the scene.

In the distance, you can hear the sound of waves crashing against the shore. The rhythmic sound is soothing and calming, and it helps to put you at ease. As you listen, you can see the waves rolling in, one after another, until they reach the shore and break into a million tiny pieces.

The seagulls are the other sound that fills the air. They are circling overhead, their cries echoing across the beach. Some of them are perched on the rocks, watching the waves with interest. Others are diving into the water, their bodies gliding through the waves with ease.

As you stand there, taking in the scene, you realize that this is a moment of peace and tranquility. The world around you is still and quiet, and you feel at one with nature. It's 

In [32]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              1.109
Wall time medusa:                            0.083
Wall time Tree:                            162.452
Wall time Posterior:                         0.127
Wall time Update:                            0.170
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.991
Wall time portion Posterior:                 0.001
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.592


prompt4

In [9]:
prompt = "Explain how blockchain technology ensures data security."

In [10]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)

Output length: 193
Compression ratio: tensor(1.0052, device='cuda:0')


In [11]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)


Blockchain technology is a decentralized digital ledger that records transactions across a network of computers. Each block in the chain contains a cryptographic hash of the previous block, which creates a secure and tamper-proof chain of data.
The decentralized nature of blockchain technology means that there is no single point of failure, and the data is stored across a network of computers. This makes it difficult for hackers to compromise the system, as they would need to hack into multiple computers to alter the data.
Furthermore, the use of cryptographic algorithms and consensus mechanisms in blockchain technology ensures that any changes to the data are verified by the network, making it difficult for hackers to manipulate the data without being detected.
In summary, blockchain technology ensures data security by providing a decentralized and tamper-proof digital ledger that is resistant to hacking and manipulation.</s>


In [12]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              0.964
Wall time medusa:                            0.036
Wall time Tree:                            119.497
Wall time Posterior:                         0.066
Wall time Update:                            0.166
--------------------------------------------------
Wall time portion medusa:                    0.000
Wall time portion Tree:                      0.990
Wall time portion Posterior:                 0.001
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.599


prompt5

In [13]:
prompt="Compare and contrast remote work with office-based work."

In [14]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)


Output length: 355
Compression ratio: tensor(1.0057, device='cuda:0')


In [15]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)


Remote work is a type of work that is done outside of a traditional office setting. This can include working from home, a coffee shop, or any other location that is not a traditional office. On the other hand, office-based work is done in a traditional office setting, typically during regular business hours.
One of the main advantages of remote work is the flexibility it offers. With remote work, employees can set their own schedules and work from anywhere, as long as they have a reliable internet connection. This can be especially beneficial for parents who need to balance work and family responsibilities, or for people who have mobility issues that make it difficult to commute to an office.
Office-based work, on the other hand, typically requires a more structured schedule and a physical presence in the office. This can be beneficial for employees who need to collaborate with colleagues in person, or who need to be available to answer phone calls or emails during regular business ho

In [21]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              0.832
Wall time medusa:                            0.112
Wall time Tree:                            146.780
Wall time Posterior:                         0.064
Wall time Update:                            0.220
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.992
Wall time portion Posterior:                 0.000
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.588


prompt6

In [16]:

prompt = "Write a step-by-step guide on how to bake a chocolate cake."

In [17]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)


Output length: 303
Compression ratio: tensor(1.0033, device='cuda:0')


In [18]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)


1. Preheat your oven to 180°C (350°F) and grease a 23cm (9in) cake tin.
2. In a large mixing bowl, beat the butter and sugar together until light and fluffy.
3. Add the eggs, one at a time, beating well after each addition.
4. Sift together the flour, cocoa powder, baking powder and salt.
5. Gradually add the dry ingredients to the butter mixture, alternating with the milk, beginning and ending with the dry ingredients.
6. Stir in the vanilla extract.
7. Pour the batter into the prepared cake tin and smooth the top.
8. Bake in the preheated oven for 45-50 minutes, or until a toothpick inserted into the center of the cake comes out clean.
9. Remove the cake from the oven and allow it to cool in the tin for 10 minutes.
10. Then, remove the cake from the tin and transfer it to a wire rack to cool completely.
11. Once cooled, you can decorate the cake with frosting and any other decorations of your choice.
12. Enjoy your delicious homemade chocolate cake!</s>


In [19]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              0.972
Wall time medusa:                            0.190
Wall time Tree:                            184.446
Wall time Posterior:                         0.128
Wall time Update:                            0.200
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.992
Wall time portion Posterior:                 0.001
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.630


prompt7

In [33]:
prompt = "Picture a crowded subway station during the evening commute, with people rushing in all directions"

In [34]:
with torch.inference_mode():
    input_ids = tokenizer([prompt]).input_ids
    output_ids, new_token, idx, wall_time = medusa_forward(
                    torch.as_tensor(input_ids).cuda(),
                    model,
                    tokenizer,
                    medusa_choices,
                    temperature,
                    posterior_threshold,
                    posterior_alpha,
                )
    output_ids = output_ids[0][len(input_ids[0]) :]
    print("Output length:", output_ids.size(-1))
    print("Compression ratio:", new_token / idx)


Output length: 468
Compression ratio: tensor(1.0021, device='cuda:0')


In [35]:
output = tokenizer.decode(
                    output_ids,
                    spaces_between_special_tokens=False,
                )
print(output)

. Now imagine a group of performers dressed in brightly colored costumes, dancing and singing in the middle of the crowd. This is the scene that greeted visitors to the 2018 New York City Dance Parade, an annual event that celebrates the diversity and creativity of dance in the city.
The parade, which took place on May 6, featured a variety of dance styles and performers, from hip-hop to contemporary to salsa. The event was organized by the New York City Dance Alliance, a nonprofit organization that supports and promotes dance in the city.
“The Dance Parade is a celebration of dance in all its forms,” said Katie Swords, the executive director of the New York City Dance Alliance. “It’s a chance for people to come together and enjoy the art form in a fun and festive way.”
The parade began at 12:30 p.m. in the West Village and ended at Tompkins Square Park, where a dance festival took place. Along the way, performers stopped to dance and perform at various locations, including Times Squar

In [36]:
max_length = 50

def format_string(text, value, max_length):
    value_str = "{:.3f}".format(value)
    return f"{text:<{max_length - len(value_str)}}{value_str}"

time_init = np.sum(wall_time['init'] )
time_medusa = np.sum(wall_time['medusa'] )
time_tree = np.sum(wall_time['tree'] )
time_posterior = np.sum(wall_time['posterior'] )
time_update = np.sum(wall_time['update'] )
time_total = time_init + time_medusa + time_tree + time_posterior + time_update

print('='*max_length)
print(format_string("Wall time init: ", time_init, max_length))
print(format_string("Wall time medusa: ", time_medusa, max_length))
print(format_string("Wall time Tree: ", time_tree, max_length))
print(format_string("Wall time Posterior: ", time_posterior, max_length))
print(format_string("Wall time Update: ", time_update, max_length))
print('-'*max_length)
print(format_string("Wall time portion medusa: ", time_medusa / time_total, max_length))
print(format_string("Wall time portion Tree: ", time_tree / time_total, max_length))
print(format_string("Wall time portion Posterior: ", time_posterior / time_total, max_length))
print(format_string("Wall time portion Update: ", time_update / time_total, max_length))
print('-'*max_length)
print(format_string("Tokens/second: ", new_token / time_total, max_length))
print('='*max_length)

Wall time init:                              1.054
Wall time medusa:                            0.184
Wall time Tree:                            287.981
Wall time Posterior:                         0.167
Wall time Update:                            0.266
--------------------------------------------------
Wall time portion medusa:                    0.001
Wall time portion Tree:                      0.994
Wall time portion Posterior:                 0.001
Wall time portion Update:                    0.001
--------------------------------------------------
Tokens/second:                               1.616
