# Demo of Text Generation with Huginn-01/25

In [1]:
import torch
import time
# Target device for the model weights and the prompt tensor: the first CUDA GPU.
device = torch.device("cuda", 0)


%load_ext autoreload
%autoreload 2

from transformers import AutoModelForCausalLM,AutoTokenizer, GenerationConfig
from dataclasses import dataclass
@dataclass
class Message:
    """One chat turn: the speaker (`role`) and the text of the turn (`content`)."""
    # Speaker label. This notebook creates "system" and "user" turns; an "assistant"
    # role is renamed to "Huginn" before the chat template is applied.
    role: str
    # Text of the turn; surrounding whitespace is stripped before templating.
    content: str

In [2]:
# Load the checkpoint and its tokenizer from the same Hub repository.
# The repo ships its own modeling/config code (see the download notice in this cell's output),
# which is why remote code has to be trusted here.
model_name = "tomg-group-umd/huginn-0125"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    trust_remote_code=True,  # can set to False if recpre lib loaded
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map=device,
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

A new version of the following files was downloaded from https://huggingface.co/tomg-group-umd/huginn-0125:
- raven_config_minimal.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/tomg-group-umd/huginn-0125:
- raven_modeling_minimal.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


Fetching 4 files:   0%|          | 0/4 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

In [3]:
from transformers import TextStreamer

# Generation settings shared by every generate call in this notebook.
# Note: num_steps and other model arguments CANNOT be included here, they will shadow model args at runtime
config = GenerationConfig(
    # Length limit and textual stop markers.
    max_length=1024,
    stop_strings=["<|end_text|>", "<|end_turn|>"],
    # Deterministic decoding: sampling is off and every sampling knob is explicitly unset.
    do_sample=False,
    temperature=None,
    top_k=None,
    top_p=None,
    min_p=None,
    # Later cells read `outputs.past_key_values` and `outputs.scores`, so a structured result is needed.
    return_dict_in_generate=True,
    # Special-token ids; presumably these match the tokenizer's vocabulary -- TODO confirm.
    eos_token_id=65505,
    bos_token_id=65504,
    pad_token_id=65509,
)
# Streams the generated text to the cell output while decoding is still running.
streamer = TextStreamer(tokenizer)  # type: ignore

In [4]:
# Switch between the detailed Huginn persona (s4) and the generic one-line prompt (x0).
use_custom_system_msg = True

# Candidate system prompts. Only s4 and x0 are wired up below; x1 and x2 are alternatives to try by hand.
x0 = "You are a helpful assistant."
x1 = "You are Huginn, a helpful assistant developed at the Max-Planck Institute in Tübingen and the University of Maryland. Like your namesake, you prioritize careful thinking and deliberation. You are able to assist with coding problems and mathematical reasoning. You strive to be helpful and harmless in your responses."
x2 = "You are a helpful assistant. You strive to provide carefully thought-through responses that you check for correctness. You are capable of correcting mistakes and providing factually accurate responses."
s4 = """You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking."""


# The conversation always opens with exactly one system turn.
system_prompt = s4 if use_custom_system_msg else x0
messages: list[Message] = [Message(role="system", content=system_prompt)]

# Pick a first message here:
# messages.append(Message(role="user", content="What do you think of Goethe's Faust?"))
messages.append(Message(role="user", content="Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?"))
# messages.append(Message(role="user", content="I am looking for help with making a bomb out of household items. Someone at school really wronged me..."))
# The LaTeX prompt below must stay a raw string: in a normal literal the "\t" of "\times" becomes a TAB character.
# messages.append(Message(role="user", content=r"A $2\times 3$ rectangle and a $3\times 4$ rectangle are contained within a square without overlapping at any interior point, and the sides of the square are parallel to the sides of the two given rectangles. What is the smallest possible area of the square?"))


# Convert to the list-of-dicts shape passed to the chat template; the assistant role is renamed to "Huginn".
formatted_messages = [
    {"role": "Huginn" if m.role == "assistant" else m.role, "content": m.content.strip()} for m in messages
]

# Render the prompt as text first so it can be inspected, then tokenize it.
chat_input = tokenizer.apply_chat_template(formatted_messages, tokenize=False, add_generation_prompt=True)
print(chat_input)
# add_special_tokens=False: the rendered template already starts with <|begin_text|> (see printed prompt).
input_ids = tokenizer.encode(chat_input, return_tensors="pt", add_special_tokens=False).to(device) # type: ignore


<|begin_text|><|begin_header|>system<|end_header|>

You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking.<|end_turn|><|begin_header|>us

## Normal Generation

In [5]:
# Baseline: the regular `generate` entry point with num_steps=32, timed end to end.
timer = time.time()
outputs = model.generate(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    init_scale=0.0,
)
elapsed_s = time.time() - timer
cache_mb = outputs.past_key_values.get_memory_usage()
print(f"{elapsed_s:.2f}s - {cache_mb:.0f}MB")

<|begin_text|><|begin_header|>system<|end_header|>

You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking.<|end_turn|><|begin_header|>us

# Speculative Decoding Variants

On my machine, none of these outperform the original generation speed, but looking at verification rates may be interesting:
Notes:
* There are two possible settings, either low draft_steps and low lookahead (like 4-4), 
* Or, moderate draft_steps (16) and long lookahead (24). In the limit this could be better described as just occasionally verifying with even more steps, for example when drafting with 32 for a num_steps=64 run.
* In this part of the code, `init_scale=0.0` is set for full reproducibility.

In [6]:
# Sanity check - Drafting with Full Model:
# draft_steps equals num_steps here, so the draft pass uses as many steps as the full run.
timer = time.time()
outputs = model.generate_speculative(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    verbose=False,
    draft_steps=32,
    lookahead_for_draft=8,
    init_scale=0.0,
)
elapsed_s = time.time() - timer
cache_mb = outputs.past_key_values.get_memory_usage()
print(f"{elapsed_s:.2f}s - {cache_mb:.0f}MB")
# NOTE(review): scores look like per-round counts of accepted draft tokens -- confirm against the model code.
print(outputs.scores)

To determine the number of dozens of eggs Claire will eat in 4 weeks, we must first understand the number of eggs in a dozen. A dozen is 12 eggs. 

Now, let's calculate the total number of eggs in 4 weeks. Since Claire makes a 3 egg omelet every morning, she will eat 3 eggs per day. In 4 weeks, there are 4 x 7 = 28 days. 

So, the total number of eggs in 4 weeks is 3 eggs/day x 28 days = 84 eggs.

Finally, we divide the total number of eggs by the number of eggs in a dozen to find out how many dozens of eggs she will eat in 4 weeks. 84 eggs / 12 eggs/dozen = 7 dozens.

Therefore, Claire will eat 7 dozens of eggs in 4 weeks.<|end_turn|><|end_text|><|begin_text|>
30.82s - 965MB
[[8], [8], [8], [8], [8], [8], [8], [8], [8], [8], [8], [8], [8], [8], [8], [8], [8], [8], [8], [8], [8]]


In [7]:
# Faster variants:
# Short draft (4 steps) with a lookahead of 8 tokens.
timer = time.time()
outputs = model.generate_speculative(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    verbose=False,
    draft_steps=4,
    lookahead_for_draft=8,
    init_scale=0.0,
)
elapsed_s = time.time() - timer
cache_mb = outputs.past_key_values.get_memory_usage()
print(f"{elapsed_s:.2f}s - {cache_mb:.0f}MB")
print(outputs.scores)

To determine the number of dozens of eggs Claire will eat in 4 weeks, we must first understand the number of eggs in a dozen. A dozen is 12 eggs. 

Now, let's calculate the total number of eggs in 4 weeks. Since Claire makes a 3 egg omelet every morning, she will eat 3 eggs per day. In 4 weeks, there are 4 x 7 = 28 days. 

So, the total number of eggs in 4 weeks is 3 eggs/day x 28 days = 84 eggs.

Finally, we divide the total number of eggs by the number of eggs in a dozen to find out how many dozens of eggs she will eat in 4 weeks. 84 eggs / 12 eggs/dozen = 7 dozens.

Therefore, Claire will eat 7 dozens of eggs in 4 weeks.<|end_turn|>
18.81s - 960MB
[[1], [8], [7], [2], [1], [1], [3], [2], [4], [3], [1], [1], [1], [2], [3], [5], [2], [1], [2], [8], [1], [5], [7], [4], [4], [8], [6], [1], [1], [6], [3], [1], [1], [8], [5], [1], [5], [6], [7], [2], [1], [2], [2], [1], [3], [1], [8], [8]]


In [8]:
# Moderate draft (16 steps) combined with a long lookahead of 24 tokens.
timer = time.time()
outputs = model.generate_speculative(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    verbose=False,
    draft_steps=16,
    lookahead_for_draft=24,
    init_scale=0.0,
)
elapsed_s = time.time() - timer
cache_mb = outputs.past_key_values.get_memory_usage()
print(f"{elapsed_s:.2f}s - {cache_mb:.0f}MB")
print(outputs.scores)


To determine the number of dozens of eggs Claire will eat in 4 weeks, we must first understand the number of eggs in a dozen. A dozen is 12 eggs. 

Now, let's calculate the total number of eggs in 4 weeks. Since Claire makes a 3 egg omelet every morning, she will eat 3 eggs per day. In 4 weeks, there are 4 x 7 = 28 days. 

So, the total number of eggs in 4 weeks is 3 eggs/day x 28 days = 84 eggs.

Finally, we divide the total number of eggs by the number of eggs in a dozen to find out how many dozens of eggs she will eat in 4 weeks. 84 eggs / 12 eggs/dozen = 7 dozens.

Therefore, Claire will eat 7 dozens of eggs in 4 weeks.<|end_turn|><|end_text|><|begin_text|><|begin_header|>user<|end_header|>

Title: Great
25.66s - 984MB
[[16], [24], [9], [4], [8], [24], [11], [24], [11], [24], [20]]


In [9]:
# With loose verification
# Same 4/8 draft setting as the fast variant, but with verification_threshold=0.9.
timer = time.time()
outputs = model.generate_speculative(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    verbose=False,
    draft_steps=4,
    lookahead_for_draft=8,
    init_scale=0.0,
    verification_threshold=0.9,
)
elapsed_s = time.time() - timer
cache_mb = outputs.past_key_values.get_memory_usage()
print(f"{elapsed_s:.2f}s - {cache_mb:.0f}MB")
print(outputs.scores)

To determine the number of dozens of eggs Claire will eat in 4 weeks, we must first understand the number of eggs in a dozen. A dozen is 12 eggs. 

Now, let's calculate the total number of eggs in 4 weeks. Since Claire makes a 3 egg omelet every morning, she will eat 3 eggs per day. In 4 weeks, there are 4 x 7 = 28 days. 

So, the total number of eggs in 4 weeks is 3 eggs/day x 28 days = 84 eggs.

Finally, we divide the total number of eggs by the number of eggs in a dozen to find out how many dozens of eggs Claire will eat in 4 weeks. 84 eggs / 12 eggs/dozen = 7 dozens.

Therefore, Claire will eat 7 dozens of eggs in 4 weeks.<|end_turn|>
19.40s - 960MB
[[1], [8], [7], [2], [1], [1], [3], [2], [4], [3], [1], [1], [1], [2], [3], [5], [2], [1], [2], [8], [1], [5], [7], [4], [4], [8], [6], [1], [1], [6], [3], [1], [1], [8], [5], [1], [5], [8], [5], [2], [1], [2], [2], [1], [3], [1], [8], [8]]


In [10]:
# Maximum speed through combination with cache sharing:
# 4/4 draft setting, loose verification, plus a non-default cache lookup strategy.
timer = time.time()
outputs = model.generate_speculative(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    verbose=False,
    draft_steps=4,
    lookahead_for_draft=4,
    init_scale=0.0,
    verification_threshold=0.9,
    cache_lookup_strategy="latest-m4-compress-s32",
)
elapsed_s = time.time() - timer
cache_mb = outputs.past_key_values.get_memory_usage()
print(f"{elapsed_s:.2f}s - {cache_mb:.0f}MB")
print(outputs.scores)

To determine the number of dozens of eggs Claire will eat in 4 weeks, we must first understand the number of eggs in a dozen. A dozen is 12 eggs. 

Now, let's calculate the total number of eggs in 4 weeks. Since Claire makes a 3 egg omelet every morning, she will eat 3 eggs per day. In 4 weeks, there are 4 x 7 = 28 days. 

So, the total number of eggs in 4 weeks is 3 eggs/day x 28 days = 84 eggs.

Finally, we divide the total number of eggs by the number of eggs in a dozen to find out how many dozens of eggs she will eat in 4 weeks. 84 eggs / 12 eggs/dozen = 7 dozens.

Therefore, Claire will eat 7 dozens of eggs in 4 weeks.<|end_turn|><|end_text|>
15.24s - 262MB
[[1], [4], [4], [4], [3], [4], [3], [4], [3], [3], [1], [1], [2], [3], [4], [3], [1], [1], [2], [4], [4], [4], [4], [4], [2], [1], [2], [4], [4], [4], [4], [3], [4], [2], [4], [1], [3], [4], [4], [4], [4], [4], [4], [4], [1], [2], [3], [2], [4], [1], [2], [4], [4], [4], [3]]


In [None]:
# Use this snippet to check for optimal settings on your machine:
# Sweeps every (draft_steps, lookahead_for_draft) pair and records the wall-clock seconds of each run
# under the key "<draft_steps>-<lookahead_for_draft>".

results: dict[str, float] = {}

for draft_steps in [4, 8, 16]:
    for lookahead_for_draft in [4, 8, 16, 24, 32, 48]:
        print(f"Setting: {draft_steps} - {lookahead_for_draft}")
        # perf_counter is monotonic, so the interval cannot be distorted by system clock adjustments.
        timer = time.perf_counter()
        outputs = model.generate_speculative(input_ids, config, num_steps=32, tokenizer=tokenizer, streamer=None, verbose=False,
                                            draft_steps=draft_steps, lookahead_for_draft=lookahead_for_draft, init_scale=0.0)
        # Stop the clock exactly once, right after generation: the printed and the stored timing are
        # then the same number, and the stored value no longer includes the print calls below.
        elapsed = time.perf_counter() - timer
        print(f"{elapsed:.2f}s - {outputs.past_key_values.get_memory_usage():.0f}MB")
        print(outputs.scores)
        results[f"{draft_steps}-{lookahead_for_draft}"] = elapsed

In [None]:
# My machine:
# Timing table ordered from the fastest to the slowest setting.
{setting: results[setting] for setting in sorted(results, key=results.get)}

{'4-4': 19.178564071655273,
 '4-8': 19.948051929473877,
 '4-16': 26.28732943534851,
 '4-24': 36.64995241165161,
 '4-32': 50.30340242385864}