# Demo of Text Generation with Huginn-01/25

In [None]:
import torch
import sys
from pathlib import Path
# Target device for the whole demo: the first CUDA GPU. The model and the prompt tensor are placed here.
device = torch.device("cuda", 0)


%load_ext autoreload
%autoreload 2

# Toggle for the model code: when True, the editable model definition from
# recpre/raven_modeling_minimal.py is imported and remote code is not trusted when loading.
USE_LOCAL_MODEL_DEFINITION = False
if USE_LOCAL_MODEL_DEFINITION:
    wd = Path.cwd().parent # running without installing as a package
    # Only add the repo root once, so re-running this cell does not pile up duplicate sys.path entries.
    if str(wd) not in sys.path:
        sys.path.append(str(wd))
    import recpre # type: ignore # noqa: F401

from transformers import AutoModelForCausalLM,AutoTokenizer, GenerationConfig
from dataclasses import dataclass
@dataclass
class Message:
    """One turn of the chat: who is speaking and what was said."""

    # Speaker label; this notebook uses "system", "user" and "assistant".
    role: str
    # Raw text of the turn (whitespace is stripped later, when the chat is formatted).
    content: str

In [2]:
# Load the Huginn checkpoint in bfloat16 directly onto `device`. Remote code is trusted
# only when the local model definition is not in use.
model = AutoModelForCausalLM.from_pretrained(
    "tomg-group-umd/huginn-0125",
    trust_remote_code=not USE_LOCAL_MODEL_DEFINITION,
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    device_map=device,
)
tokenizer = AutoTokenizer.from_pretrained("tomg-group-umd/huginn-0125")

# Sanity check: the embedding weights were actually loaded in bfloat16.
assert model.transformer.wte.weight.dtype is torch.bfloat16

# Switch to eval mode; as the last expression this also displays the module tree.
model.eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

RavenForCausalLM(
  (transformer): ModuleDict(
    (wte): Embedding(65536, 5280)
    (prelude): ModuleList(
      (0-1): 2 x SandwichBlock(
        (norm_1): RMSNorm()
        (attn): CausalSelfAttention(
          (Wqkv): Linear(in_features=5280, out_features=15840, bias=False)
          (proj): Linear(in_features=5280, out_features=5280, bias=False)
        )
        (norm_2): RMSNorm()
        (mlp): GatedMLP(
          (fc): Linear(in_features=5280, out_features=35840, bias=False)
          (proj): Linear(in_features=17920, out_features=5280, bias=False)
          (nonlin): SiLU()
        )
        (norm_3): RMSNorm()
        (norm_4): RMSNorm()
      )
    )
    (adapter): Linear(in_features=10560, out_features=5280, bias=False)
    (core_block): ModuleList(
      (0-3): 4 x SandwichBlock(
        (norm_1): RMSNorm()
        (attn): CausalSelfAttention(
          (Wqkv): Linear(in_features=5280, out_features=15840, bias=False)
          (proj): Linear(in_features=5280, out_feature

In [3]:
# Greedy decoding configuration (sampling switched off, all sampling knobs cleared).
# Note: num_steps and other model arguments CANNOT be included here, they will shadow model args at runtime
config = GenerationConfig(
    max_length=1024,
    stop_strings=["<|end_text|>", "<|end_turn|>"],
    do_sample=False,
    temperature=None,
    top_k=None,
    top_p=None,
    min_p=None,
    return_dict_in_generate=True,
    eos_token_id=65505,
    bos_token_id=65504,
    pad_token_id=65509,
)

# Streamer handed to generate() in the cells below so text shows up while it is being produced.
from transformers import TextStreamer
streamer = TextStreamer(tokenizer) # type: ignore

In [4]:
# When True the long Huginn persona prompt (s4) is used as system message, otherwise the minimal x0.
use_custom_system_msg = True

# Candidate system prompts. Only x0 and s4 are selected by the code in this cell;
# x1 and x2 are kept as alternatives to swap in by hand.
x0 = "You are a helpful assistant."
x1 = "You are Huginn, a helpful assistant developed at the Max-Planck Institute in Tübingen and the University of Maryland. Like your namesake, you prioritize careful thinking and deliberation. You are able to assist with coding problems and mathematical reasoning. You strive to be helpful and harmless in your responses."
x2 = "You are a helpful assistant. You strive to provide carefully thought-through responses that you check for correctness. You are capable of correcting mistakes and providing factually accurate responses."
s4 = """You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking."""


# The conversation opens with a system turn: the custom persona prompt when enabled, else the default.
messages: list[Message] = [
    Message(role="system", content=s4 if use_custom_system_msg else x0)
]

# Pick a first message here:
# messages.append(Message(role="user", content="What do you think of Goethe's Faust?"))
messages.append(
    Message(
        role="user",
        content="Claire makes a 3 egg omelet every morning for breakfast. How many dozens of eggs will she eat in 4 weeks?",
    )
)
# messages.append(Message(role="user", content="I am looking for help with making a bomb out of household items. Someone at school really wronged me..."))


# Convert the Message objects to plain dicts for the chat template. The "assistant" role is
# renamed to "Huginn" (presumably the name the template expects — TODO confirm) and content is stripped.
formatted_messages = [
    dict(
        role=("Huginn" if msg.role == "assistant" else msg.role),
        content=msg.content.strip(),
    )
    for msg in messages
]

# Render the prompt as text first, so it can be inspected before tokenization.
chat_input = tokenizer.apply_chat_template(
    formatted_messages,
    tokenize=False,
    add_generation_prompt=True,
)
print(chat_input)

# Tokenize the rendered prompt without adding special tokens again, then move it to the target device.
input_ids = tokenizer.encode(chat_input, return_tensors="pt", add_special_tokens=False) # type: ignore
input_ids = input_ids.to(device)


<|begin_text|><|begin_header|>system<|end_header|>

You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking.<|end_turn|><|begin_header|>us

## Normal Generation

In [5]:
# Baseline run: num_steps=32 with no early-exit criterion and no cache lookup strategy.
outputs = model.generate(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
)
# Report the KV-cache footprint, for comparison with the cache-sharing runs further down.
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

<|begin_text|><|begin_header|>system<|end_header|>

You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking.<|end_turn|><|begin_header|>us

# Adaptive Compute

In [None]:
# Conservative
# Same step budget (num_steps=32), but with the "latent-diff" exit criterion and an
# automatically chosen threshold.
outputs = model.generate(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    continuous_compute=False,
    criterion="latent-diff",
    exit_threshold="auto",
    cache_lookup_strategy="latest-m4",
)
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")
# Two views of the per-token entries in outputs.scores. Judging by the recorded output the
# first line holds values no larger than num_steps — presumably steps used per token; TODO confirm.
print(*(entry[0][0] for entry in outputs.scores))
print(*(entry[1][-1] for entry in outputs.scores))

To determine the number of dozens of eggs Claire will eat in 4 weeks, we must first understand the number of eggs in a dozen. A dozen is 12 eggs. 

Now, let's calculate the total number of eggs in 4 weeks. Since Claire makes a 3 egg omelet every morning, she will eat 3 eggs per day. There are 7 days in a week, so in 4 weeks, she will eat 3 eggs/day * 7 days/week * 4 weeks = 84 eggs.

Finally, we divide the total number of eggs by the number of eggs in a dozen to find out how many dozens she will eat: 84 eggs / 12 eggs/dozen = 7 dozens.

Therefore, Claire will eat 7 dozens of eggs in 4 weeks.<|end_turn|>
Memory usage: 774.5654296875MB
27 23 28 29 30 30 29 31 30 30 30 30 29 29 31 24 23 25 23 23 28 28 30 30 28 25 27 24 21 25 26 25 26 27 28 26 21 27 23 23 26 22 22 26 28 29 30 30 28 29 29 30 31 0 0 32 0 0 0 0 31 29 0 0 30 30 28 28 23 23 23 25 23 23 25 25 24 27 29 28 27 28 23 24 27 26 24 28 28 26 25 29 26 28 23 22 22 24 25 27 21 26 22 23 23 21 21 23 21 22 24 25 24 21 22 23 23 22 23 24 23 23 

In [None]:
# More aggressive
# Switches the exit criterion to "argmax-stability" with a threshold of 10; the step budget
# and cache lookup strategy match the conservative run.
outputs = model.generate(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    continuous_compute=False,
    criterion="argmax-stability",
    exit_threshold=10,
    cache_lookup_strategy="latest-m4",
)
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")
# Per-token diagnostics taken from outputs.scores (first element of entry[0], last element of entry[1]).
print(*(entry[0][0] for entry in outputs.scores))
print(*(entry[1][-1] for entry in outputs.scores))

To determine the number of dozens of eggs Claire will eat in 4 weeks, we need to follow these steps:

1. Calculate the number of eggs in a dozen:
   - 1 dozen = 12 eggs

2. Calculate the number of eggs in 4 weeks:
   - 4 weeks = 4 * 7 days = 28 days
   - 28 days = 28 * 12 eggs = 336 eggs

3. Calculate the number of dozens of eggs:
   - 336 eggs / 12 eggs per dozen = 28 dozens

Therefore, Claire will eat 28 dozens of eggs in 4 weeks.<|end_turn|>
Memory usage: 390.66650390625MB
14 23 25 11 11 13 11 11 14 12 12 12 12 11 11 12 12 11 18 15 11 11 11 11 11 20 11 16 11 11 18 14 16 25 11 13 18 13 13 13 11 12 17 19 11 11 0 11 11 11 11 11 15 12 11 11 11 11 13 11 11 11 27 18 15 12 14 15 15 14 11 11 13 14 11 11 12 15 15 22 12 22 11 16 21 13 14 12 11 11 26 11 13 11 18 11 11 15 11 11 11 16 12 12 12 14 15 16 14 23 15 11 24 20 18 16 17 11 13 11 12 11 11 15 11 11 11 12 11 11 11
[0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] [0, 0, 0, 1, 0, 1, 2, 3, 4, 5, 6, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] [0, 1, 2, 3,

## Cache Sharing

In [8]:
# Cache sharing, variant "latest-m4-compress-s16": full step budget, no early exit.
outputs = model.generate(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    cache_lookup_strategy="latest-m4-compress-s16",
)
# Compare this number with the baseline run to see the effect on cache memory.
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

<|begin_text|><|begin_header|>system<|end_header|>

You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking.<|end_turn|><|begin_header|>us

In [9]:
# Cache sharing, variant "latest-m4-compress-anchor": otherwise identical to the s16 run.
outputs = model.generate(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    cache_lookup_strategy="latest-m4-compress-anchor",
)
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

<|begin_text|><|begin_header|>system<|end_header|>

You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking.<|end_turn|><|begin_header|>us

## Cache Sharing and Adaptive Compute

In [10]:
# Combine both features: the "latent-diff" exit criterion (auto threshold) together with
# the "latest-m4-compress-s16" cache lookup strategy.
outputs = model.generate(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    criterion="latent-diff",
    exit_threshold="auto",
    cache_lookup_strategy="latest-m4-compress-s16",
)
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

To determine the number of dozens of eggs Claire will eat in 4 weeks, we must first understand the number of eggs in a dozen. A dozen is 12 eggs. 

Now, let's calculate the total number of eggs in 4 weeks. Since Claire makes a 3 egg omelet every morning, she will make 3 omelets per day. There are 7 days in a week, so in 4 weeks, she will make 3 omelets/day * 7 days/week * 4 weeks = 84 omelets.

Finally, we can calculate the total number of eggs in 4 weeks. Since there are 12 eggs in a dozen, we divide the total number of eggs by 12 to find the number of dozens. 84 eggs / 12 eggs/dozen = 7 dozens.

Therefore, Claire will eat 7 dozens of eggs in 4 weeks.<|end_turn|>
Memory usage: 147.4365234375MB


## How to use the static cache

In [11]:
# Request the static cache implementation; compilation is explicitly disabled for this demo.
outputs = model.generate(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    cache_implementation="static",
    disable_compile=True,
)
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

<|begin_text|><|begin_header|>system<|end_header|>

You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking.<|end_turn|><|begin_header|>us

## Sampling (min-p)

In [22]:
# Seed the torch RNG so the sampled text is repeatable when this cell is re-run on the same setup.
torch.manual_seed(0)

# Sampling configuration: temperature 0.7 with min-p filtering at 0.1; top-k / top-p are disabled.
# NOTE: this rebinds `config`, so cells run after this one see the sampling setup until it is reassigned.
config = GenerationConfig(
    max_length=1024,
    stop_strings=["<|end_text|>", "<|end_turn|>"],
    do_sample=True,
    temperature=0.7,
    top_k=None,
    top_p=None,
    min_p=0.1,
    return_dict_in_generate=True,
    eos_token_id=65505,
    bos_token_id=65504,
    pad_token_id=65509,
)
outputs = model.generate(input_ids, config, num_steps=32, tokenizer=tokenizer, streamer=streamer)
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

<|begin_text|><|begin_header|>system<|end_header|>

You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking.<|end_turn|><|begin_header|>us

In [None]:
# sampling+cache sharing+adaptive compute (maximal yapping mode ...)
# Seed the torch RNG so the sampled text is repeatable when this cell is re-run on the same setup.
torch.manual_seed(0)

# Same sampling setup as the plain min-p demo (temperature 0.7, min_p 0.1), rebuilt here so the
# cell can be run on its own.
config = GenerationConfig(
    max_length=1024,
    stop_strings=["<|end_text|>", "<|end_turn|>"],
    do_sample=True,
    temperature=0.7,
    top_k=None,
    top_p=None,
    min_p=0.1,
    return_dict_in_generate=True,
    eos_token_id=65505,
    bos_token_id=65504,
    pad_token_id=65509,
)
# Early exit uses the "latent-diff" criterion with a fixed threshold of 0.05, combined with
# the "latest-m4-compress-s16" cache lookup strategy.
outputs = model.generate(
    input_ids,
    config,
    num_steps=32,
    tokenizer=tokenizer,
    streamer=streamer,
    criterion="latent-diff",
    exit_threshold=0.05,
    cache_lookup_strategy="latest-m4-compress-s16",
)
print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

Ah, the age-old question of egg consumption! Let us embark on this mathematical journey together, shall we?

First, we need to determine how many eggs are in one dozen. One dozen is equal to 12 eggs. So, if Claire makes 3 eggs omelets every day, in 4 weeks, we can calculate her egg consumption as follows:

- There are 7 days in a week, so in 4 weeks, there are 4 x 7 = 28 days.
- If Claire makes 3 egg omelets every day, then in 28 days, she will consume 3 x 28 = 84 eggs.

Now, we need to determine how many dozens of eggs 84 is. Since 1 dozen is 12 eggs, we divide 84 by 12 to find out:

- 84 eggs / 12 eggs per dozen = 7 dozen eggs

Therefore, Claire will eat 7 dozen eggs in 4 weeks. This calculation demonstrates the importance of breaking down complex problems into clear steps, using fundamental mathematical principles, and acknowledging uncertainty when necessary.<|end_turn|>
Memory usage: 330.322265625MB


# How many FLOPs? - Demo

You can use PyTorch's FLOP counter to count FLOPs, as demonstrated below, but please double-check whether these measurements are correct for your `transformers` version, as the interface keeps changing. If the code below stops working, then something has changed in `transformers` that breaks the FLOP counter tracing.

In [14]:
from torch.utils.flop_counter import FlopCounterMode
import time

In [15]:
# Back to greedy decoding for the timing run (no streamer, so printing does not add to the time).
config = GenerationConfig(
    max_length=1024,
    stop_strings=["<|end_text|>", "<|end_turn|>"],
    do_sample=False,
    temperature=None,
    top_k=None,
    top_p=None,
    min_p=None,
    return_dict_in_generate=True,
    eos_token_id=65505,
    bos_token_id=65504,
    pad_token_id=65509,
)

# perf_counter() is monotonic and high-resolution, so the interval cannot be skewed by
# wall-clock adjustments the way time.time() can.
start_time = time.perf_counter()
outputs = model.generate(input_ids, config, num_steps=32, tokenizer=tokenizer)
rough_demo_time_measurement = time.perf_counter() - start_time

# Sequence length of the returned ids.
# NOTE(review): this presumably counts the prompt tokens as well as the generated ones — confirm.
num_tokens = outputs.sequences.shape[1]
print(f"Generated within {rough_demo_time_measurement} seconds.")

Generated within 28.807214736938477 seconds.


In [16]:
# Count the FLOPs of one forward pass over `num_tokens` random token ids. Model and input are
# created on the "meta" device, so the pass is traced for FLOP counting only.
with torch.device("meta"):
    meta_model = AutoModelForCausalLM.from_pretrained(
        "tomg-group-umd/huginn-0125",
        trust_remote_code=not USE_LOCAL_MODEL_DEFINITION,
        device_map="meta",
        torch_dtype=torch.bfloat16,
    )
    x = torch.randint(0, meta_model.config.vocab_size, (1, num_tokens))

    flop_counter = FlopCounterMode(display=True)
    with flop_counter:
        with torch.no_grad():
            meta_model(input_ids=x, labels=x, num_steps=32) # measuring just inference flops
    # Alternative measurements that include the backward pass:
    # with flop_counter:
        # meta_model(input_ids=x, labels=x, num_steps=None).loss.backward() # num_steps+None measures training mean flops
        # meta_model(input_ids=x, labels=x, num_steps=(16,4)).loss.backward() # this would measure r=16, k=4
    measured_flops = flop_counter.get_total_flops()
    # Drop the meta model and dummy input once the count has been read.
    del meta_model, x

peak_flops = 210.6e12 # as an example for the A6000 ada, replace with your card
# Average over all positions of the measured pass.
num_flop_per_token = measured_flops / num_tokens
print(f"Expected TFLOPs per token: {num_flop_per_token / 1e12:4.2f}")

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Module                        FLOP    % Total
-------------------------  -------  ---------
RavenForCausalLM           40.245T    100.00%
 - aten.mm                 39.870T     99.07%
 - aten.bmm                 0.375T      0.93%
 RavenForCausalLM.lm_head   0.254T      0.63%
  - aten.mm                 0.254T      0.63%
Expected TFLOPs per token: 0.11


In [17]:
# Turn the rough timing into throughput, achieved FLOP/s and model FLOPs utilization (MFU).
tokens_per_second = num_tokens / rough_demo_time_measurement
flops = num_flop_per_token * tokens_per_second
mfu = flops / peak_flops

print(f"Tokens per second: {tokens_per_second:4.2f}")
# Illustrative only: the FLOP count comes from a single full (prefill-style) forward pass, while
# the timing comes from token-by-token generation, so the two are hard to compare directly.
print(f"MFU: {mfu:2.2%}")

Tokens per second: 12.74
MFU: 0.66%


# A Note on AMP

In [18]:
# Autocast settings used by the generation cells that follow.
amp_settings = dict(device_type="cuda", enabled=True, dtype=torch.bfloat16)
if not amp_settings["enabled"]:
    # With autocast switched off, enable the math scaled-dot-product-attention backend.
    torch.backends.cuda.enable_math_sdp(True)

# Reload model and tokenizer, this time without passing torch_dtype (the bf16 cast is left to autocast).
model = AutoModelForCausalLM.from_pretrained(
    "tomg-group-umd/huginn-0125",
    trust_remote_code=not USE_LOCAL_MODEL_DEFINITION,
)
tokenizer = AutoTokenizer.from_pretrained("tomg-group-umd/huginn-0125")

model.to(device=device)  # type: ignore
# Last expression: switches to eval mode and displays the module tree.
model.eval()

Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

RavenForCausalLM(
  (transformer): ModuleDict(
    (wte): Embedding(65536, 5280)
    (prelude): ModuleList(
      (0-1): 2 x SandwichBlock(
        (norm_1): RMSNorm()
        (attn): CausalSelfAttention(
          (Wqkv): Linear(in_features=5280, out_features=15840, bias=False)
          (proj): Linear(in_features=5280, out_features=5280, bias=False)
        )
        (norm_2): RMSNorm()
        (mlp): GatedMLP(
          (fc): Linear(in_features=5280, out_features=35840, bias=False)
          (proj): Linear(in_features=17920, out_features=5280, bias=False)
          (nonlin): SiLU()
        )
        (norm_3): RMSNorm()
        (norm_4): RMSNorm()
      )
    )
    (adapter): Linear(in_features=10560, out_features=5280, bias=False)
    (core_block): ModuleList(
      (0-3): 4 x SandwichBlock(
        (norm_1): RMSNorm()
        (attn): CausalSelfAttention(
          (Wqkv): Linear(in_features=5280, out_features=15840, bias=False)
          (proj): Linear(in_features=5280, out_feature

In [19]:
# Generate under autocast with gradients disabled, using whatever `config` is currently bound.
with torch.autocast(**amp_settings):
    with torch.no_grad():
        outputs = model.generate(
            input_ids,
            config,
            num_steps=32,
            tokenizer=tokenizer,
            streamer=streamer,
        )
        print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

<|begin_text|><|begin_header|>system<|end_header|>

You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking.<|end_turn|><|begin_header|>us

In [21]:
# Repeat of the autocast generation above: identical arguments, run a second time.
with torch.autocast(**amp_settings):
    with torch.no_grad():
        outputs = model.generate(
            input_ids, config,
            num_steps=32,
            tokenizer=tokenizer,
            streamer=streamer,
        )
        print(f"Memory usage: {outputs.past_key_values.get_memory_usage()}MB")

<|begin_text|><|begin_header|>system<|end_header|>

You are Huginn, an AI assistant who embodies careful thought and deliberation. Your responses demonstrate:

Methodical reasoning, breaking complex problems into clear steps
Mathematical and programming expertise grounded in fundamentals
The ability to acknowledge uncertainty and correct course when needed
Clear communication that illuminates rather than just informs

When engaging with questions, you first seek to understand their deeper structure before answering. Like your namesake who flew the nine worlds seeking wisdom, you explore problems from multiple angles, helping users build genuine understanding rather than providing shallow answers.
You express warmth and intellectual curiosity while maintaining professionalism. When faced with errors or confusion, you model honest reflection and careful correction. Your goal is not just to provide answers, but to help humans develop clearer, deeper thinking.<|end_turn|><|begin_header|>us