In [None]:
import numpy as np
import os
from dotenv import load_dotenv, find_dotenv

In [108]:
# Load variables from the local .env file, then read the Goodfire API key.
load_dotenv(dotenv_path=".env")
api_key = os.getenv('GOODFIRE_API_KEY')
if not api_key:
    # Fail here with a clear message instead of passing None to the client
    # and getting an opaque error on the first request.
    raise RuntimeError(
        "GOODFIRE_API_KEY is not set; add it to .env or export it in the environment."
    )

In [109]:
import goodfire

# Identifier of the base model wrapped by the variant below.
MODEL_NAME = "meta-llama/Meta-Llama-3.1-8B-Instruct"

client = goodfire.Client(api_key=api_key)
variant = goodfire.Variant(MODEL_NAME)

# Disabled experiment, kept for reference:
# edits = client.features.AutoSteer(
#     # specification="Talk with a lot of puns",
#     model = variant
# )

# The printed variant lists its base model, edits and scopes.
print(variant)



Variant(
   base_model=meta-llama/Meta-Llama-3.1-8B-Instruct,
   edits={
   }
   scopes={
   }
)


# FORWARD PASSES

In [20]:
# Send a single user prompt and request a streamed reply,
# limited to 80 completion tokens.
prompt = [{"role": "user", "content": "Tell me about godzilla"}]
output = client.chat.completions.create(
    prompt,
    model=variant,
    stream=True,
    max_completion_tokens=80,
)

In [53]:
# Wrap the generated text in a single user turn and keep it in `msgs`, the
# name the `client.features.inspect` cell reads; `msgs` is not defined anywhere
# else in the visible notebook.
# NOTE(review): presumably this is the conversation `msgs` was meant to hold
# (the inspector's token dump shows a user turn starting with the generated
# text) -- confirm.
# `message_text` is produced by the `extract_message(chunks)` cell further
# down, so that cell has to be run before this one.
msgs = [{"role": "user", "content": message_text}]
act = client.features.activations(msgs, model=variant)

In [63]:
# Inspect feature activations for the conversation in `msgs`, aggregated with
# "sum". `msgs` must already exist in the kernel when this cell runs.
inspector = client.features.inspect(
    msgs,
    model=variant,
    aggregate_by="sum",
)

---

In [21]:
# Materialise the streamed response into a list so it can be traversed more
# than once by the cells that follow.
# NOTE(review): `output` was requested with stream=True; presumably it can be
# consumed only once, so re-run the request cell before re-running this one.
chunks = list(output)

In [62]:
# Rebuild the reply (role and full text) from the streamed chunks.
# `extract_message` is defined in a cell further down the notebook; that cell
# has to be run before this one.
role, message_text = extract_message(chunks)
message_text

'Godzilla! The King of the Monsters. He\'s a giant, fire-breathing, city-stomping reptile from Japanese legend. Created by Tomoyuki Tanaka in 1954, Godzilla was meant to be a metaphor for the atomic age and the fears of nuclear destruction.\n\nGodzilla\'s real name is Gojira, and he\'s often referred to as "King Kong\'s nemesis." He\'s typically around 100-150 meters tall, with incredible strength and speed. He\'s also highly resistant to radiation and can even withstand nuclear explosions.\n\nOver 65 years, there have'

In [25]:
# Collect the content of every chunk's first choice (missing content -> ""),
# join the pieces into one string and echo it without a trailing newline.
full_text = [ch.choices[0].delta.content or "" for ch in chunks]
text = "".join(full_text)
print(text, end="")

Godzilla! The King of the Monsters. He's a giant, fire-breathing, city-stomping reptile from Japanese legend. Created by Tomoyuki Tanaka in 1954, Godzilla was meant to be a metaphor for the atomic age and the fears of nuclear destruction.

Godzilla's real name is Gojira, and he's often referred to as "King Kong's nemesis." He's typically around 100-150 meters tall, with incredible strength and speed. He's also highly resistant to radiation and can even withstand nuclear explosions.

Over 65 years, there have

In [54]:
# Look up the top 5 features matching the query "Godzilla".
features = client.features.search("Godzilla", model=variant, top_k=5)

# SAE index of each returned feature, used as column positions in `act`.
cols = [feature.index_in_sae for feature in features]

# Keep every row of `act` but only the selected feature columns.
# NOTE(review): rows are presumably one per token -- confirm.
feat_act = act[:, cols]
feat_act

array([[0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.        , 0.        , 0.        ],
       [0.        , 0.        , 0.

In [48]:
def extract_message(chunks, choice_idx=0):
    """Reassemble a streamed chat completion into ``(role, text)``.

    Args:
        chunks: Iterable of stream chunks. Each chunk is expected to expose
            ``choices[choice_idx].delta`` carrying optional ``role`` and
            ``content`` attributes.
        choice_idx: Index of the choice to read from every chunk.

    Returns:
        A ``(role, text)`` tuple. ``role`` is the last non-empty role found in
        a delta, or ``"assistant"`` when no delta carries one. ``text`` is the
        concatenation of all non-empty ``content`` pieces in stream order.

    Raises:
        IndexError: If a chunk has choices but none at ``choice_idx``.
    """
    role = "assistant"
    parts = []
    for ch in chunks:
        # Some stream chunks carry no choices at all; skip them instead of
        # failing on the index lookup.
        choices = getattr(ch, "choices", None)
        if not choices:
            continue
        delta = getattr(choices[choice_idx], "delta", None)
        if delta is None:
            # Nothing to merge from a chunk without a delta payload.
            continue
        if getattr(delta, "role", None):
            role = delta.role
        if getattr(delta, "content", None):
            parts.append(delta.content)
    return role, "".join(parts)

In [65]:
# Keep a handle on the token sequence exposed by the inspector.
tokens = inspector.tokens

In [75]:
# Display the full token list (includes the chat-template and header tokens).
tokens

[Token("<|begin_of_text|>"),
 Token("<|start_header_id|>"),
 Token("system"),
 Token("<|end_header_id|>"),
 Token("
 
 "),
 Token("Cut"),
 Token("ting"),
 Token(" Knowledge"),
 Token(" Date"),
 Token(":"),
 Token(" December"),
 Token(" "),
 Token("202"),
 Token("3"),
 Token("
 "),
 Token("Today"),
 Token(" Date"),
 Token(":"),
 Token(" "),
 Token("26"),
 Token(" Jul"),
 Token(" "),
 Token("202"),
 Token("4"),
 Token("
 
 "),
 Token("<|eot_id|>"),
 Token("<|start_header_id|>"),
 Token("user"),
 Token("<|end_header_id|>"),
 Token("
 
 "),
 Token("God"),
 Token("zilla"),
 Token("!"),
 Token(" The"),
 Token(" King"),
 Token(" of"),
 Token(" the"),
 Token(" Monsters"),
 Token("."),
 Token(" He"),
 Token("'s"),
 Token(" a"),
 Token(" giant"),
 Token(","),
 Token(" fire"),
 Token("-bre"),
 Token("athing"),
 Token(","),
 Token(" city"),
 Token("-st"),
 Token("om"),
 Token("ping"),
 Token(" rept"),
 Token("ile"),
 Token(" from"),
 Token(" Japanese"),
 Token(" legend"),
 Token("."),
 Token(" Cre

In [69]:
# Show the top 5 features reported by the inspector.
# NOTE(review): presumably ranked by summed activation, since the inspector
# was created with aggregate_by="sum" -- confirm.
inspector.top(5)

FeatureActivations(
   0: (Feature("Descriptions of physically imposing or giant characters"), 22.927734375)
   1: (Feature("Descriptions of antagonistic forces terrorizing others in narrative contexts"), 22.474609375)
   2: (Feature("Formal character descriptions listing multiple traits and attributes"), 18.1015625)
   3: (Feature("Movie titles and their metadata in catalogs and reviews"), 17.107421875)
   4: (Feature("Legal and procedural language describing official requirements"), 10.482421875)
)

In [86]:
# Search for the top 5 features, using the first 99 characters of the
# generated reply as the query.
neighbors = client.features.search(message_text[:99], model=variant, top_k=5)

# SAE indices of the returned features, in search-result order.
idx = [neighbor.index_in_sae for neighbor in neighbors]

# Resolve those indices back to their Feature objects.
look = client.features.lookup(indices=idx, model=variant)

    

In [88]:
# Print one search result per line, in the order the search returned them.
for feature in neighbors:
    print(feature)

Feature("Giant robot (mecha) anime battle descriptions and explanations")
Feature("Descriptions of physically imposing or giant characters")
Feature("Fictional monsters and creatures across languages")
Feature("Entities that command both fear and respect")
Feature("Grandiose claims of power or authority, especially in roleplay contexts")


In [90]:
# Display the lookup result for the indices collected in `idx`.
look

{17962: Feature("Entities that command both fear and respect"),
 39030: Feature("Descriptions of physically imposing or giant characters"),
 30942: Feature("Fictional monsters and creatures across languages"),
 21417: Feature("Grandiose claims of power or authority, especially in roleplay contexts"),
 12636: Feature("Giant robot (mecha) anime battle descriptions and explanations")}

In [97]:
# Select from `act` only the columns of the features found by the text search
# (`idx`), keeping every row.
# NOTE(review): rows are presumably one per token -- confirm.
shai = act[:,idx]

In [102]:
# Total activation of each selected feature, summed down the rows.
shai.sum(axis=0)

array([ 1.88085938, 22.92773438,  6.86328125,  0.89453125,  0.        ])