In [94]:
import torch
import numpy as np
from taker import Model

# Instantiate the taker model wrapper with its defaults (the recorded output
# below reports which checkpoint was loaded and how many layers were hooked).
m = Model()

# One saved array per layer index 0..12, stacked along a new leading axis.
# Later cells index this as vecs[layer][class_idx], so each per-layer array
# is presumably (n_classes, d_model) -- TODO confirm against the .npy files.
vecs = np.array(
    [np.load(f"/home/ubuntu/eloise/vectorLayer{i}.npy") for i in range(13)]
)

# Rescale every layer's array by that array's own norm.
# NOTE(review): np.linalg.norm without `axis` is taken over the WHOLE per-layer
# array, so individual class vectors are not unit length if the array holds
# several of them. Confirm this is intended before relying on "normed".
normed_vecs = np.zeros_like(vecs)
for layer_idx, layer_vecs in enumerate(vecs):
    normed_vecs[layer_idx] = layer_vecs / np.linalg.norm(layer_vecs)

- Loaded nickypro/tinyllama-15m
 - Registered 6 Attention Layers


In [41]:
import json

# Genre labels; human_texts[k] holds the parsed JSON document for genres[k].
genres = ["code", "explanatory", "instructional", "narrative", "speech"]

# Each file is consumed later as an iterable of records with a "split_text"
# list whose items carry a "text" field (see the scoring cell).
human_texts = []
for genre in genres:
    # Pin the encoding: without it open() falls back to the locale default,
    # which can mis-decode non-ASCII text on machines that are not UTF-8.
    with open(f"/home/ubuntu/eloise/humanEdited_{genre}.json", encoding="utf-8") as f:
        human_texts.append(json.load(f))

In [97]:
# Move the direction vectors onto the model's device/dtype.
# torch.as_tensor accepts both the original numpy array and an already
# converted tensor, so re-running this cell is safe; torch.tensor(vecs) on an
# existing tensor emits a copy-construct warning (the recorded output below
# echoes that source line).
vecs = torch.as_tensor(vecs, device=m.device, dtype=m.dtype)

# layer_norms_typical[genre][layer] -> list of residual norms, one per sub-text.
layer_norms_typical = {genre: [[] for layer in range(13)] for genre in genres}

# Layers whose direction vectors are used to classify each sub-text.
probe_layers = [10]

# vecs[layer] is multiplied against a single residual row below, so the argmax
# index ranges over vecs.shape[1] candidate classes.
num_classes = vecs.shape[1]

with torch.no_grad():
    all_genre_scores = []

    for split, genre in zip(human_texts, genres):
        # Count of sub-texts assigned to each class index for this genre.
        genre_scores = np.zeros(num_classes, dtype=int)
        for text in split:
            for subtext in text["split_text"]:
                # Average the residual stream over dim 1, leaving one row per
                # layer (presumably dim 1 is the token axis -- TODO confirm).
                res = m.get_residual_stream(subtext["text"]).mean(dim=1)

                for layer in probe_layers:
                    # Class whose vector has the largest dot product with the
                    # averaged residual at this layer.
                    idx = torch.argmax(vecs[layer] @ res[layer:layer+1].T).item()
                    genre_scores[idx] += 1

                for layer in range(13):
                    layer_norms_typical[genre][layer].append(res[layer].norm())

        # Fraction of this genre's sub-texts assigned to each class. The max()
        # guard keeps an empty genre from dividing by zero.
        normed_genre_scores = genre_scores / max(genre_scores.sum(), 1)

        # Format only for display. The numeric fractions are what gets stored,
        # so a later np.argmax compares numbers instead of comparing the
        # percent strings lexicographically (where "9.76%" > "87.64%").
        print(f"{genre:15s}", [f"{x*100:.2f}%" for x in normed_genre_scores])

        all_genre_scores.append(normed_genre_scores)

# Rows: genres (in `genres` order); columns: class indices.
all_genre_scores = np.array(all_genre_scores)


  vecs = torch.tensor(vecs, device=m.device, dtype=m.dtype)


code            ['37.93%', '13.64%', '0.00%', '0.78%', '47.65%']
explanatory     ['9.76%', '87.64%', '0.00%', '0.98%', '1.63%']
instructional   ['96.13%', '2.81%', '0.00%', '0.38%', '0.68%']
narrative       ['17.55%', '61.60%', '1.88%', '17.55%', '1.41%']
speech          ['23.94%', '12.04%', '0.00%', '60.48%', '3.54%']


In [118]:
# Map each class index (column of all_genre_scores) to the genre (row) that
# selects it most often, and build the inverse lookup.
scores = np.asarray(all_genre_scores)
if scores.dtype.kind == "U":
    # The scores may arrive as formatted percent strings such as "37.93%".
    # argmax on strings compares lexicographically ("9.76%" > "87.64%"), so
    # parse them back into numbers before comparing.
    scores = np.array([[float(cell.rstrip("%")) for cell in row] for row in scores])

# For every class column, the row (genre) with the highest score.
ids = np.argmax(scores, axis=0)

id_to_genre = { idx: genres[_id] for idx, _id in enumerate(ids) }
# NOTE(review): if two class indices resolve to the same genre, this inversion
# silently keeps only the last one and some genres get no id.
genre_to_id = {v:k for k, v in id_to_genre.items()}

print(genre_to_id)
print(id_to_genre)

{'instructional': 0, 'explanatory': 1, 'narrative': 2, 'speech': 3, 'code': 4}
{0: 'instructional', 1: 'explanatory', 2: 'narrative', 3: 'speech', 4: 'code'}


In [86]:
# layer_size[layer] = median over genres of the mean residual norm recorded
# for that layer in layer_norms_typical.
layer_size = []

for layer_idx in range(13):
    # Mean recorded norm at this layer, computed separately for each genre.
    per_genre_mean = [
        np.mean([norm.cpu() for norm in layer_norms_typical[name][layer_idx]])
        for name in genres
    ]
    # Median across genres, so a single outlier genre does not set the scale.
    typical_norm = np.median(per_genre_mean)
    layer_size.append(typical_norm)
    print(f"{layer_idx}:", typical_norm)


0: 0.1528
1: 0.3005
2: 0.6875
3: 0.8174
4: 1.267
5: 1.551
6: 1.858
7: 2.74
8: 3.176
9: 4.543
10: 4.55
11: 6.73
12: 71.94


In [124]:
# Prompts to continue, once unmodified and once per steering class.
texts = [
    "Alice and Bob",
    "Once upon a time",
    "Bob needs to",
    "A huge dragon",
    "The sky was"
]

m.model.config.pad_token_id = m.model.config.eos_token_id
print(m.model.config.pad_token_id)

print(m.post_biases)


def _set_post_bias(hook, value=None):
    """Overwrite the "param" entry of a post-bias hook via its state dict.

    hook:  one element of m.post_biases[...]; it must expose state_dict() and
           load_state_dict() with a "param" entry.
    value: replacement for "param"; when None the entry is reset to zeros of
           the same shape and dtype.
    """
    params = hook.state_dict()
    if value is None:
        value = torch.zeros_like(params["param"])
    params["param"] = value
    hook.load_state_dict(params)


# Multiplier applied on top of the typical residual norm when steering.
steer_scale = 1.0

# Every second entry starting at index 1 (1, 3, ..., 11). Entry `layer` here
# pairs with layer_size[1 + 2*layer] below -- presumably the residual position
# after each attention block; TODO confirm. This does not depend on the prompt,
# so it is computed once.
attn_res = normed_vecs[1::2]

text_generations = []
for text in texts:
    generations = []
    text_generations.append(generations)

    # Clean generation baseline: clear every bias left by a previous prompt.
    for layer in range(6):
        for thing in ["mlp_out", "attn_o"]:
            _set_post_bias(m.post_biases[thing][layer])

    generations.append( "baseline       : " + "".join(m.generate(text, 20)) )

    # Modified outputs: one generation per class, steering layers 1-4. Each
    # class overwrites the same four biases, so no reset is needed in between.
    for class_idx in range(5):
        for layer in [1,2,3,4]:
            steer = torch.tensor(attn_res[layer][class_idx]) * steer_scale * layer_size[1 + 2*layer]
            _set_post_bias(m.post_biases["attn_o"][layer], steer)

        generations.append( f"{id_to_genre[class_idx]:15s}: " + "".join(m.generate(text, 20)) )

# NOTE(review): the model is left with the last class's biases installed when
# this cell finishes; later generations will be steered unless they are reset.

# A distinct loop name keeps `texts` bound to the prompt list above.
for prompt_generations in text_generations:
    for generation in prompt_generations:
        print(generation)

Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.


2
{'attn_o': NeuronFunctionList(
  (layers): ModuleList(
    (0-5): 6 x NeuronPostBias()
  )
), 'mlp_out': NeuronFunctionList(
  (layers): ModuleList(
    (0-5): 6 x NeuronPostBias()
  )
)}


Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:2 for o

baseline       : Alice and Bob were best friends. They did everything together, but one day Bob wanted to play with Alice's
instructional  : Alice and Bob are are are are are nice pencil is your paper."."
 yourself is reach up up
explanatory    : Alice and Bob were best friends.  of tough thies. fields of thick snow. snow for many other
narrative      : Alice and Bob went their distant forest one day. he had had the heart his heartel layelel and had
speech         : Alice and Bob!"!" I have a go!" I have a really!"!" I want really!" I can feel
code           : Alice and Bob were two-weed and walked to the park. They were both walking and walking. On the
baseline       : Once upon a time, there was a little boy named Timmy. Timmy loved to play outside, but it was
instructional  : Once upon a time, Tim is is clean clothes is dry. The table is not is clean your hands is not on
explanatory    : Once upon a time, there was a little car. the road that where far part of a part of the ground.
