In [1]:
import numpy as np
import torch
from transformers import AutoModelForCausalLM

from gptomics import gptneo

# Display NumPy arrays with 4 digits of floating-point precision.
# This only affects how arrays are printed; the stored values are not rounded.
np.set_printoptions(precision=4)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the pretrained causal language model from the "EleutherAI/gpt-neo-125M" checkpoint.
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-125M"
)

In [4]:
# Toy example: `a` is a 3x3 lower-triangular matrix of ones, `b` is the vector (1, 2, 3).
a = np.tril(np.ones((3, 3)))
b = np.array((1, 2, 3))

In [5]:
# Multiplying by a lower-triangular matrix of ones yields the running sums of `b`.
a @ b

array([1., 3., 6.])

In [6]:
# Display the matrix itself for reference.
a

array([[1., 0., 0.],
       [1., 1., 0.],
       [1., 1., 1.]])

In [9]:
# Attention module nested under `model.transformer.h[0]` (the first entry of `h`).
att0 = model.transformer.h[0].attn.attention

In [22]:
# All-ones probe tensor; 768 is presumably the model's hidden size — TODO confirm.
dv = torch.ones((1, 1, 768))

In [23]:
# Apply the query projection to the probe and show the first 5 entries along the last axis.
att0.q_proj(dv)[..., :5]

tensor([[[-0.1969,  0.8568,  2.1782,  6.3826,  6.7941]]],
       grad_fn=<SliceBackward0>)

In [19]:
# Pull the query projection's weight tensor out as a NumPy array.
Q = att0.q_proj.weight.data.numpy()

In [33]:
# Inspect the query weights (NumPy elides the middle of large arrays when printing).
Q

array([[-0.3672, -0.165 , -0.168 , ...,  0.1807, -0.1206,  0.3535],
       [-0.1338, -0.1035, -0.0302, ..., -0.3633,  0.1514,  0.0171],
       [-0.0918,  0.0586,  0.3477, ...,  0.0938,  0.1226, -0.0103],
       ...,
       [ 0.6289, -0.0247, -0.209 , ..., -0.3496, -0.0153, -0.0674],
       [ 0.0542,  0.416 ,  0.2002, ...,  0.3223,  0.104 , -0.2852],
       [-0.3848, -0.0825, -0.3398, ...,  0.0432, -0.1436, -0.1582]],
      dtype=float32)

In [7]:
def Qcomp(QK, OV):
    """Return the Q-composition score of a QK matrix with an OV matrix.

    The score is ``frobnorm(QK.T @ OV)`` divided by the product of
    ``frobnorm(QK)`` and ``frobnorm(OV)``.

    Args:
        QK: matrix supporting ``.T`` and ``@`` (presumably the QK circuit of
            the later head -- confirm against callers).
        OV: matrix supporting ``@`` (presumably the OV circuit of the earlier
            head -- confirm against callers).

    Returns:
        The ratio of the composed norm to the product of the individual norms.
    """
    # Norm of the composed map, with QK transposed before composing.
    numerator = frobnorm(QK.T @ OV)
    # Normalizer: product of the two individual norms.
    denominator = frobnorm(QK) * frobnorm(OV)
    return numerator / denominator

def Kcomp(QK, OV):
    """Return the K-composition score of a QK matrix with an OV matrix.

    The score is ``frobnorm(QK @ OV)`` divided by the product of
    ``frobnorm(QK)`` and ``frobnorm(OV)``.

    Args:
        QK: matrix supporting ``@`` (presumably the QK circuit of the later
            head -- confirm against callers).
        OV: matrix supporting ``@`` (presumably the OV circuit of the earlier
            head -- confirm against callers).

    Returns:
        The ratio of the composed norm to the product of the individual norms.
    """
    # Norm of the composed map; unlike Qcomp, QK is used without a transpose.
    numerator = frobnorm(QK @ OV)
    # Normalizer: product of the two individual norms.
    denominator = frobnorm(QK) * frobnorm(OV)
    return numerator / denominator

def Vcomp(OVin, OVout):
    """Return the V-composition score between two OV matrices.

    The score is ``frobnorm(OVout @ OVin)`` divided by the product of
    ``frobnorm(OVout)`` and ``frobnorm(OVin)``.

    Args:
        OVin: matrix applied first in the product (right-hand factor).
        OVout: matrix applied second in the product (left-hand factor).

    Returns:
        The ratio of the composed norm to the product of the individual norms.
    """
    # Norm of the composed map: OVout is the left factor, OVin the right.
    numerator = frobnorm(OVout @ OVin)
    # Normalizer: product of the two individual norms.
    denominator = frobnorm(OVout) * frobnorm(OVin)
    return numerator / denominator

## Q-composition

$(Q^TK)^T(OV)$

$K^TQOV$

$K^T(QO)V$

This matches up the rows of $Q$ with the columns of $O$: each entry of $QO$ is the dot product of a row of $Q$ with a column of $O$.

## K-composition

$(W_{QK})(W_{OV})$


$(Q^TK)(OV)$

$Q^T(KO)V$

## V-composition

$(O_{\textrm{post}}V_{\textrm{post}})(O_{\textrm{pre}}V_{\textrm{pre}})$

$O_{\textrm{post}}(V_{\textrm{post}}O_{\textrm{pre}})V_{\textrm{pre}}$

# Simple layer norm

$(W_{QK})(W_{OV})/\sigma$


$f((W_{QK})(W_{OV})/\sigma) = f(W_{QK}W_{OV})/\sigma$


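$\frac{f((W_{QK})(W_{OV})/\sigma)}{f(W_{QK})f(W_{OV}/\sigma)} = \frac{f(W_{QK}W_{OV})/\sigma}{f(W_{QK})f(W_{OV})/\sigma} = \frac{f(W_{QK}W_{OV})}{f(W_{QK})f(W_{OV})}$

so the scalar $\sigma$ cancels and the composition score is unchanged.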
