In [1]:
import numpy as np
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoModelForCausalLM

In [3]:
model = AutoModelForCausalLM.from_pretrained(
    "EleutherAI/gpt-neo-125M")

In [4]:
type(model)

transformers.models.gpt_neo.modeling_gpt_neo.GPTNeoForCausalLM

In [5]:
att0 = model.transformer.h[0].attn.attention

In [6]:
att0

GPTNeoSelfAttention(
  (attn_dropout): Dropout(p=0, inplace=False)
  (resid_dropout): Dropout(p=0, inplace=False)
  (k_proj): Linear(in_features=768, out_features=768, bias=False)
  (v_proj): Linear(in_features=768, out_features=768, bias=False)
  (q_proj): Linear(in_features=768, out_features=768, bias=False)
  (out_proj): Linear(in_features=768, out_features=768, bias=True)
)

In [7]:
dummy_vectors = torch.ones(1, 768)

In [8]:
att0.k_proj(dummy_vectors).size()

torch.Size([1, 768])

In [9]:
K = att0.k_proj.weight.data.numpy()

In [10]:
(K @ np.ones((768, 1)))[:5]

array([[ 1.98204589],
       [ 5.55552197],
       [-3.39244461],
       [ 4.58187103],
       [ 0.16510725]])

In [11]:
(np.ones((768, 1)).T @ K.T).flatten()[:5]

array([ 1.98204589,  5.55552197, -3.39244461,  4.58187103,  0.16510725])

In [12]:
(np.ones((1, 768)) @ K).ravel()[:5]

array([  6.78366566,   5.0222044 , -18.74901867,  -1.27867794,
        -7.93630981])

In [13]:
att0.k_proj(dummy_vectors).ravel()[:5]

tensor([ 1.9820,  5.5555, -3.3924,  4.5819,  0.1651], grad_fn=<SliceBackward0>)

In [14]:
Q = att0.q_proj.weight.data.numpy()
V = att0.v_proj.weight.data.numpy()
O = att0.out_proj.weight.data.numpy()

In [15]:
Ob = att0.out_proj.bias.data.numpy()

In [16]:
k = att0.k_proj(dummy_vectors)
q = att0.q_proj(dummy_vectors)
v = att0.v_proj(dummy_vectors)

In [17]:
torch.matmul(q, k.transpose(-1, -2))

tensor([[2493.6262]], grad_fn=<MmBackward0>)

In [18]:
k.size(), q.size()

(torch.Size([1, 768]), torch.Size([1, 768]))

In [19]:
K.sum(1).T @ Q.sum(1)

2493.626

In [20]:
K.sum(0).T @ Q.sum(0)

39180.973

In [21]:
(Q.T @ K).sum()

2493.6265

In [22]:
QK = Q.T @ K

In [23]:
QK.shape

(768, 768)

# ^ Extracting $W_{QK}$

This is for ALL heads at once though, so not the most useful...

In [24]:
def headQK(Q, K, i):
    Qh = Q[i*64:(i+1)*64, :]
    Kh = K[i*64:(i+1)*64, :]
    
    if Qh.size == 0:
        raise ValueError(f"head {i} doesn't exist")
    
    return Qh.T @ Kh

In [25]:
(K @ np.ones((768, 1)))[:5]

array([[ 1.98204589],
       [ 5.55552197],
       [-3.39244461],
       [ 4.58187103],
       [ 0.16510725]])

In [26]:
headQK(Q, K, 0).shape

(768, 768)

In [27]:
headQK(Q, K, 11).shape

(768, 768)

In [28]:
try:
    headQK(Q, K, 12).shape
except ValueError as e:
    print("this works")

this works


cool

# v Extracting $W_{OV}$

In [29]:
v.shape

torch.Size([1, 768])

### Trying to reproduce the full attention result with more vectors

In [30]:
torch.manual_seed(32789)
np.random.seed(433485)

In [31]:
dummy_vectors = torch.rand(1, 5, 768)

In [32]:
dummy_vectors[:, :, :5]

tensor([[[0.4426, 0.3850, 0.6476, 0.4080, 0.6082],
         [0.0715, 0.6252, 0.6434, 0.7473, 0.2203],
         [0.1720, 0.8040, 0.5850, 0.8090, 0.6764],
         [0.3968, 0.2926, 0.8964, 0.9616, 0.1189],
         [0.0932, 0.1908, 0.2857, 0.6376, 0.1506]]])

In [33]:
q = att0.q_proj(dummy_vectors)
k = att0.k_proj(dummy_vectors)
v = att0.v_proj(dummy_vectors)

In [34]:
q.shape

torch.Size([1, 5, 768])

In [35]:
a = [1, 2, 3, 4, 5]
a[-1]

5

# Here's where we define $a_o$ and $a_w$

In [36]:
ao, aw = att0._attn(q[..., :64], k[..., :64], v[..., :64])

In [37]:
ao

tensor([[[[ 2.6369e+00, -8.6822e-01, -9.5641e-01,  2.4312e+00,  3.9608e+00,
            3.5235e+00,  1.0463e+00,  5.3936e+00,  2.6734e+00, -3.2134e+00,
           -3.3753e+00,  3.7295e+00, -1.5161e+00, -1.3766e+01,  1.3192e+00,
            3.2057e+00, -5.9184e-01,  1.4298e+01,  3.6049e-01,  2.0203e+00,
            1.0576e+00, -1.3702e+00,  5.1527e-01,  2.1878e-01,  2.6023e+00,
           -1.6464e+00, -3.9201e+00,  2.2758e+00, -7.7673e-01,  6.6421e-01,
            3.1096e+00,  5.0760e+00,  2.9606e+00, -7.2751e+00,  1.4117e+00,
            1.4039e+00,  1.2262e-02, -2.3922e+00,  1.2904e+00,  4.0702e-01,
            2.3376e+00, -8.8123e-01,  6.1847e+00, -5.3021e+00, -1.1477e+00,
           -1.1825e+00, -7.8987e+00,  1.6894e-02, -4.6824e+00, -2.2439e+00,
           -1.9134e+00,  4.0217e+00, -1.2610e+00,  2.9178e+00,  2.7388e-01,
            1.1905e+00, -6.1900e-01,  1.2203e+00, -1.1376e-01, -4.4278e+00,
            7.8752e-01, -1.7881e+00, -7.6964e-01, -1.7577e+00],
          [ 2.3070e-01, 

In [38]:
aw

tensor([[[[1., 0., 0., 0., 0.],
          [0., 1., 0., 0., 0.],
          [0., 0., 1., 0., 0.],
          [0., 0., 0., 1., 0.],
          [0., 0., 0., 0., 1.]]]], grad_fn=<SoftmaxBackward0>)

In [39]:
result = att0(dummy_vectors)[0]

In [40]:
result

tensor([[[ -7.4073, -12.8660,  13.9407,  ...,  -3.7682, -18.6677,   2.4210],
         [  2.4488, -27.5220,  14.8699,  ...,  -3.7165, -42.1507, -21.8148],
         [ -0.0990,  18.7272,  23.4550,  ..., -16.8254, -28.2660,  -1.1647],
         [-23.3457,   0.8343,  24.8481,  ..., -28.3942, -32.2080,   5.3194],
         [ -6.7265,   0.2108,  -7.9686,  ..., -16.8513, -24.6491, -18.2180]]],
       grad_fn=<AddBackward0>)

#### Attention weights given $W_{QK}$

In [41]:
dvs = dummy_vectors.numpy()

In [42]:
v0 = dvs[0, 0, :].ravel()
qk_00 = v0.T @ headQK(Q, K, 0) @ v0

In [43]:
qk_00

39.80802

In [44]:
dvs.shape

(1, 5, 768)

In [45]:
dvs.T.shape

(768, 5, 1)

In [46]:
V.shape

(768, 768)

In [47]:
vs = (V @ dvs.transpose((2, 1, 0))[..., 0])

In [48]:
vs.shape

(768, 5)

In [49]:
vs_h0 = vs[:64, :]

In [50]:
qks = (dvs[0, ...] @ headQK(Q, K, 0) @ dvs[0, ...].T).T

In [51]:
qks

array([[  39.808083 ,  -14.71413  ,   20.835667 ,   -7.58498  ,
         -34.04499  ],
       [ -28.444506 ,  190.07964  ,  -16.684322 ,   41.232113 ,
           6.670292 ],
       [-180.83084  ,  -70.58137  ,  152.18175  , -133.6292   ,
        -149.92896  ],
       [ -17.040405 ,   -1.97789  ,  -36.64063  ,  233.8287   ,
          46.086235 ],
       [ -50.07481  ,    9.7430725,    1.6481285,  -39.96372  ,
         212.09746  ]], dtype=float32)

In [52]:
causal_mask = att0.bias[0, 0, :5, :5].numpy()

In [53]:
causal_mask

array([[1, 0, 0, 0, 0],
       [1, 1, 0, 0, 0],
       [1, 1, 1, 0, 0],
       [1, 1, 1, 1, 0],
       [1, 1, 1, 1, 1]], dtype=uint8)

In [54]:
my_att_weights_ = np.where(causal_mask == 1, qks, -1e9)

In [55]:
my_att_weights = torch.nn.functional.softmax(torch.tensor(my_att_weights_), dim=-1)
my_att_weights

tensor([[1., 0., 0., 0., 0.],
        [0., 1., 0., 0., 0.],
        [0., 0., 1., 0., 0.],
        [0., 0., 0., 1., 0.],
        [0., 0., 0., 0., 1.]])

In [56]:
v

tensor([[[ 2.6369, -0.8682, -0.9564,  ..., -0.0787, -0.0332, -0.6798],
         [ 0.2307, -1.1227, -2.9872,  ..., -2.6758, -0.8860, -0.7865],
         [-0.1502, -3.2117,  2.3664,  ...,  0.3992,  1.7001, -3.5451],
         [ 3.1610, -2.0758, -1.8201,  ..., -2.1901,  2.8392, -4.3925],
         [-0.6735, -3.9545, -1.0367,  ..., -1.8553,  1.9540, -1.6800]]],
       grad_fn=<UnsafeViewBackward0>)

In [57]:
v[..., :64][0, :5, :5]

tensor([[ 2.6369, -0.8682, -0.9564,  2.4312,  3.9608],
        [ 0.2307, -1.1227, -2.9872,  5.1197,  3.9424],
        [-0.1502, -3.2117,  2.3664,  3.6970,  2.7854],
        [ 3.1610, -2.0758, -1.8201,  2.9269,  2.0704],
        [-0.6735, -3.9545, -1.0367,  1.0544,  1.2172]],
       grad_fn=<SliceBackward0>)

In [58]:
ao[0, 0, :5, :5]

tensor([[ 2.6369, -0.8682, -0.9564,  2.4312,  3.9608],
        [ 0.2307, -1.1227, -2.9872,  5.1197,  3.9424],
        [-0.1502, -3.2117,  2.3664,  3.6970,  2.7854],
        [ 3.1610, -2.0758, -1.8201,  2.9269,  2.0704],
        [-0.6735, -3.9545, -1.0367,  1.0544,  1.2172]],
       grad_fn=<SliceBackward0>)

This was a trivial test, the attention weights were the identity matrix.

In [59]:
dvs2 = torch.tensor(np.arange(5*768).reshape(1, 5, 768).astype(np.float32))

In [60]:
q = att0.q_proj(dvs2)[:64]
k = att0.k_proj(dvs2)[:64]
v = att0.v_proj(dvs2)[:64]

In [61]:
ao, aw = att0._attn(q, k, v)

In [62]:
ao[0, 0, :5, :5]

tensor([[  -870.5228,  -1384.9580,   -794.9573,   1771.1262,   3062.9673],
        [ -1461.8575,  -4491.4492,  -2570.6619,   5836.2285,   8555.7871],
        [ -2053.1919,  -7597.9360,  -4346.3662,   9901.3301,  14048.6094],
        [ -2644.5251, -10704.4316,  -6122.0713,  13966.4355,  19541.4336],
        [ -3235.8691, -13810.9219,  -7897.7725,  18031.5371,  25034.2539]],
       grad_fn=<SliceBackward0>)

In [63]:
aw

tensor([[[[1., 0., 0., 0., 0.],
          [0., 1., 0., 0., 0.],
          [0., 0., 1., 0., 0.],
          [0., 0., 0., 1., 0.],
          [0., 0., 0., 0., 1.]]]], grad_fn=<SoftmaxBackward0>)

The above dummy vectors are too dissimilar. I'll try generating others by anchoring off of a "base" vector.

In [64]:
np.random.seed(897)
#dvs = dvs.detach().numpy()
base_dv = dvs[0, 0, :]

dvs[0, 1:, :] = np.random.randn(4, 768)*0.05 + base_dv

In [65]:
dvs = torch.tensor(dvs)

In [66]:
q = att0.q_proj(dvs)[..., :64]
k = att0.k_proj(dvs)[..., :64]
v = att0.v_proj(dvs)[..., :64]

In [67]:
ao, aw = att0._attn(q, k, v)

In [68]:
ao[0, 0, :, :5]

tensor([[ 2.6369, -0.8682, -0.9564,  2.4312,  3.9608],
        [ 2.8016, -0.9862, -0.8158,  2.1715,  3.7854],
        [ 2.4419, -1.3797, -1.3612,  2.1417,  4.4184],
        [ 2.7043, -0.8400, -1.0832,  2.2001,  3.9844],
        [ 2.4163, -1.0646, -0.4790,  1.6161,  3.5275]],
       grad_fn=<SliceBackward0>)

In [69]:
aw

tensor([[[[1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [1.4832e-06, 1.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
          [5.8861e-06, 5.7454e-02, 9.4254e-01, 0.0000e+00, 0.0000e+00],
          [1.6501e-04, 3.2934e-01, 1.9300e-01, 4.7750e-01, 0.0000e+00],
          [1.0361e-06, 6.3796e-04, 1.5368e-03, 7.5749e-06, 9.9782e-01]]]],
       grad_fn=<SoftmaxBackward0>)

In [70]:
np.round(aw.detach().numpy(), 3)

array([[[[1.   , 0.   , 0.   , 0.   , 0.   ],
         [0.   , 1.   , 0.   , 0.   , 0.   ],
         [0.   , 0.057, 0.943, 0.   , 0.   ],
         [0.   , 0.329, 0.193, 0.477, 0.   ],
         [0.   , 0.001, 0.002, 0.   , 0.998]]]], dtype=float32)

In [71]:
aw.shape

torch.Size([1, 1, 5, 5])

In [72]:
torch.matmul(aw, v)[..., :5]

tensor([[[[ 2.6369, -0.8682, -0.9564,  2.4312,  3.9608],
          [ 2.8016, -0.9862, -0.8158,  2.1715,  3.7854],
          [ 2.4419, -1.3797, -1.3612,  2.1417,  4.4184],
          [ 2.7043, -0.8400, -1.0832,  2.2001,  3.9844],
          [ 2.4163, -1.0646, -0.4790,  1.6161,  3.5275]]]],
       grad_fn=<SliceBackward0>)

In [73]:
np.round(aw.detach().numpy() @ v.detach().numpy(), 4)[..., :5]

array([[[[ 2.6369, -0.8682, -0.9564,  2.4312,  3.9608],
         [ 2.8016, -0.9862, -0.8158,  2.1715,  3.7854],
         [ 2.4419, -1.3797, -1.3612,  2.1417,  4.4184],
         [ 2.7043, -0.84  , -1.0832,  2.2001,  3.9844],
         [ 2.4163, -1.0646, -0.479 ,  1.6161,  3.5275]]]], dtype=float32)

This settles how V is applied. We'll need to compute the full layer input to do the same for O

In [74]:
att0.num_heads, att0.head_dim

(12, 64)

In [75]:
# "full" projections
qf = att0.q_proj(dvs)
kf = att0.k_proj(dvs)
vf = att0.v_proj(dvs)

qf = att0._split_heads(qf, 12, 64)
kf = att0._split_heads(kf, 12, 64)
vf = att0._split_heads(vf, 12, 64)

aof, awf = att0._attn(qf, kf, vf)

ao_ = att0._merge_heads(aof, 12, 64)
of = att0.out_proj(ao_)

In [76]:
np.round(of.detach().numpy(), 4)

array([[[ -7.4073, -12.866 ,  13.9407, ...,  -3.7682, -18.6677,
           2.421 ],
        [ -7.2214, -10.7088,  12.5017, ...,  -5.3761, -15.8355,
          -0.8125],
        [ -8.3059, -15.2888,  13.3901, ...,  -3.6739, -16.7572,
           1.5735],
        [ -2.9528, -12.7843,  16.0387, ...,  -4.7691, -15.5552,
           0.2245],
        [ -6.9189, -12.1247,  15.26  , ...,  -3.9072, -15.6968,
          -0.1535]]], dtype=float32)

In [77]:
np.round(((O @ ao_.detach().numpy()[0, ...].T) + Ob[:, np.newaxis]).T, 4)

array([[ -7.4073, -12.866 ,  13.9407, ...,  -3.7682, -18.6677,   2.421 ],
       [ -7.2214, -10.7088,  12.5016, ...,  -5.3761, -15.8355,  -0.8125],
       [ -8.3059, -15.2888,  13.3901, ...,  -3.6739, -16.7572,   1.5735],
       [ -2.9528, -12.7843,  16.0387, ...,  -4.7691, -15.5552,   0.2245],
       [ -6.9189, -12.1247,  15.26  , ...,  -3.9072, -15.6968,  -0.1535]],
      dtype=float32)

In [78]:
# This should be the output for the first head
np.round(O[:, :64] @ ao_.detach().numpy()[0, :, :64].T, 4).T

array([[-1.5928,  2.6477, -0.7921, ..., -5.4147, -1.7955, -3.4176],
       [-1.4912,  3.1833, -0.3857, ..., -5.9054, -2.0518, -3.9084],
       [-1.9131,  2.4385, -0.5308, ..., -5.8337, -1.7603, -3.7268],
       [-1.498 ,  2.761 , -0.6012, ..., -5.5093, -1.8154, -3.7578],
       [-1.8538,  3.2266, -0.4811, ..., -5.1514, -1.6286, -3.2996]],
      dtype=float32)

In [79]:
aof[0, 0, :, :5]

tensor([[ 2.6369, -0.8682, -0.9564,  2.4312,  3.9608],
        [ 2.8016, -0.9862, -0.8158,  2.1715,  3.7854],
        [ 2.4419, -1.3797, -1.3612,  2.1417,  4.4184],
        [ 2.7043, -0.8400, -1.0832,  2.2001,  3.9844],
        [ 2.4163, -1.0646, -0.4790,  1.6161,  3.5275]],
       grad_fn=<SliceBackward0>)

In [80]:
ao_[..., :5]

tensor([[[ 2.6369, -0.8682, -0.9564,  2.4312,  3.9608],
         [ 2.8016, -0.9862, -0.8158,  2.1715,  3.7854],
         [ 2.4419, -1.3797, -1.3612,  2.1417,  4.4184],
         [ 2.7043, -0.8400, -1.0832,  2.2001,  3.9844],
         [ 2.4163, -1.0646, -0.4790,  1.6161,  3.5275]]],
       grad_fn=<SliceBackward0>)

In [81]:
O.shape, v.shape, aw.shape

((768, 768), torch.Size([1, 5, 64]), torch.Size([1, 1, 5, 5]))

In [82]:
O[:, :64] @ v[0].detach().numpy().T @ aw[0, 0].detach().numpy()

array([[-1.5930438 , -2.0400388 , -2.085928  , -0.6324649 , -1.849832  ],
       [ 2.6481528 ,  4.1852603 ,  2.765938  ,  1.2503333 ,  3.2208989 ],
       [-0.7922147 , -0.6721544 , -0.6589091 , -0.3699021 , -0.48001918],
       ...,
       [-5.4155846 , -7.925419  , -6.4878674 , -2.4384959 , -5.138584  ],
       [-1.7958397 , -2.7068279 , -1.9695266 , -0.80307204, -1.6246333 ],
       [-3.4182389 , -5.3330164 , -4.215799  , -1.7529831 , -3.2913306 ]],
      dtype=float32)

^ This is just transposed

In [88]:
np.round(aw[0, 0].detach().numpy() @ v[0].detach().numpy() @ O[:, :64].T, 4)

array([[-1.5928,  2.6477, -0.7921, ..., -5.4147, -1.7955, -3.4176],
       [-1.4912,  3.1833, -0.3857, ..., -5.9054, -2.0518, -3.9084],
       [-1.9131,  2.4385, -0.5308, ..., -5.8337, -1.7603, -3.7268],
       [-1.498 ,  2.761 , -0.6012, ..., -5.5093, -1.8154, -3.7578],
       [-1.8538,  3.2266, -0.4811, ..., -5.1514, -1.6286, -3.2996]],
      dtype=float32)

Just working $V$ (the parameters) back in instead of $v$ (projected values)

In [93]:
aw_ = aw[0, 0].detach().numpy()
dvs_ = dvs.detach().numpy()[0]

In [94]:
dvs_.shape

(5, 768)

In [105]:
np.round(aw_ @ dvs_ @ V[:64, :].T @ O[:, :64].T, 4)

array([[-1.5928,  2.6477, -0.7921, ..., -5.4147, -1.7955, -3.4176],
       [-1.4912,  3.1833, -0.3857, ..., -5.9054, -2.0518, -3.9084],
       [-1.9131,  2.4385, -0.5308, ..., -5.8337, -1.7603, -3.7268],
       [-1.498 ,  2.761 , -0.6012, ..., -5.5093, -1.8154, -3.7578],
       [-1.8538,  3.2266, -0.4811, ..., -5.1514, -1.6286, -3.2996]],
      dtype=float32)

:tada: If we define:  
$l$ = sequence length  
$d$ = hidden/embedding dimension  
`dvs_` = $x^T \in \mathbb{R}^{l \times d}$  
`O` = $O \in \mathbb{R}^{d \times d}$ = O parameters for a layer  
`V` = $V \in \mathbb{R}^{d \times d}$ = V parameters for a layer  
$O_i$ = $O$ parameters for head $i$
$V_i$ = $V$ parameters for head $i$
$A_i$ = attention matrix for head $i$  
  
then  
$O_0 = O[:, :64]$,  
$V_0 = V[:64, :]$,  
and the output for head 0 is given by  
$O[:, :64] V[:64, :] x A_0$

In [106]:
def headOV(O, V, i):
    Oh = O[:, i*64:(i+1)*64]
    Vh = V[i*64:(i+1)*64, :]
    
    if Oh.size == 0:
        raise ValueError(f"head {i} doesn't exist")
    
    return Oh @ Vh

In [109]:
np.round(headOV(O, V, 0) @ dvs_.T @ aw_.T, 4).T

array([[-1.5928,  2.6477, -0.7921, ..., -5.4147, -1.7955, -3.4176],
       [-1.4912,  3.1833, -0.3857, ..., -5.9054, -2.0518, -3.9084],
       [-1.9131,  2.4385, -0.5308, ..., -5.8337, -1.7603, -3.7268],
       [-1.498 ,  2.761 , -0.6012, ..., -5.5093, -1.8154, -3.7578],
       [-1.8538,  3.2266, -0.4811, ..., -5.1514, -1.6286, -3.2996]],
      dtype=float32)

## Checking results for head 1

In [110]:
# This should be the output for the *second* head
np.round(O[:, 64:128] @ ao_.detach().numpy()[0, :, 64:128].T, 4).T

array([[ -4.4234, -12.9948,   2.612 , ...,   4.5413,  -5.7427,   4.2666],
       [ -4.4408, -12.6069,   2.401 , ...,   4.2663,  -5.6007,   4.042 ],
       [ -4.4266, -12.9238,   2.5734, ...,   4.4909,  -5.7167,   4.2255],
       [ -4.4299, -12.8492,   2.5328, ...,   4.4381,  -5.6894,   4.1823],
       [ -5.0052, -12.4906,   3.0329, ...,   3.6849,  -5.3873,   3.8877]],
      dtype=float32)

In [120]:
# attention weights given $W_{QK}$ above
aw1_ = dvs_ @ headQK(Q, K, 1) @ dvs_.T
aw1 = torch.nn.functional.softmax(
    # raw weights with causal mask
    torch.tensor(np.where(causal_mask == 1, aw1_, -1e9)),
    dim=-1,
).numpy()

In [122]:
np.round(headOV(O, V, 1) @ dvs_.T @ aw1.T, 4).T

array([[ -4.4234, -12.9948,   2.612 , ...,   4.5413,  -5.7427,   4.2666],
       [ -4.4408, -12.6069,   2.401 , ...,   4.2663,  -5.6007,   4.042 ],
       [ -4.4266, -12.9238,   2.5734, ...,   4.491 ,  -5.7167,   4.2255],
       [ -4.4299, -12.8492,   2.5329, ...,   4.4381,  -5.6894,   4.1823],
       [ -5.0052, -12.4906,   3.0329, ...,   3.6849,  -5.3873,   3.8877]],
      dtype=float32)