In [55]:
import torch
import numpy as np
from transformers import AutoModelForSeq2SeqLM

# Load the model
model = AutoModelForSeq2SeqLM.from_pretrained("google/flan-t5-base")

# Get all named parameters (weights + biases)
state_dict = model.state_dict()

# Convert all to NumPy
weights_numpy = {name: param.detach().cpu().numpy() for name, param in state_dict.items()}

# Optional: Print all weight names
for name in weights_numpy:
    print(name, weights_numpy[name].shape)


shared.weight (32128, 768)
encoder.embed_tokens.weight (32128, 768)
encoder.block.0.layer.0.SelfAttention.q.weight (768, 768)
encoder.block.0.layer.0.SelfAttention.k.weight (768, 768)
encoder.block.0.layer.0.SelfAttention.v.weight (768, 768)
encoder.block.0.layer.0.SelfAttention.o.weight (768, 768)
encoder.block.0.layer.0.SelfAttention.relative_attention_bias.weight (32, 12)
encoder.block.0.layer.0.layer_norm.weight (768,)
encoder.block.0.layer.1.DenseReluDense.wi_0.weight (2048, 768)
encoder.block.0.layer.1.DenseReluDense.wi_1.weight (2048, 768)
encoder.block.0.layer.1.DenseReluDense.wo.weight (768, 2048)
encoder.block.0.layer.1.layer_norm.weight (768,)
encoder.block.1.layer.0.SelfAttention.q.weight (768, 768)
encoder.block.1.layer.0.SelfAttention.k.weight (768, 768)
encoder.block.1.layer.0.SelfAttention.v.weight (768, 768)
encoder.block.1.layer.0.SelfAttention.o.weight (768, 768)
encoder.block.1.layer.0.layer_norm.weight (768,)
encoder.block.1.layer.1.DenseReluDense.wi_0.weight (2048

In [56]:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained("google/flan-t5-base")

In [57]:
input_text = "Translate English to French: Hello, how are you?"
inputs = tokenizer(input_text, return_tensors="pt")
input_ids = inputs["input_ids"]  # shape: (1, seq_len)


print(input_ids.shape)
print(input_ids)

torch.Size([1, 13])
tensor([[30355,    15,  1566,    12,  2379,    10,  8774,     6,   149,    33,
            25,    58,     1]])


In [58]:
embedding_matrix = weights_numpy["shared.weight"]  # (vocab_size, d_model)  shared.weight->shared embedding matrix
x_embed = embedding_matrix[input_ids.numpy()]      # shape: (1, seq_len, d_model)

print(embedding_matrix, embedding_matrix.shape)
print(" ")
print(x_embed, x_embed.shape)

[[-4.08053398e-01  1.27280390e+00 -1.33916453e-01 ... -8.01376700e-01
   3.92395234e+00  9.71753988e-03]
 [ 1.58270750e+01  7.19118977e+00  1.51406145e+01 ...  5.45081472e+00
  -2.59278812e+01  1.14962749e+01]
 [-1.84467244e+00 -8.89065981e-01 -1.29316864e+01 ...  4.10892248e+00
   5.05712175e+00 -3.76946950e+00]
 ...
 [ 5.66406250e-01 -1.35937500e+00 -1.19531250e+00 ...  1.46484375e-01
  -5.07812500e-01 -1.21093750e+00]
 [-8.90625000e-01  8.16406250e-01  6.36718750e-01 ... -1.20117188e-01
   9.49218750e-01 -1.59375000e+00]
 [-8.32031250e-01  1.82812500e+00  1.37695312e-01 ... -2.10937500e-01
  -1.18750000e+00 -5.11718750e-01]] (32128, 768)
 
[[[ 6.88593197e+00  2.83577561e+00  4.36377525e+00 ...  1.21415033e+01
   -3.79337239e+00  4.87758589e+00]
  [-6.19857572e-04  5.20903826e+00  6.24792004e+00 ... -3.83192611e+00
    1.55317676e+00  7.09706831e+00]
  [ 1.99395204e+00 -2.70621324e+00  2.15984192e+01 ...  8.31336975e+00
    4.03384495e+00 -4.74461937e+00]
  ...
  [-3.58882689e+00  6.

Encoder Block — Self-Attention (Layer 0)

In [64]:
#encoder layer weights
layer = model.encoder.block[0]
ln1_weight = layer.layer[0].layer_norm.weight.detach().numpy()  # (768,)
ln2_weight = layer.layer[1].layer_norm.weight.detach().numpy()  # (768,)

In [67]:
#LayerNorm before attention (incase of t5)
def layer_norm(x, weight):
    mean = x.mean(-1, keepdims=True)
    var = x.var(-1, keepdims=True)
    return (x - mean) / np.sqrt(var + 1e-6) * weight
x_norm = layer_norm(x_embed, ln1_weight)   #ln1_weight->LayerNorm weight before attention
print("After LayerNorm (attn):", x_norm.shape)

After LayerNorm (attn): (1, 13, 768)


In [68]:
weight_keys = [
    ("q_w", "encoder.block.0.layer.0.SelfAttention.q.weight"),
    ("k_w", "encoder.block.0.layer.0.SelfAttention.k.weight"),
    ("v_w", "encoder.block.0.layer.0.SelfAttention.v.weight"),
    ("o_w", "encoder.block.0.layer.0.SelfAttention.o.weight")
]

for name, key in weight_keys:
    weight = weights_numpy[key]
    print(f"{name} ({key}): shape = {weight.shape}")
    print(weight, "\n")


q_w (encoder.block.0.layer.0.SelfAttention.q.weight): shape = (768, 768)
[[-0.01452527  0.13774897  0.00905681 ... -0.05845838 -0.03574539
  -0.01240443]
 [ 0.06178693  0.01419278 -0.01175551 ... -0.06018291 -0.05362125
  -0.01797134]
 [ 0.00519627 -0.0433369  -0.05081028 ...  0.07576472 -0.04193865
   0.09032436]
 ...
 [-0.0037946   0.00894625  0.03839661 ...  0.04620769  0.00765166
  -0.00443411]
 [-0.02524992 -0.03872993  0.00699381 ...  0.04815986  0.02321322
  -0.03827596]
 [ 0.00546945 -0.00330857 -0.0223488  ...  0.02904797 -0.03176966
  -0.01060294]] 

k_w (encoder.block.0.layer.0.SelfAttention.k.weight): shape = (768, 768)
[[ 7.34305196e-03  6.61422253e-01  2.25334123e-01 ... -6.03573620e-01
  -2.29642674e-01 -2.23470833e-02]
 [ 3.04094225e-01 -2.41966367e-01  2.33838186e-02 ... -5.28845072e-01
   3.73502541e-03 -5.75851500e-01]
 [ 5.12275934e-01 -3.55967283e-01 -6.63109601e-01 ...  1.66478440e-01
  -9.13110852e-01  9.27042961e-01]
 ...
 [ 3.80679280e-01  2.78371155e-01  1.006

In [69]:
q_w = weights_numpy["encoder.block.0.layer.0.SelfAttention.q.weight"]
k_w = weights_numpy["encoder.block.0.layer.0.SelfAttention.k.weight"]
v_w = weights_numpy["encoder.block.0.layer.0.SelfAttention.v.weight"]
o_w = weights_numpy["encoder.block.0.layer.0.SelfAttention.o.weight"]


In [70]:
print(q_w.shape)   # (d_model, d_model)

(768, 768)


calculating q,k and v

In [71]:
q = np.matmul(x_norm, q_w.T)  # (1, seq_len, d_model)
k = np.matmul(x_norm, k_w.T)
v = np.matmul(x_norm, v_w.T)

In [72]:
print(q.shape)

(1, 13, 768)


Splitting into heads

In [73]:
# Split into heads: (1, seq_len, num_heads, head_dim) → transpose → (1, num_heads, seq_len, head_dim)
def split_heads(x, num_heads=12):
    batch, seq_len, d_model = x.shape
    return x.reshape(batch, seq_len, num_heads, d_model // num_heads).transpose(0, 2, 1, 3)

n_heads = model.config.num_heads
d_model = model.config.d_model
print(n_heads,d_model)
qh, kh, vh = split_heads(q), split_heads(k), split_heads(v)  # shape: (1, 8, seq_len, 64)
print(qh)

12 768
[[[[-3.81109118e-01 -1.53292790e-01  6.01664893e-02 ... -8.16766769e-02
    -2.12847963e-01  1.95742786e-01]
   [ 1.18703805e-02  2.89509892e-01 -1.27281249e-01 ...  1.54057488e-01
    -9.44998711e-02  5.56298308e-02]
   [ 1.91411395e-02 -5.63554354e-02  2.01860726e-01 ... -1.39355078e-01
    -2.29740977e-01 -1.97009236e-01]
   ...
   [-1.99008837e-01  2.38384664e-01 -2.11748570e-01 ...  1.18108671e-02
    -2.47325543e-02  1.80457339e-01]
   [-1.73620343e-01  1.13829672e-01 -1.54227585e-01 ... -3.55760306e-02
    -6.77405968e-02  1.35670789e-02]
   [ 5.31724393e-01  3.33801389e-01 -2.88328350e-01 ...  1.35146141e-01
     2.12402388e-01  2.46586755e-01]]

  [[ 3.44557762e-02  1.12452686e-01 -1.57856569e-03 ...  1.27684832e-01
     1.11308917e-01 -2.46218406e-02]
   [-6.60367981e-02  1.30099639e-01  1.96978062e-01 ... -7.55843520e-02
     9.16574448e-02  4.99827787e-04]
   [ 1.12402678e-01  2.75640581e-02 -3.37384641e-04 ...  8.61378461e-02
     1.43078789e-01 -1.16522662e-01]
   

In [74]:
print(qh.shape)

(1, 12, 13, 64)


In [None]:
Understanding the output as a 4d matrix

qh = [
  [  # batch 0
    [  # head 0
      [0.1, 0.2, 0.3, 0.4],   # token 0 vector (length4)
      [0.5, 0.6, 0.7, 0.8],   # token 1 vector
      [0.9, 1.0, 1.1, 1.2],   # token 2 vector
    ],
    [  # head 1
      [1.1, 1.2, 1.3, 1.4],   # token 0 vector
      [1.5, 1.6, 1.7, 1.8],   # token 1 vector
      [1.9, 2.0, 2.1, 2.2],   # token 2 vector
    ]
  ]
]


In [75]:
import numpy as np

def softmax(x, axis=None):
    e_x = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return e_x / e_x.sum(axis=axis, keepdims=True)


In [76]:
intermediate_scores=np.matmul(qh, kh.transpose(0,1,3,2))/np.sqrt(64)   #dk=dmodel/h

attention_weights=softmax(intermediate_scores, axis=-1)
context=np.matmul(attention_weights,vh)   ## shape: (1, 12, seq_len, 768/12=64)

print(context, context.shape)

[[[[-2.15860419e-01 -2.63770201e-01 -3.53238665e-02 ... -8.88691316e-03
     1.40825993e-01  3.43908950e-02]
   [-1.52835910e-01  7.10090714e-03  2.68882653e-02 ... -1.84767748e-02
     1.91802196e-02  5.47547546e-02]
   [-2.19141690e-01 -7.33091592e-02 -4.23792728e-02 ... -4.79674450e-02
     6.49535093e-02 -3.89163827e-02]
   ...
   [-1.39401007e-01  1.64586155e-02 -3.44427815e-02 ... -4.60245643e-02
    -2.74281160e-02 -1.67787249e-02]
   [-1.63442535e-01 -1.22903116e-02  8.10979404e-03 ...  1.10961323e-02
    -3.09907482e-02 -1.49625501e-03]
   [-1.46466427e-01 -1.29488365e-02 -4.96935594e-02 ... -7.01406105e-02
    -6.34402586e-02  1.26871507e-02]]

  [[-8.81407118e-02 -1.90159509e-01 -1.98428137e-02 ...  1.06672875e-01
     1.58228752e-01  3.20256440e-01]
   [-7.74717636e-02 -1.81456232e-01 -2.24622110e-02 ...  9.24443479e-02
     1.44092566e-01  3.07507770e-01]
   [-8.95914972e-02 -1.83051561e-01 -1.50604624e-02 ...  1.07630873e-01
     1.59088877e-01  3.17300166e-01]
   ...
   

In [77]:
print("qh shape:", qh.shape)
print("kh shape:", kh.shape)
print("vh shape:", vh.shape)
print("attention_weights shape:", attention_weights.shape)
print("context shape:", context.shape)


qh shape: (1, 12, 13, 64)
kh shape: (1, 12, 13, 64)
vh shape: (1, 12, 13, 64)
attention_weights shape: (1, 12, 13, 13)
context shape: (1, 12, 13, 64)


In [78]:
# Concatenate heads
context_concat = context.transpose(0, 2, 1, 3).reshape(1, -1, 768)

In [79]:
context_concat.shape

(1, 13, 768)

In [80]:
# Final attention projection
attn_output = np.matmul(context_concat, o_w.T)  # (1, seq_len, 768)

In [81]:
attn_output.shape

(1, 13, 768)

In [83]:
#Residual + LayerNorm before FFN
res1 = x_embed + attn_output
res1_norm = layer_norm(res1, ln2_weight)
print("After LayerNorm (FFN):", res1_norm.shape)



After LayerNorm (FFN): (1, 13, 768)


In [84]:
ff_layer = model.encoder.block[0].layer[1].DenseReluDense

wi_0 = ff_layer.wi_0.weight.detach().numpy()
wi_1 = ff_layer.wi_1.weight.detach().numpy()
wo    = ff_layer.wo.weight.detach().numpy()


In [85]:
wi_0.shape

(2048, 768)

In [86]:
wo.shape

(768, 2048)

In [88]:
#FFN(gated)
wi0_out = np.matmul(res1_norm, wi_0.T)
wi1_out = np.matmul(res1_norm, wi_1.T)
relu_out = np.maximum(0, wi1_out)
ffn_out = relu_out * wi0_out
ffn_proj = np.matmul(ffn_out, wo.T)
print("FFN output:", ffn_proj.shape)

FFN output: (1, 13, 768)


In [89]:
#Final encoder layer output
encoder_out = res1 + ffn_proj
print("Final encoder output shape:", encoder_out.shape)

Final encoder output shape: (1, 13, 768)


DECODER (ASSUMING GENERATED TOKEN IS "SOS BONJOUR COMMENT")

In [91]:
decoder_text = "<pad> bonjour comment"  # assuming <pad> or <s> token as SOS/start


In [94]:
# Tokenize decoder input text
decoder_inputs = tokenizer(decoder_text, return_tensors="pt")
decoder_input_ids = decoder_inputs.input_ids  # (1, seq_len)
print(decoder_input_ids.shape)
print(decoder_inputs)

torch.Size([1, 5])
{'input_ids': tensor([[    0,  2682, 18359,  1670,     1]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}


In [160]:
print(decoder_inputs.input_ids)
print([tokenizer.decode([tid]) for tid in decoder_inputs.input_ids[0]])


tensor([[    0,  2682, 18359,  1670,     1]])
['<pad>', 'bon', 'jour', 'comment', '</s>']


In [96]:
# Get decoder embedding weights from model
decoder_embedding_weights = model.decoder.embed_tokens.weight.detach().numpy()  # (vocab_size, d_model)
print(decoder_embedding_weights.shape)

(32128, 768)


In [97]:
# Convert input_ids to numpy
decoder_input_ids_np = decoder_input_ids.detach().numpy()

In [98]:
# Get embeddings for decoder input tokens
dec_embedded = decoder_embedding_weights[decoder_input_ids_np[0]]  # (seq_len, d_model)
dec_embedded = dec_embedded[np.newaxis, ...]  # Add batch dim: (1, seq_len, d_model)
print(dec_embedded.shape)

(1, 5, 768)


In [99]:
decoder_layer = model.decoder.block[0]

# LayerNorm weights
dec_ln1_weight = decoder_layer.layer[0].layer_norm.weight.detach().numpy()
dec_ln2_weight = decoder_layer.layer[1].layer_norm.weight.detach().numpy()
dec_ln3_weight = decoder_layer.layer[2].layer_norm.weight.detach().numpy()


# Self-attention weights
dec_self_q = decoder_layer.layer[0].SelfAttention.q.weight.detach().numpy()
dec_self_k = decoder_layer.layer[0].SelfAttention.k.weight.detach().numpy()
dec_self_v = decoder_layer.layer[0].SelfAttention.v.weight.detach().numpy()
dec_self_o = decoder_layer.layer[0].SelfAttention.o.weight.detach().numpy()

# Cross-attention weights
cross_attn_q = decoder_layer.layer[1].EncDecAttention.q.weight.detach().numpy()
cross_attn_k = decoder_layer.layer[1].EncDecAttention.k.weight.detach().numpy()
cross_attn_v = decoder_layer.layer[1].EncDecAttention.v.weight.detach().numpy()
cross_attn_o = decoder_layer.layer[1].EncDecAttention.o.weight.detach().numpy()

# Feedforward weights (gated)
wi0 = decoder_layer.layer[2].DenseReluDense.wi_0.weight.detach().numpy()
wi1 = decoder_layer.layer[2].DenseReluDense.wi_1.weight.detach().numpy()
wo = decoder_layer.layer[2].DenseReluDense.wo.weight.detach().numpy()


In [104]:
print("LayerNorm weights shapes:")
print("dec_ln1_weight:", dec_ln1_weight.shape)
print("dec_ln2_weight:", dec_ln2_weight.shape)
print("dec_ln3_weight:", dec_ln3_weight.shape)

print("\nSelf-attention weights shapes:")
print("dec_self_q:", dec_self_q.shape)
print("dec_self_k:", dec_self_k.shape)
print("dec_self_v:", dec_self_v.shape)
print("dec_self_o:", dec_self_o.shape)

print("\nCross-attention weights shapes:")
print("cross_attn_q:", cross_attn_q.shape)
print("cross_attn_k:", cross_attn_k.shape)
print("cross_attn_v:", cross_attn_v.shape)
print("cross_attn_o:", cross_attn_o.shape)

print("\nFeedforward weights (gated) shapes:")
print("wi0:", wi0.shape)
print("wi1:", wi1.shape)
print("wo:", wo.shape)


LayerNorm weights shapes:
dec_ln1_weight: (768,)
dec_ln2_weight: (768,)
dec_ln3_weight: (768,)

Self-attention weights shapes:
dec_self_q: (768, 768)
dec_self_k: (768, 768)
dec_self_v: (768, 768)
dec_self_o: (768, 768)

Cross-attention weights shapes:
cross_attn_q: (768, 768)
cross_attn_k: (768, 768)
cross_attn_v: (768, 768)
cross_attn_o: (768, 768)

Feedforward weights (gated) shapes:
wi0: (2048, 768)
wi1: (2048, 768)
wo: (768, 2048)


In [101]:
# LayerNorm before self-attention
dec_x_norm = layer_norm(dec_embedded, dec_ln1_weight)  # use same layer_norm function you have
print(dec_x_norm.shape)

(1, 5, 768)


In [113]:
# Linear projections for self-attention
Q = np.matmul(dec_x_norm, dec_self_q.T)
K = np.matmul(dec_x_norm, dec_self_k.T)
V = np.matmul(dec_x_norm, dec_self_v.T)
print(Q.shape)

(1, 5, 768)


In [115]:
Q, K, V = [split_heads(x, n_heads) for x in (Q, K, V)]

In [116]:
print(Q.shape)

(1, 12, 5, 64)


In [120]:
# Compute scaled dot-product attention scores
dk = Q.shape[-1]
scores = np.matmul(Q, K.transpose(0,1,3,2)) / np.sqrt(dk)
# Mask future tokens (causal mask)
seq_len = scores.shape[-1]
mask = np.triu(np.ones((seq_len, seq_len)), k=1).astype(bool)   #triu->upper triangle k=1=>true(where future tokens should be masked)
scores_masked = np.where(mask, -1e9, scores)    #all true is replaced with -1e9

print(scores_masked.shape)
attn_weights_dec=softmax(scores_masked)
print(attn_weights_dec.shape)

(1, 12, 5, 5)
(1, 12, 5, 5)


In [126]:
attn_output_dec=np.matmul(attn_weights_dec,V)
print(attn_output_dec.shape)

(1, 12, 5, 64)


In [127]:
attn_output_dec = attn_output_dec.transpose(0,2,1,3).reshape(dec_embedded.shape)
print(attn_output_dec.shape)

(1, 5, 768)


In [128]:
# Output projection
attn_out_proj_dec = np.matmul(attn_output_dec, dec_self_o.T)
print(attn_out_proj_dec.shape)

(1, 5, 768)


In [129]:
#Residual + LayerNorm after self-attention
res1 = dec_embedded + attn_out_proj_dec
res1_norm = layer_norm(res1, dec_ln2_weight)
print(res1.shape)

(1, 5, 768)


CROSS ATTENTION

In [135]:
# Linear projections for cross-attention (query from decoder residual norm)
Q_cross = np.matmul(res1_norm, cross_attn_q.T)
print(Q_cross.shape)

(1, 5, 768)


In [134]:
# Key, value from encoder output (already computed earlier)
# encoder_out shape: (batch_size, seq_len_enc, d_model)
K_cross = np.matmul(encoder_out, cross_attn_k.T)
V_cross = np.matmul(encoder_out, cross_attn_v.T)
print(K_cross.shape)
print(V_cross.shape)

(1, 13, 768)
(1, 13, 768)


In [136]:
# Reshape for multi-head
Q_cross = split_heads(Q_cross, n_heads)
K_cross = split_heads(K_cross, n_heads)
V_cross = split_heads(V_cross, n_heads)
print(Q_cross.shape,K_cross.shape)

(1, 12, 5, 64) (1, 12, 13, 64)


In [138]:
# Compute attention scores and softmax
scores_cross = np.matmul(Q_cross, K_cross.transpose(0,1,3,2)) / np.sqrt(dk)
print(scores_cross.shape)
attn_weights_cross = softmax(scores_cross)


(1, 12, 5, 13)


In [147]:
# Attention output
attn_output_cross = np.matmul(attn_weights_cross, V_cross)
print(attn_output_cross.shape)
# Reshape back and output projection
attn_output_cross = attn_output_cross.transpose(0,2,1,3).reshape(res1_norm.shape)
attn_out_proj_cross = np.matmul(attn_output_cross, cross_attn_o.T)
print(attn_out_proj_cross.shape)

(1, 12, 5, 64)
(1, 5, 768)


In [149]:
# (1, 12, 5, 64) -> (1, 5, 768)
attn_output_merged = attn_output_proj_cross.transpose(0, 2, 1, 3).reshape(1, 5, 768)
#Residual + LayerNorm after cross-attention
res2 = res1_norm + attn_output_merged
res2_norm = layer_norm(res2, dec_ln3_weight)
print(res2_norm.shape)

(1, 5, 768)


In [155]:
#FFN(GATED)
wi0_out_dec = np.matmul(res2_norm, wi0.T)
print(wi0_out_dec.shape)
wi1_out_dec = np.matmul(res2_norm, wi1.T)
print(wi1_out_dec.shape)
relu_out_dec = np.maximum(0, wi0_out_dec)
print(relu_out_dec.shape)
ffn_out_dec = relu_out_dec * wi1_out_dec
ffn_proj_dec = np.matmul(ffn_out_dec, wo.T)

print(ffn_proj_dec.shape)

# Final output of decoder layer 0
decoder_out = res2_norm + ffn_proj_dec
print("Decoder output shape:", decoder_out.shape)


(1, 5, 2048)
(1, 5, 2048)
(1, 5, 2048)
(1, 5, 768)
Decoder output shape: (1, 5, 768)


In [156]:
#Final LayerNorm
final_ln_weight = model.decoder.final_layer_norm.weight.detach().numpy()  # (768,)
decoder_out_norm = layer_norm(decoder_out, final_ln_weight)
print("Final normed decoder output:", decoder_out_norm.shape)  # (1, seq_len, 768)


Final normed decoder output: (1, 5, 768)


In [157]:
#Output projection to vocabulary/LINEAR LAYER AFTER DECODER
lm_head_weight = model.lm_head.weight.detach().numpy()  # (32128, 768)
print(lm_head_weight.shape)
logits = np.matmul(decoder_out_norm, lm_head_weight.T)  # (1, seq_len, vocab_size)
print("Logits shape:", logits.shape)


(32128, 768)
Logits shape: (1, 5, 32128)


In [158]:
#Pick the last token's logits and do argmax for predicting next token
last_logits = logits[0, -1]  # Shape: (vocab_size,)
next_token_id = np.argmax(last_logits)
print("Predicted token ID:", next_token_id)


Predicted token ID: 3003


In [159]:
predicted_token = tokenizer.decode([next_token_id])
print("Predicted next token:", predicted_token)


Predicted next token: poli
