In [1]:
from typing import Tuple, Optional, List
import math
import random
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F

In [4]:
SEED = 42
# Seed every RNG the notebook imports so results are reproducible.
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# Prefer the Apple-Silicon MPS backend, then CUDA, and finally fall back to
# CPU so the notebook also runs on machines that have no MPS backend.
_mps_backend = getattr(torch.backends, "mps", None)  # absent on old torch builds
if _mps_backend is not None and _mps_backend.is_available():
    DEVICE = torch.device("mps")
elif torch.cuda.is_available():
    DEVICE = torch.device("cuda")
else:
    DEVICE = torch.device("cpu")

In [5]:
# Display the torch device the notebook will use.
DEVICE

device(type='mps')

# 1. Intuition for attention

## 1.1 What is attention

我有三份信息（Values, V）要综合，比如 V1, V2, V3；我手上还有一组权重（weights），比如 0.7, 0.2, 0.1（加起来=1）。
那综合结果就是：0.7*V1 + 0.2*V2 + 0.1*V3。
——这就是注意力结果（只不过真正的权重不是手填，而是“算出来”的）。

In [6]:
def weighted_average(values, weights):
    """
    Combine N vectors into a single vector by a normalized weighted sum.

    Inputs:
    values: array-like, shape [N, D], N pieces of information, each a D-dimensional vector
    weights: array-like, shape [N], one weight per row of `values`
             (they need not sum to 1; they are normalized here)

    Outputs:
    out: np.ndarray, shape [D], = sum_i (weights[i] / sum(weights)) * values[i]
    """
    # np.asarray lets callers pass plain Python lists; ndarrays pass through
    # unchanged (no copy, same dtype).
    values = np.asarray(values)
    weights = np.asarray(weights)
    # normalization; the tiny epsilon avoids a division by zero when the weights sum to 0
    weights = weights / (weights.sum() + 1e-12)
    # weights[:, None] has shape [N, 1], so it broadcasts across the D columns
    # and scales each row of `values`; summing over axis 0 collapses the N rows.
    return (weights[:, None] * values).sum(axis=0)

# Three 2-D "value" vectors, one per row.
V = np.array(
    [
        [10.0, 0.0],   # V1
        [0.0, 10.0],   # V2
        [5.0, 5.0],    # V3
    ]
)
# Hand-picked weights; they already sum to 1.
w = np.array([0.7, 0.2, 0.1])
out = weighted_average(V, w)
print(out)

[7.5 2.5]


In [7]:
# Cross-check: because w already sums to 1, the weighted average is just the
# vector-matrix product w @ V.
w @ V

array([7.5, 2.5])

## 1.2 Where do the weights come from?

想法：

* 我有一个“问题”向量 Q（Query），表示“我现在想要什么”。

* 还有一堆“候选”向量 K（Keys），每个候选都有对应的 V（Values） 信息。

* 打分：用 Q 去和每个 K 做“相似度”（我们用点积，越像分越高）。

* 归一化：把这些分数做 softmax（指数归一化）→ 得到 0~1 的权重，且和=1。

* 带权平均：用这些权重去加权 V，得到输出。

这就是常说的 Scaled Dot-Product Attention（缩放点积注意力）

In [9]:
def softmax(x, axis=None):
    """
    Numerically stable softmax.

    inputs:
    x: array-like, shape [N] (higher-rank input is accepted as well)
    axis: None or int. None (default) normalizes over all elements, which for
          a 1-D input is the usual softmax. Pass e.g. axis=-1 to apply an
          independent softmax to every row of a batch of scores.

    output:
    p: np.ndarray, same shape as x; entries sum to 1 over `axis`
       (up to floating-point error)

    how it works:
    p[i] = exp(x[i]) / sum_j(exp(x[j]))
    """
    x = np.asarray(x)
    # Subtracting the max cancels out in the ratio, so the result is unchanged,
    # but it keeps exp() from overflowing on large scores.
    x = x - x.max(axis=axis, keepdims=True)
    e = np.exp(x)
    # After the shift the largest term is exp(0) = 1, so for finite inputs the
    # denominator is at least 1; the epsilon is only an extra safety margin.
    return e / (e.sum(axis=axis, keepdims=True) + 1e-12)

In [10]:
def attention_once(Q, K, V, scale=False):
    """
    Dot-product attention for a single query.

    inputs:
    Q: np.ndarray, shape [D], one query vector
    K: np.ndarray, shape [N, D], N keys
    V: np.ndarray, shape [N, Dv], the value that belongs to each key
    scale: bool, if True the scores are divided by sqrt(D) before the softmax
           ("scaled" dot-product attention). The default False keeps the raw
           dot products.

    output:
    out: np.ndarray, shape [Dv], attention output
    weights: np.ndarray, shape [N], attention weights (sum == 1)
    scores: np.ndarray, shape [N], the scores that were fed to softmax
            (already divided by sqrt(D) when scale=True)

    how it works:
    1) scores[i] = dot(Q, K[i])   (divided by sqrt(D) when scale=True)
    2) weights = softmax(scores)
    3) out = sum(weights[i] * V[i]) over i
    """

    # One dot product per key: [N, D] @ [D] -> [N]
    scores = K @ Q

    if scale:
        # math.sqrt returns a Python float, so dividing does not change the
        # dtype of `scores`.
        scores = scores / math.sqrt(np.shape(Q)[-1])

    weights = softmax(scores)

    out = weighted_average(V, weights)

    return out, weights, scores

In [11]:
# Query: "I want the information that lies along the x axis".
Q = np.array([1.0, 0.0])

# Keys, one per row.
K = np.array(
    [
        [1.0, 0.0],    # K1: identical to Q
        [0.7, 0.2],    # K2: roughly aligned with Q
        [-1.0, 0.0],   # K3: points in the opposite direction of Q
    ]
)

# Values, one per key.
V = np.array(
    [
        [10.0, 0.0],   # V1
        [0.0, 10.0],   # V2
        [5.0, 5.0],    # V3
    ],
    dtype=np.float32,
)

out, w, s = attention_once(Q, K, V)
print("打分 scores :", s)       # which key is most relevant (larger = more relevant)
print("权重 weights:", w)       # softmax turns the scores into a probability distribution (sums to 1)
print("输出 out    :", out)     # the resulting weighted average of the values

打分 scores : [ 1.   0.7 -1. ]
权重 weights: [0.53300543 0.39486013 0.07213444]
输出 out    : [5.69072648 4.30927352]


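Note: when the dimension D is large, the dot products grow in magnitude and the softmax saturates (almost all weight lands on one key). To counter this, multiply the scores by 1/sqrt(D) before the softmax — this is the "scaled" part of scaled dot-product attention.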

## 1.3 What is mask?

* Padding Mask（填充掩码）：补齐出来的 <pad> 位置是“无效的”，权重要变成 0。
做法：在分数上把这些位置加上一个超大负数（如 -1e9），softmax 后几乎就是 0。

* Causal Mask（因果掩码）：解码时不能看未来（当前位置只能看它前面），把“未来位置”也加上超大负数。

In [12]:
def apply_mask(scores, mask):
    """
    Hide selected positions from a subsequent softmax.

    Inputs:
    scores: array-like, shape [N], original scores (left unmodified)
    mask: array-like, shape [N], True (or any truthy value) marks a position to mask

    output:
    masked_score: np.ndarray, shape [N], copy of `scores` with the masked
                  positions set to -1e9

    how it works:
    Masked positions get a hugely negative score, so after softmax their
    weight is very close to 0.
    """

    # Work on a copy so the caller's scores stay untouched.
    masked = np.array(scores, copy=True)
    # Force a boolean mask: an integer 0/1 array would otherwise be read as
    # *indices* (fancy indexing) and silently mask the wrong positions.
    hide = np.asarray(mask, dtype=bool)
    masked[hide] = -1e9
    return masked

In [13]:
scores = np.array([3.0, 1.0, -2.0], dtype=np.float32)
mask   = np.array([False, True, False])  # the second position is padding, so it must be masked out
masked_scores = apply_mask(scores, mask)

In [14]:
# Compare the softmax weights before and after masking.
print("原分数:", scores, "-> softmax", softmax(scores))
print("打掩码:", masked_scores, "-> softmax", softmax(masked_scores))  # the masked (middle) weight becomes ~0

原分数: [ 3.  1. -2.] -> softmax [0.8756006  0.11849965 0.00589975]
打掩码: [ 3.e+00 -1.e+09 -2.e+00] -> softmax [0.9933072  0.         0.00669285]


## 1.4 Self-attention

关键：现在 Q/K/V 都来自同一句话的向量表示 X。
最简单的演示：我们先不做任何线性变换，直接令 Q=K=V=X（现实里会各自过一层线性投影，这里先别管）。

这样你能看到：每个词的位置会按相似度从其他词那里“取信息”（加权平均）。

In [15]:
def self_attention_minimal(X):
    """
    Minimal self-attention with Q = K = V = X (no learned projections).

    inputs:
    X: array-like, shape [T, D], one sequence of T vectors (each vector is D-dimensional)

    output:
    Y: np.ndarray, shape [T, D], self-attention output; row i is a new
       representation of position i, built as a weighted average of all positions.
       Y keeps X's dtype when X is floating point and is float64 otherwise.

    how it works:
    for every i:
        scores[i, j] = dot(X[i], X[j])
        weights[i] = softmax(scores[i])
        Y[i] = sum(weights[i, j] * X[j]) over j
    """

    X = np.asarray(X)
    T, _ = X.shape  # unpacking also enforces that X is 2-D

    # Store the result as floating point. np.zeros_like(X) would inherit an
    # integer dtype from integer input and silently truncate the averages.
    out_dtype = X.dtype if np.issubdtype(X.dtype, np.inexact) else np.float64
    Y = np.zeros(X.shape, dtype=out_dtype)

    for i in range(T):
        scores = X @ X[i]             # similarity of position i to every position, shape [T]
        weights = softmax(scores)     # attention weights for position i
        Y[i] = weighted_average(X, weights)
    return Y

In [16]:
# Re-seed so that X is the same every time this cell runs.
np.random.seed(0)
X = np.random.randn(4, 3).astype(np.float32)  # T=4 tokens, D=3 dimensions
Y = self_attention_minimal(X)
print("输入 X 形状:", X.shape)
print("输出 Y 形状:", Y.shape)

输入 X 形状: (4, 3)
输出 Y 形状: (4, 3)


In [17]:
# Input vectors, one row per position.
X

array([[ 1.7640524 ,  0.4001572 ,  0.978738  ],
       [ 2.2408931 ,  1.867558  , -0.9772779 ],
       [ 0.95008844, -0.1513572 , -0.10321885],
       [ 0.41059852,  0.14404356,  1.4542735 ]], dtype=float32)

In [19]:
# Self-attention output, one row per position (same shape as X).
Y

array([[ 1.7975295 ,  0.85911125,  0.3104185 ],
       [ 2.238525  ,  1.8615676 , -0.9702689 ],
       [ 1.7368875 ,  0.9578309 , -0.05566131],
       [ 1.0923121 ,  0.30132565,  1.0670953 ]], dtype=float32)