In [1]:
import numpy as np

Generate toy dataset: random matrix $X$ of dim $N \times d$, where $N=800$ denotes number of tokens and $d = 100$ denotes dimensionality of embedding space.

In [2]:
# Toy problem size: N tokens, each embedded in d dimensions
N, d = 800, 100

# Seed the legacy global generator so the draw below is reproducible
np.random.seed(213)
X = np.random.standard_normal(size=(N, d))  # shape (N, d)

We will calculate cosine correlation between two random vectors $U$ and $V$ as:
$$
\operatorname{Coco}(U,V) = \mathbb{E} \left[ \overline{U} \cdot \overline{V} \right] - \mathbb{E} \left[ \overline{U} \right] \cdot \mathbb{E} \left[ \overline{V} \right]
$$

where
$$
\overline{U} = \frac{U}{||U||}, \quad \overline{V} = \frac{V}{||V||}
$$

Step 1: compute and cache normalized unpooled embeddings:
$$
\overline{X}_i = \frac{X_i}{||X_i||}
$$

where $\overline{X}_i$ is normalized embedding of $i$-th token, $i=1, 2, \dots, N$


In [3]:
# Row-wise Euclidean norms, kept as a column so they broadcast against X
norms = np.linalg.norm(X, axis=1, keepdims=True)  # shape (N, 1)

# Rows with zero norm get a divisor of 1, so they stay all-zero instead of becoming NaN
norms = np.where(norms == 0, 1.0, norms)

# Divide every row by its (guarded) norm
X_normalized = X / norms  # shape (N, d)

Step 2: compute and cache prefix sums:
$$
S_{k} = \sum_{i=1}^{k} \overline{X}_{i}
$$

for $k = 0, 1, \dots, N$, where $S_0 = 0$ is the empty sum. Each prefix sum $S_k$ is a $d$-dimensional vector.

In [4]:
# S[0] is the zero vector (empty sum); S[k] is the sum of the first k normalized rows
S = np.concatenate(
    [np.zeros((1, d)), np.cumsum(X_normalized, axis=0)],
    axis=0,
)  # shape (N+1, d)

Fix the lag $l = 5$.

In [5]:
l = 5  # lag: offset between the two tokens of each pair (i, i + l)
length = N - l  # number of (i, i + l) pairs that fit in a sequence of N tokens

Step 3: Calculate 
$$
\mathbb{E} \left[ \overline{U} \right] = \frac{S_{N-l} - S_0}{N - l} = \frac{1}{N - l} \sum_{i=1}^{N-l} \overline{X}_i
$$

and

$$
\mathbb{E} \left[ \overline{V} \right] = \frac{S_{N} - S_l}{N - l} = \frac{1}{N - l} \sum_{i=l+1}^{N} \overline{X}_i
$$

These expected values are $d$-dimensional vectors.

In [6]:
# Differences of prefix sums give the window sums without re-adding rows
sum_U = S[N - l] - S[0]  # sum of the first N-l normalized rows
sum_V = S[N] - S[l]  # sum of the last N-l normalized rows

E_U = sum_U / length  # shape (d,)
E_V = sum_V / length  # shape (d,)

Calculate dot product $\mathbb{E} \left[ \overline{U} \right] \cdot \mathbb{E} \left[ \overline{V} \right]$

In [7]:
# Inner product of the two mean vectors
EU_EV = E_U.dot(E_V)  # scalar

Step 4: Calculate
$$
\mathbb{E} \left[ \overline{U} \cdot \overline{V} \right] = \frac{1}{N-l} \sum_{i=1}^{N-l} \overline{X}_{i} \cdot \overline{X}_{i+l}
$$

In [8]:
# Pair each token with the token l positions later
U = X_normalized[:length]  # leading tokens, shape (N-l, d)
V = X_normalized[l:]  # lagged tokens, shape (N-l, d)

# Row-wise dot product of each pair, then the average over all pairs
dot_products = (U * V).sum(axis=1)  # shape (N-l,)
E_UV = dot_products.mean()  # scalar

Step 5: Return Coco.

In [9]:
# Coco(U, V) = E[U . V] - E[U] . E[V]; the bare name on the last line displays it
coco = E_UV - EU_EV
coco

-0.0018225311183616708