Demonstration: stacking multiple **linear** layers (each with a bias, but no activations) 
is mathematically equivalent to a single affine (linear + bias) transformation; numerically, the two agree up to floating-point rounding.

In [11]:
import numpy as np

In [12]:
# Single seeded generator shared by every random draw below, so reruns reproduce the same numbers.
rng = np.random.default_rng(seed=42)

In [13]:
# Problem sizes used throughout the demo
n_samples, d_in, d_out = 8, 5, 3  # batch size, input dimension, output dimension
hidden = [7, 6, 4]                # widths of the three "hidden" linear layers (no activations)

In [14]:
# Dummy input batch: one row per sample, entries drawn from a standard normal
X = rng.normal(loc=0.0, scale=1.0, size=(n_samples, d_in))

In [15]:
# Build random linear layers (weights and biases)
def random_layer(in_dim, out_dim, scale=0.5, generator=None):
    """Draw the parameters of one random affine layer.

    Parameters
    ----------
    in_dim : int
        Number of input features (rows of the weight matrix).
    out_dim : int
        Number of output features (columns of the weight matrix, length of the bias).
    scale : float, default 0.5
        Standard deviation of the zero-mean normal distribution used for
        both the weights and the bias.
    generator : numpy.random.Generator, optional
        Source of randomness. When omitted, the notebook-level ``rng`` is
        used, which matches the original behaviour. Passing an explicit
        generator makes the function usable (and reproducible) on its own.

    Returns
    -------
    W : ndarray of shape (in_dim, out_dim)
    b : ndarray of shape (out_dim,)
    """
    # Fall back to the shared notebook generator only when none was supplied.
    gen = rng if generator is None else generator
    # W is drawn before b; keeping this order keeps the random stream unchanged.
    W = gen.normal(scale=scale, size=(in_dim, out_dim))
    b = gen.normal(scale=scale, size=(out_dim,))
    return W, b

In [16]:
# Consecutive entries of `dims` give the (input, output) size of each layer.
dims = [d_in, *hidden, d_out]
Ws, bs = [], []
for in_dim, out_dim in zip(dims[:-1], dims[1:]):
    W, b = random_layer(in_dim, out_dim)
    Ws.append(W)
    bs.append(b)

In [17]:
# Forward pass through the stack of linear layers (no activations)
def forward_stack(X, Ws, bs):
    """Apply a sequence of affine layers to ``X``, one after another.

    Parameters
    ----------
    X : ndarray
        Input batch; in this notebook it has shape (n_samples, d_in).
    Ws : sequence of ndarray
        Weight matrices, applied in order as ``H @ W``.
    bs : sequence of ndarray
        Bias vectors, one per weight matrix, added after each product.

    Returns
    -------
    ndarray
        Output of the last layer. With no layers at all, ``X`` itself is returned.

    Raises
    ------
    ValueError
        If ``Ws`` and ``bs`` do not contain the same number of entries.
    """
    Ws, bs = list(Ws), list(bs)
    # zip() would silently drop the unmatched layers, so reject a mismatch explicitly.
    if len(Ws) != len(bs):
        raise ValueError(
            f"Ws and bs must have the same length, got {len(Ws)} and {len(bs)}"
        )
    H = X
    for W, b in zip(Ws, bs):
        H = H @ W + b  # affine map
    return H

In [18]:
# Reference result: push X through every layer in turn.
Y_stack = forward_stack(X, Ws=Ws, bs=bs)

In [19]:
# Collapse the stack into a single affine map: Y = X @ W_eff + b_eff
def collapse_affine(Ws, bs):
    """Fold a stack of affine layers into one equivalent weight/bias pair.

    Parameters
    ----------
    Ws : sequence of ndarray
        Weight matrices in application order; must contain at least one entry.
    bs : sequence of ndarray
        Bias vectors, one per weight matrix.

    Returns
    -------
    W_eff : ndarray
        Product of all weight matrices, ``Ws[0] @ Ws[1] @ ...``.
    b_eff : ndarray
        Accumulated bias such that ``X @ W_eff + b_eff`` equals applying
        the layers one by one.

    Raises
    ------
    ValueError
        If ``Ws`` and ``bs`` do not contain the same number of entries.
    IndexError
        If the stack is empty (there is no first layer to start from).
    """
    Ws, bs = list(Ws), list(bs)
    # zip() would silently ignore the unmatched layers, so reject a mismatch explicitly.
    if len(Ws) != len(bs):
        raise ValueError(
            f"Ws and bs must have the same length, got {len(Ws)} and {len(bs)}"
        )
    # Copy so that a single-layer stack does not hand back the caller's own arrays.
    W_eff = Ws[0].copy()
    b_eff = bs[0].copy()
    for W, b in zip(Ws[1:], bs[1:]):
        # After previous layers we have: H = X @ W_eff + b_eff
        # Next layer outputs: H @ W + b = (X @ W_eff + b_eff) @ W + b
        #                   = X @ (W_eff @ W) + (b_eff @ W + b)
        b_eff = b_eff @ W + b
        W_eff = W_eff @ W
    return W_eff, b_eff

In [20]:
# Fold all layers into one (W_eff, b_eff) pair, then apply that single affine map to X.
W_eff, b_eff = collapse_affine(Ws, bs)
Y_eff = np.matmul(X, W_eff) + b_eff

In [21]:
# Largest element-wise gap between the layer-by-layer output and the collapsed output
max_abs_diff = np.abs(Y_stack - Y_eff).max()

In [22]:
# Collect the shape summary line by line, then emit it in a single print.
report = ["Shapes:", f"  X:      {X.shape}"]
report.extend(
    f"  Layer {i}  W: {W.shape},  b: {b.shape}"
    for i, (W, b) in enumerate(zip(Ws, bs), start=1)
)
report.append(f"  Collapsed W_eff: {W_eff.shape}, b_eff: {b_eff.shape}\n")
print("\n".join(report))

Shapes:
  X:      (8, 5)
  Layer 1  W: (5, 7),  b: (7,)
  Layer 2  W: (7, 6),  b: (6,)
  Layer 3  W: (6, 4),  b: (4,)
  Layer 4  W: (4, 3),  b: (3,)
  Collapsed W_eff: (5, 3), b_eff: (3,)



In [23]:
# Show the leading rows of both outputs under their headings, then the largest discrepancy.
for heading, rows in (
    ("First 3 rows of Y from stacked linear layers:\n", Y_stack[:3]),
    ("\nFirst 3 rows of Y from single affine map:\n", Y_eff[:3]),
):
    print(heading, rows)
print(f"\nMax absolute difference between the two outputs: {max_abs_diff:.3e}")

First 3 rows of Y from stacked linear layers:
 [[-1.69690669  2.33362122 -2.61461271]
 [-1.26940596  1.42219875 -2.00695106]
 [-0.79445137  1.42375524 -1.06462945]]

First 3 rows of Y from single affine map:
 [[-1.69690669  2.33362122 -2.61461271]
 [-1.26940596  1.42219875 -2.00695106]
 [-0.79445137  1.42375524 -1.06462945]]

Max absolute difference between the two outputs: 6.661e-16


In [24]:
# Sanity check (should be ~ 1e-15 to 1e-12 due to floating point).
# rtol is set to 0 so that atol is the only tolerance in play: np.allclose's
# default rtol=1e-5 would let differences of about 1e-5 * |Y| slip through.
# np.testing.assert_allclose also reports the size of any mismatch and, unlike
# a bare `assert`, is not removed when Python runs with -O.
np.testing.assert_allclose(
    Y_stack,
    Y_eff,
    rtol=0.0,
    atol=1e-10,
    equal_nan=False,
    err_msg="Mismatch! The equivalence failed.",
)

In [25]:
# Final takeaway of the demo, printed after a blank line.
print("\nConclusion: Multiple linear layers without activations are exactly one affine map.")


Conclusion: Multiple linear layers without activations are exactly one affine map.
