## Numpy2AD
### Generating adjoint code for Numpy matrix expressions

In [None]:
import ast
import numpy as np
from numpy2ad import draw_AST, transform, transform_expr

### 1st Example: D = AB + C (single-expression mode).

In [None]:
# Dump the raw abstract syntax tree that the standard `ast` module builds for
# the assignment; `indent=3` pretty-prints the nested nodes.
print(ast.dump(ast.parse("D = A @ B + C"), indent=3))

In [None]:
# Dot graph for AST: same statement as in the dump above, drawn with
# numpy2ad's `draw_AST` helper instead of printed as text.
draw_AST("D = A @ B + C")

In [None]:
# Print the code that `transform_expr` generates for a single assignment
# (single-expression mode).
print(transform_expr("Y = A @ B + C"))

The user is responsible for importing numpy and correctly seeding adjoints.

In [None]:
# Primal inputs (constant-valued 3x3 matrices).
dims = (3, 3)
A = np.ones(dims)
B = np.full(dims, 2.0)
C = np.full(dims, 3.0)

# One zero-initialised adjoint accumulator per input.
A_a = np.zeros(dims)  # dYdA
B_a = np.zeros(dims)  # dYdB
C_a = np.zeros(dims)  # dYdC

# adjoint seed: select the single output entry Y[0, 0]
Y_a = np.zeros(dims)
Y_a[0, 0] = 1

# adjoint code -- primal sweep in single-assignment form ...
v0 = A @ B
Y = v0 + C
# ... followed by the adjoint sweep in reverse statement order.
v0_a = np.zeros_like(v0)
C_a += Y_a
v0_a += Y_a
B_a += A.T @ v0_a
A_a += v0_a @ B.T

In [None]:
# Show the primal result Y next to the accumulated adjoints of A, B and C.
print(f"Y =\n {Y}\ndY/dA = \n {A_a}\ndY/dB =\n {B_a}\ndY/dC =\n {C_a}")

In [None]:
# AST of transformed expression (interesting for higher order derivatives):
# the string returned by `transform_expr` is passed straight to `draw_AST`.
draw_AST(transform_expr("D = A @ B + C"))

### Transforming functions:

In [None]:
# First test function: matrix product of A and B plus C.
# NOTE: this function object is handed to `transform` in the next cell, so
# documentation is kept in `#` comments rather than a docstring.
def simple_expr(A, B, C):
    return A @ B + C

In [None]:
# Transform the function code. The result is printed as text here and is
# compiled from that same value further below.
simple_expr_transformed = transform(simple_expr)
print(simple_expr_transformed)

### Explanation:
- The function name is extended with "_ad" and adjoints (context) of function arguments ("A_a", "B_a", ...) and output (Y_a) are appended.
- The original function is completely transformed to single-assignment code.
- Overwriting or incrementally assigning variables is not supported for now, hence there is no "tbr_stack" etc.
- Only functions that return a single (ndarray) variable are considered for now.
- Adjoints of all intermediate v_i are zero-initialized.
- All arithmetic is overloaded to element-wise operations. Only the @ operator maps to matrix products.
- The primal result is returned together with all adjoints in a tuple.

### Test:

In [None]:
# Imported under its full name as well; presumably required by the generated
# code, which the user must supply numpy for -- confirm against the printout.
import numpy

# Compile and execute the generated source so that `simple_expr_ad` is defined.
exec(compile(simple_expr_transformed, filename="<ast>", mode="exec"))

# Active arguments: constant-valued 3x3 matrices.
dims = (3, 3)
A = np.full(dims, 1.5)
B = np.full(dims, 2.0)
C = np.full(dims, 3.0)

# Zero-initialised adjoints for the three inputs and for the output.
A_a, B_a, C_a, Y_a = (np.zeros(dims) for _ in range(4))
# Seed the adjoint direction: output entry [0, 0] only.
Y_a[0, 0] = 1

result, dfdA, dfdB, dfdC = simple_expr_ad(A, B, C, A_a, B_a, C_a, Y_a)
print("primal result:\n {r}\n dy/dA:\n {a}\n dy/dB:\n {b}\n dy/dC:\n {c}".format(r=result, a=dfdA, b=dfdB, c=dfdC))

Each entry in the output matrix directly depends on the entries in C (element-wise addition) and on the corresponding row of A and column of B (dot product).

## 2nd Example: Element-wise Sigmoid Function 
### $\sigma(X) = \frac{1}{1 + \exp(-X)}$

It's a nice example to show but not the intended use case of this package as all derivatives are scalar...

In [None]:
# Element-wise sigmoid 1 / (1 + exp(-A)), written with explicit numpy calls.
# It refers to the module as `numpy` (not `np`), so `import numpy` must have
# been executed before this cell.
# NOTE: this function object is handed to `transform` below, so documentation
# is kept in `#` comments rather than a docstring.
def sigmoid(A: numpy.ndarray) -> numpy.ndarray:
    # 1 + exp(-A), with the constant spelled as an array of ones shaped like A
    denominator = numpy.ones(A.shape) + numpy.exp(-A)
    return numpy.divide(numpy.ones(A.shape), denominator)

In [None]:
# Here `draw_AST` receives the function object itself rather than a string.
draw_AST(sigmoid)

'transform' already supports derivatives of some numpy functions and more can be added easily. 

In [None]:
# Generate the code for `sigmoid` and print it as text.
sigmoid_transformed = transform(sigmoid)
print(sigmoid_transformed)

## Test:

In [None]:
# Evaluate at X = 0 with a zero-initialised input adjoint.
dims = (3, 3)
X = np.zeros(dims)
X_a = np.zeros_like(X)
Y_a = np.ones_like(X)  # we can seed all adjoints at once

# Compile and execute the generated source so that `sigmoid_ad` is defined.
exec(compile(sigmoid_transformed, filename="<ast>", mode="exec"))

s_X, s_dX = sigmoid_ad(X, X_a, Y_a)
# Note that d/dx sigmoid(x) = (1-sigmoid(x))*sigmoid(x),
# i.e. 0.5 and 0.25 are the expected values at x = 0.
print("primal result:\n {s}\n df/dX:\n {ds}".format(s=s_X, ds=s_dX))

In [None]:
# Plot sigmoid and its derivative
import matplotlib.pyplot as plt

num = 50
x_vec = np.linspace(-10, 10, num=num)
x_a_vec = np.zeros(num)  # zero-initialised adjoint of the inputs
y_a = np.ones(num)  # seed every output entry at once

# One call yields both the function values and the derivative at each x.
x_plot, x_a_plot = sigmoid_ad(x_vec, x_a_vec, y_a)

# Raw strings: "\s", "\p" and "\," are not valid Python escape sequences and
# trigger a SyntaxWarning on recent Python versions when written in a plain
# string literal. The resulting label text is unchanged.
plt.plot(x_vec, x_plot, label=r"$\sigma(x)$")
plt.plot(x_vec, x_a_plot, "--", label=r"$\partial_x \,\sigma(x)$")
plt.legend();

## 3rd Example: Generalized Least Squares
Goal: estimate regression coefficients $b$ from observations $y$ with design matrix $X$ and error covariance matrix $M$.

The derivatives tell us how, e.g., the coefficients depend on the (uncertain) observations or on the model parameters.

Linear Model: $y_i = \beta_0 + \beta_1 * x_i$ 

Error variance: $\sigma_i^2 = 0.01 + \alpha * x_i$ (heteroscedastic for $\alpha \neq 0$)

In [None]:
N = 100  # number of observations
K = 2  # number of regression coefficients (intercept and slope)


def build_system(alpha):
    """Draw noisy samples of the line y = x on an equidistant grid over [0, 1].

    Each of the ``N`` grid points gets independent zero-mean Gaussian noise
    with variance ``0.01 + alpha * x``, i.e. the covariance matrix is diagonal.

    Args:
        alpha: slope of the noise variance in ``x``; ``0`` gives the same
            variance at every point.

    Returns:
        Tuple ``(x, samples, cov_mat)``: the grid of shape ``(N,)``, the noisy
        observations of shape ``(N, 1)`` and the ``(N, N)`` covariance matrix.
    """
    grid = np.linspace(0, 1, N)
    variances = 0.01 + alpha * grid
    # Diagonal covariance: no cross-correlations between observations.
    cov_mat = np.diag(variances)
    noise = np.random.multivariate_normal(mean=np.zeros(N), cov=cov_mat)
    samples = (grid + noise).reshape((N, 1))
    return grid, samples, cov_mat

# Constant-variance case (alpha = 0): draw one set of samples and plot it
# together with the noise-free line y = x.
x, samples, cov_mat = build_system(alpha=0.0)
plt.plot(x, samples, ".")
plt.plot(x, x, "r--", label="y=x")
plt.xlabel("$x$"); plt.ylabel("$y(x)$")
plt.legend();

In [None]:
# Closed form solution of generalized least squares, kept as a source string
# in the form accepted by `transform_expr`; the call itself is left disabled.
gls = "b = np.linalg.inv(X.T @ np.linalg.inv(M) @ X) @ X.T @ np.linalg.inv(M) @ y"
# print(transform_expr(gls))

In [None]:
def GLS_ad(X, M, y, X_a, M_a, y_a, b_a):
    """Evaluate the GLS estimator and accumulate its adjoints (reverse mode).

    Computes ``b = inv(X.T @ inv(M) @ X) @ X.T @ inv(M) @ y`` in
    single-assignment form and then propagates the output adjoint ``b_a``
    back to the three inputs.

    Args:
        X: design matrix; ``(N, K)`` in this notebook.
        M: error covariance matrix; ``(N, N)`` in this notebook.
        y: observations; ``(N, 1)`` in this notebook.
        X_a: adjoint accumulator of ``X`` (same shape). Updated IN PLACE.
        M_a: adjoint accumulator of ``M`` (same shape). Updated IN PLACE.
        y_a: adjoint accumulator of ``y`` (same shape). Updated IN PLACE.
        b_a: adjoint seed of the result ``b``; ``(K, 1)`` in this notebook.

    Returns:
        Tuple ``(b, X_a, M_a, y_a)``. The three adjoints are the very objects
        that were passed in, incremented by this call's contribution.

    Raises:
        numpy.linalg.LinAlgError: if ``M`` or ``X.T @ inv(M) @ X`` is singular.
    """
    # ---- primal sweep -----------------------------------------------------
    v0 = X.T
    v1 = np.linalg.inv(M)
    v2 = v0 @ v1
    v3 = v2 @ X
    v4 = np.linalg.inv(v3)
    v5 = X.T
    v6 = v4 @ v5
    v7 = np.linalg.inv(M)
    v8 = v6 @ v7
    b = v8 @ y

    # ---- adjoint initialisation --------------------------------------------
    # v0 and v5 are transposes of X, so their adjoints are transposed *views*
    # of X_a: the in-place `+=` on them below writes straight into X_a.
    v0_a = X_a.T
    v5_a = X_a.T
    # All remaining intermediates start from zero.
    v1_a = np.zeros(v1.shape)
    v2_a = np.zeros(v2.shape)
    v3_a = np.zeros(v3.shape)
    v4_a = np.zeros(v4.shape)
    v6_a = np.zeros(v6.shape)
    v7_a = np.zeros(v7.shape)
    v8_a = np.zeros(v8.shape)

    # ---- adjoint sweep (reverse order of the primal statements) ------------
    # For V = inv(A) the adjoint is A_a -= V.T @ (V_a @ V.T); the inverses
    # are already available as v1, v4 and v7, so none is recomputed here.
    y_a += v8.T @ b_a
    v8_a += b_a @ y.T
    v7_a += v6.T @ v8_a
    v6_a += v8_a @ v7.T
    M_a -= v7.T @ (v7_a @ v7.T)
    v5_a += v4.T @ v6_a
    v4_a += v6_a @ v5.T
    v3_a -= v4.T @ (v4_a @ v4.T)
    X_a += v2.T @ v3_a
    v2_a += v3_a @ X.T
    v1_a += v0.T @ v2_a
    v0_a += v2_a @ v1.T
    M_a -= v1.T @ (v1_a @ v1.T)
    return b, X_a, M_a, y_a

# Inputs of the regression problem
y = samples
M = cov_mat
X = np.ones((N, K))  # column 0 stays at one (intercept)
X[:, 1] = x  # column 1 holds the regressor
# X[:, 2] = x**2

# Zero-initialised adjoints of the three inputs
X_a = np.zeros_like(X)
M_a = np.zeros_like(M)
y_a = np.zeros_like(y)

# Seed of the result's adjoint
b_a = np.zeros((K, 1))
b_a[1] = 1 # differentiate w.r.t slope parameter of regression

b, X_a, M_a, y_a = GLS_ad(X, M, y, X_a, M_a, y_a, b_a)
print(b) # should be [0, 1]

In [None]:
# Adjoint of the observations for the seeded coefficient (b_a[1] = 1), plotted
# per observation index; the title reports the norm of that gradient.
plt.figure(figsize=(5,5))
plt.plot(y_a, "."); # "How does b change w.r.t. samples y"
plt.xlabel("$i$"); plt.ylabel(r"$\nabla_y \, b_1(i)$")
plt.title("Gradient magnitude: " + f"{np.linalg.norm(y_a): .3f}");
# plt.savefig("b_a_const.png", dpi=300)

In [None]:
# Plot the samples together with the fitted regression line.
plt.figure(figsize=(5,5))
plt.plot(x, samples)
# b has shape (K, 1): index both axes to obtain true scalars. Calling float()
# on the one-element rows b[0] / b[1] is deprecated in recent NumPy releases.
intercept, slope = b[0, 0], b[1, 0]
plt.plot(x, intercept + slope * x, "r", label=f"$y={intercept: .3f} + {slope: .3f}x$")
plt.xlabel("$x$")
plt.ylabel("$y$")
plt.legend();
# plt.savefig("const_fit.png", dpi=300)

### Make the variance non-const

In [None]:
# Rebuild the system with alpha = 0.1, so the noise variance now grows with x,
# and plot the new samples.
x, samples, cov_mat = build_system(alpha=0.1)
plt.plot(x, samples, ".")
plt.xlabel("$x$"); plt.ylabel("$y(x)$");

In [None]:
# Inputs of the regression problem (now with the non-constant variance)
y = samples
M = cov_mat
X = np.ones((N, K))  # column 0 stays at one (intercept)
X[:, 1] = x  # column 1 holds the regressor
# X[:, 2] = x**2

# Fresh zero-initialised adjoints, since GLS_ad accumulates into them
X_a = np.zeros_like(X)
M_a = np.zeros_like(M)
y_a = np.zeros_like(y)

# Seed of the result's adjoint
b_a = np.zeros((K, 1))
b_a[1] = 1 # differentiate w.r.t slope parameter of regression

b, X_a, M_a, y_a = GLS_ad(X, M, y, X_a, M_a, y_a, b_a)
print(b) # should be [0, 1]

In [None]:
# Same gradient plot as for the constant-variance run, now for alpha = 0.1
# (drawn as a line here); the title again reports the norm of the gradient.
plt.figure(figsize=(5,5))
plt.plot(y_a); # "How does b change w.r.t. samples y"
plt.xlabel("$i$"); plt.ylabel(r"$\nabla_y \, b_1(i)$")
plt.title("Gradient magnitude: " + f"{np.linalg.norm(y_a): .3f}");
# plt.savefig("b_a_var.png", dpi=300)

In [None]:
# Plot the samples together with the fitted regression line.
plt.figure(figsize=(5,5))
plt.plot(x, samples)
# b has shape (K, 1): index both axes to obtain true scalars. Calling float()
# on the one-element rows b[0] / b[1] is deprecated in recent NumPy releases.
intercept, slope = b[0, 0], b[1, 0]
plt.plot(x, intercept + slope * x, "r", label=f"$y={intercept: .3f} + {slope: .3f}x$")
plt.xlabel("$x$")
plt.ylabel("$y$")
plt.legend();
# plt.savefig("var_fit.png", dpi=300)

In [None]:
# db_1 / dX: adjoint of the design matrix. Only column 1 (the column that
# holds x) is plotted; the plot of column 0 is left disabled.
# plt.plot(X_a[:, 0])
plt.plot(X_a[:, 1]);

In [None]:
# db_1 / dM_ii: adjoint with respect to the diagonal entries of M, i.e. the
# per-observation variances.
plt.plot(M_a.diagonal());