In [1]:
import numpy as np
import torch
from torch.autograd import grad
from scipy.spatial.distance import pdist, squareform
from stein_mpc.kernels import GaussianKernel, ScaledGaussianKernel
from sklearn.metrics.pairwise import rbf_kernel

# from D. Wang and Q. Liu SVGD paper (https://github.com/dilinwang820/Stein-Variational-Gradient-Descent/blob/master/python/svgd.py)
def baseline(X, Y, h):
    """RBF kernel matrix and gradient term, following the SVGD reference code.

    Args:
        X: first set of points, indexed as a 2-D array with one point per row.
        Y: second set of points; only used to build the kernel matrix.
        h: kernel bandwidth. Must provide ``.numpy()`` (e.g. a torch tensor).

    Returns:
        ``(Kxy, dx_Kxy)`` where ``Kxy = rbf_kernel(X, Y, 0.5 / h**2)`` and
        ``dx_Kxy = (X * rowsum(Kxy) - Kxy @ X) / h**2``.

    Note:
        The gradient term is assembled from X only; Y enters solely through
        Kxy, and ``Kxy @ X`` needs Kxy to have as many columns as X has rows.
    """
    gamma = 0.5 / h.numpy() ** 2
    kernel = rbf_kernel(X, Y, gamma)

    # Row sums of the kernel matrix, one value per row of X.
    row_mass = np.sum(kernel, axis=1)
    weighted = -np.matmul(kernel, X)
    # Add X[:, col] * row_mass column by column, exactly as the reference does.
    for col in range(X.shape[1]):
        weighted[:, col] = weighted[:, col] + np.multiply(X[:, col], row_mass)
    return kernel, weighted / (h**2)

# from D. Wang and Q. Liu Matrix SVGD paper (https://github.com/dilinwang820/matrix_svgd/blob/master/2d_toy/code/kernel.py)
def scaled_baseline(X, Y, M, h):
    """Matrix-scaled Gaussian kernel and summed gradient, per the reference code.

    Args:
        X: tensor of shape (n, d).
        Y: tensor indexed as (m, d).
        M: matrix applied to the pairwise differences via ``diff @ M``.
        h: bandwidth.

    Returns:
        ``(K, grad)`` where ``K[i, j] = exp(-(x_i - y_j) M (x_i - y_j)^T / (2 h^2))``
        and ``grad`` is ``-(x_i - y_j) M * K[i, j] / h^2`` summed over the
        first (i) axis, giving one row per column of K.
    """
    # Kept from the reference; the unpack also rejects X that is not 2-D.
    n, d = X.shape
    pairwise = X[:, None, :] - Y[None, :, :]
    projected = pairwise @ M
    quad_form = (projected * pairwise).sum(dim=-1)
    K = torch.exp(-quad_form / (2.0 * h ** 2))
    per_pair_grad = -projected * K[:, :, None] / h ** 2
    # Aggregation over axis 0 is the reference behaviour and is left as is.
    return K, per_pair_grad.sum(0)

def naive(X, Y, h):
    """Pair-by-pair Gaussian kernel and its gradients, used as a ground truth.

    Evaluates ``k(x_i, y_j) = exp(-||x_i - y_j||^2 / (2 h^2))`` in an explicit
    double loop, so it is slow but easy to verify by inspection.

    Args:
        X: tensor of shape (n, d).
        Y: tensor of shape (m, d); m may differ from n.
        h: bandwidth (scalar tensor or float).

    Returns:
        Kxy: (n, m) kernel matrix, allocated with the dtype and device of X.
        dx_Kxy: (n, d); row i is the sum over j of d k(x_i, y_j) / d x_i.
        dy_Kxy: (m, d); row j is the sum over i of d k(x_i, y_j) / d y_j.
    """
    n_x, _ = X.shape
    n_y, _ = Y.shape
    # Size the outputs from both inputs so X and Y may have different batches.
    Kxy = X.new_empty(n_x, n_y)
    dx_Kxy = torch.zeros_like(X)
    dy_Kxy = torch.zeros_like(Y)
    for i in range(n_x):
        for j in range(n_y):
            diff = X[i] - Y[j]
            # 1-D dot product; avoids `.T`, which is deprecated on non-2-D tensors.
            sq_norm = diff @ diff
            Kxy[i, j] = (-0.5 / h ** 2 * sq_norm).exp()
            dx_Kxy[i] += -diff * Kxy[i, j] / h ** 2
            dy_Kxy[j] += diff * Kxy[i, j] / h ** 2
    return Kxy, dx_Kxy, dy_Kxy

# Kernels from stein_mpc under test, constructed with no arguments.
rbf = GaussianKernel()
scaled_rbf = ScaledGaussianKernel()

In [2]:
# Problem size and shared random inputs used by the timing and equality cells.
SEED = 0
BATCH, DIM = 20, 5
torch.set_default_dtype(torch.double)  # set before sampling so all tensors are float64
torch.manual_seed(SEED)  # fixed seed so Restart & Run All reproduces the same X and Y
X = torch.randn(BATCH, DIM)
Y = torch.randn(BATCH, DIM)
M = torch.eye(DIM)  # identity scaling matrix for the scaled kernels
h = torch.tensor(1.0)  # kernel bandwidth

In [3]:
# Time each implementation on the same inputs: the sklearn-based reference,
# the explicit double loop, and the two stein_mpc kernels.
# scaled_baseline is not timed in this cell.
%timeit baseline(X, Y, h)
%timeit naive(X, Y, h)
%timeit rbf(X, Y, h)
%timeit scaled_rbf(X, Y, M, h)

508 µs ± 710 ns per loop (mean ± std. dev. of 7 runs, 1,000 loops each)
44.9 ms ± 102 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
149 µs ± 153 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)
163 µs ± 181 ns per loop (mean ± std. dev. of 7 runs, 10,000 loops each)


In [4]:
# Evaluate every implementation on the same (X, Y) pair.
Kb, dKb = baseline(X, Y, h)
Ksb, dKsb = scaled_baseline(X, Y, M, h)
Kn, dKxn, dKyn = naive(X, Y, h)
Kr, dKr = rbf(X, Y, h)
Ks, dKs = scaled_rbf(X, Y, M, h)

# Autograd reference: back-propagate a matrix of ones through the kernel matrix,
# which gives the gradient of sum(K) with respect to Xg and Yg.
Xg = X.clone().requires_grad_()
Yg = Y.clone().requires_grad_()
Kag = scaled_baseline(Xg, Yg, M, h)[0]
# ones_like follows the shape of K instead of hard-coding the batch size.
dKag = grad(Kag, (Xg, Yg), torch.ones_like(Kag))  # autograd


def _matches_all(reference, *candidates):
    """Return True when every candidate is ``np.allclose`` to ``reference``."""
    return all(np.allclose(reference, candidate) for candidate in candidates)


print("All covariance matrices equal: ", _matches_all(Kb, Ksb, Kn, Kr, Ks))
print("All derivatives w.r.t. X equal to baseline: ", _matches_all(dKb, dKr, dKs, dKag[0]))
print("All derivatives w.r.t. X equal to scaled baseline: ", _matches_all(dKsb, dKr, dKs, dKag[0]))
print("All derivatives w.r.t. X equal to naive: ", _matches_all(dKxn, dKr, dKs, dKag[0]))

All covariance matrices equal:  True
All derivatives w.r.t. X equal to baseline:  False
All derivatives w.r.t. X equal to scaled baseline:  False
All derivatives w.r.t. X equal to naive:  True


### Conclusion

The kernel gradient in the reference code appears to aggregate over the wrong axis. When both inputs are the same (X = X'), this yields a gradient with the opposite sign to that of the autograd and naive implementations.