In [1]:
# https://www.math.uwaterloo.ca/~hwolkowi/matrixcookbook.pdf
# !pip install --upgrade --user ase
# !pip install ipywidgets 
# !pip install ipympl
# !conda install nglview -c conda-forge -y
# !pip install --upgrade --user asap3
# !jupyter-nbextension enable nglview --py --sys-prefix
%load_ext autoreload
%autoreload 2
from utils import *
%matplotlib widget

# Gaussian Process 

## Univariate normal distribution


$$
\mathcal{N}(\mu, \sigma^2)
$$

$$
p(x \mid \mu, \sigma) = \frac{1}{\sqrt{2\pi\sigma^2}} \exp{ \left( -\frac{(x - \mu)^2}{2\sigma^2}\right)}
$$

In [2]:
def univariate_normal(x, mean, variance):
    """Evaluate the density of the normal distribution N(mean, variance) at x.

    Args:
        x: Point(s) at which the density is evaluated.
        mean: Mean of the distribution.
        variance: Variance (sigma squared) of the distribution.

    Returns:
        The value(s) of the probability density at x.
    """
    normalization = 1. / np.sqrt(2 * np.pi * variance)
    exponent = -(x - mean)**2 / (2 * variance)
    return normalization * np.exp(exponent)

In [3]:
# Plot the univariate normal pdf defined above; univariate_plot is provided by utils.
# NOTE(review): presumably it wires parameter sliders to the pdf — confirm in utils.
univariate_plot(univariate_normal)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

interactive(children=(FloatSlider(value=0.0, description='mu', max=3.0, min=-3.0), FloatSlider(value=1.0, desc…

## Multivariate normal distribution

$$
p(\mathbf{x} \mid \mathbf{\mu}, \Sigma) = \frac{1}{\sqrt{(2\pi)^d \lvert\Sigma\rvert}} \exp{ \left( -\frac{1}{2}(\mathbf{x} - \mathbf{\mu})^T \Sigma^{-1} (\mathbf{x} - \mathbf{\mu}) \right)}
$$

$$
\mathcal{N}(\mathbf{\mu}, \Sigma)
$$

In [4]:
def multivariate_normal(x, d, mean, covariance):
    """Evaluate the density of the d-dimensional normal N(mean, covariance) at x.

    Args:
        x: Point at which the density is evaluated.
        d: Dimension used in the (2*pi)**d normalization term.
        mean: Mean of the distribution (same shape as x).
        covariance: Covariance matrix of the distribution.

    Returns:
        The density value; its shape follows from the matrix product of the
        solved system with the centered point.
    """
    centered = x - mean
    norm_const = np.sqrt((2 * np.pi)**d * np.linalg.det(covariance))
    # Solve covariance * z = centered instead of forming the inverse explicitly.
    mahalanobis_sq = np.linalg.solve(covariance, centered).T.dot(centered)
    return 1. / norm_const * np.exp(-mahalanobis_sq / 2)

## 2D Multivariate normal distribution
$$
\mathcal{N}\left(
\begin{bmatrix}
0 \\
0
\end{bmatrix}, 
\begin{bmatrix}
1 & 0 \\
0 & 1 
\end{bmatrix}\right)
$$

$$
\mathcal{N}\left(
\begin{bmatrix}
0 \\
1
\end{bmatrix}, 
\begin{bmatrix}
1 & 0.8 \\
0.8 & 1
\end{bmatrix}\right)
$$

In [5]:
# Close every figure left open by earlier cells before drawing a new one.
plt.close('all')

# Plot the multivariate normal pdf defined above; multivariate_plot is provided by utils.
# NOTE(review): nb_of_x presumably sets the number of grid points per axis — confirm in utils.
multivariate_plot(multivariate_normal, nb_of_x=40)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

interactive(children=(FloatSlider(value=0.0, continuous_update=False, description='C', max=0.99, step=0.01), O…

## Marginal and Conditional normal distributions

If both $\mathbf{x}$ and $\mathbf{y}$ are [jointly normal](https://en.wikipedia.org/wiki/Multivariate_normal_distribution#Joint_normality) random vectors defined as:
$$
\begin{bmatrix}
\mathbf{x} \\
\mathbf{y} 
\end{bmatrix}
\sim
\mathcal{N}\left(
\begin{bmatrix}
\mu_{\mathbf{x}} \\
\mu_{\mathbf{y}}
\end{bmatrix},
\begin{bmatrix}
A & C \\
C^T & B
\end{bmatrix}
\right)
= \mathcal{N}(\mu, \Sigma)
, \qquad 
\Sigma^{-1} = \Lambda = 
\begin{bmatrix}
\tilde{A} & \tilde{C} \\
\tilde{C}^T & \tilde{B}
\end{bmatrix}
$$

The [conditional distribution](https://en.wikipedia.org/wiki/Conditional_probability_distribution) of $\mathbf{x}$ given $\mathbf{y}$ is defined as:

$$
p(\mathbf{x} \mid \mathbf{y}) = \mathcal{N}(\mu_{x|y}, \Sigma_{x|y})
$$

With:
$$\begin{split}
\Sigma_{x|y} & = A - CB^{-1}C^\top = \tilde{A}^{-1} \\
\mu_{x|y} & = \mu_x + CB^{-1}(\mathbf{y}-\mu_y)
\end{split}$$



### Proof

$
\begin{split}
p(x_1, x_2) = exp
\left[
-\frac{1}{2}
\begin{pmatrix}
x_1 - \mu_1 \\ 
x_2 - \mu_2
\end{pmatrix}^T
\begin{pmatrix}
\Sigma_{11} & \Sigma_{12} \\ 
\Sigma_{21} & \Sigma_{22}
\end{pmatrix}^{-1}
\begin{pmatrix}
x_1 - \mu_1 \\ 
x_2 - \mu_2
\end{pmatrix}
\right]
\end{split}
$

By using the following identity:

$
\begin{split}
M^{-1}=
\begin{pmatrix}
A & B \\ 
C & D
\end{pmatrix}^{-1}=
\begin{pmatrix}
I & 0 \\ 
-D^{-1}C & I
\end{pmatrix}
\begin{pmatrix}
(M/D)^{-1} & 0 \\ 
0 & D^{-1}
\end{pmatrix}
\begin{pmatrix}
I & -BD^{-1} \\ 
0 & I
\end{pmatrix}
\end{split}
$

We obtain the following:

$
\begin{pmatrix}
x_1 - \mu_1 \\ 
x_2 - \mu_2
\end{pmatrix}^T
\begin{pmatrix}
\Sigma_{11} & \Sigma_{12} \\ 
\Sigma_{21} & \Sigma_{22}
\end{pmatrix}^{-1}
\begin{pmatrix}
x_1 - \mu_1 \\ 
x_2 - \mu_2
\end{pmatrix}=
\begin{pmatrix}
x_1 - \mu_1 \\ 
x_2 - \mu_2
\end{pmatrix}^T
\begin{pmatrix}
I & 0 \\ 
-\Sigma_{22}^{-1} \Sigma_{21} & I
\end{pmatrix}
\begin{pmatrix}
(\Sigma/\Sigma_{22})^{-1} & 0 \\ 
0 & \Sigma_{22}^{-1}
\end{pmatrix}
\begin{pmatrix}
I & -\Sigma_{12}\Sigma_{22}^{-1} \\ 
0 & I
\end{pmatrix}
\begin{pmatrix}
x_1 - \mu_1 \\ 
x_2 - \mu_2
\end{pmatrix}
$

And finally:

$
(x_1 - \mu_1 -\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2))^T(\Sigma/\Sigma_{22})^{-1}(x_1 - \mu_1 -\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2)) + (x_2-\mu_2)^T\Sigma_{22}^{-1}(x_2-\mu_2)
$

Plugging this back into the first equation (normalization constants are omitted throughout):

$
\begin{split}
p(x_1, x_2) = \exp
\left[
-\frac{1}{2}
\left(
(x_1 - \mu_1 -\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2))^T(\Sigma/\Sigma_{22})^{-1}(x_1 - \mu_1 -\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2)) + (x_2-\mu_2)^T\Sigma_{22}^{-1}(x_2-\mu_2)
\right)
\right]
\end{split}
$

$
\begin{split}
p(x_1, x_2) = \exp
\left[
-\frac{1}{2}
(x_1 - \mu_1 -\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2))^T(\Sigma/\Sigma_{22})^{-1}(x_1 - \mu_1 -\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2))
\right] \cdot
\exp
\left[
-\frac{1}{2}
(x_2-\mu_2)^T\Sigma_{22}^{-1}(x_2-\mu_2)
\right]
\end{split}
$

From the product rule of probability, $p(x_1, x_2) = p(x_1|x_2)\,p(x_2)$, we obtain the following equality:

$$
\begin{aligned}
p(x_1, x_2) = p(x_1|x_2)p(x_2) &= \exp
\left[
-\frac{1}{2}
(x_1 - \mu_1 -\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2))^T(\Sigma/\Sigma_{22})^{-1}(x_1 - \mu_1 -\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2))
\right] \cdot
\exp
\left[
-\frac{1}{2}
(x_2-\mu_2)^T\Sigma_{22}^{-1}(x_2-\mu_2)
\right] \\
&=
\exp
\left[
-\frac{1}{2}
(x_1 - \mu_1 -\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2))^T(\Sigma/\Sigma_{22})^{-1}(x_1 - \mu_1 -\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2))
\right] \cdot
p(x_2) \\
\Rightarrow p(x_1|x_2) &= \exp
\left[
-\frac{1}{2}
(x_1 - \mu_1 -\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2))^T(\Sigma/\Sigma_{22})^{-1}(x_1 - \mu_1 -\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2))
\right] \\
\mu_{1|2} &= \mu_1 +\Sigma_{12}\Sigma_{22}^{-1}(x_2-\mu_2) \\
\Sigma_{1|2} &= (\Sigma/\Sigma_{22})=\Sigma_{11}-\Sigma_{12}\Sigma_{22}^{-1}\Sigma_{21}
\end{aligned}
$$

In [6]:
# Close every figure left open by earlier cells before drawing a new one.
plt.close('all')
# condition_plot is provided by utils.
# NOTE(review): presumably it visualizes conditioning a 2D Gaussian on one variable,
# with nb_of_x grid points per axis — confirm in utils.
condition_plot(nb_of_x=40)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

interactive(children=(FloatSlider(value=0.0, continuous_update=False, description='x', max=1.0, min=-1.0), Flo…

## Gaussian Process
A Gaussian process (GP) is a method for predicting $y^*$ at a new input $x^*$, given observations $y_i=f(x_i)$. 
A GP assumes that $p(f(x_1),\dots,f(x_N))$ is jointly Gaussian, i.e., the function values at any set of points follow a multidimensional Gaussian with mean $\mu(x)$ and covariance $\Sigma(x)$, where $\Sigma(x)$ is calculated as $\Sigma_{ij}=k(x_i,x_j)$, and $k$ is the kernel function.<br/>
The kernel can be regarded as a "distance"-like similarity function which defines how strongly the value $f(x_i)$ is coupled to the value $f(x_j)$. 
When predicting at a new point $x^*$ we use the prior data points, calculate the additional kernel values, and finally obtain the following joint multidimensional Gaussian distribution:

$$
\begin{pmatrix}
f \\ 
f^*
\end{pmatrix} \sim
\mathcal{N}
\begin{pmatrix}
\begin{pmatrix}
\mu \\ 
\mu^*
\end{pmatrix},
\begin{pmatrix}
k & k^*\\ 
{k^{*}}^{T} & k^{**}
\end{pmatrix}
\end{pmatrix}
$$
<br/>
$f$ - Vector of all observed values, $f_i=y_i=f(x_i)$.<br/>
$f^*$ - Predicted function value at the point $x^*$, $f^*=y^*=f(x^*)$.<br/>
$\mu$ - Vector of the mean values at all observed points.<br/>
$\mu^*$ - Mean value at the prediction point $x^*$. <br/>
$k$ - Covariance matrix of all observed points, $k_{ij}=k(x_i,x_j)$<br/>
$k^*$ - Kernel vector between the new point and the observed points, $k^*_i=k(x^*,x_i )$<br/>
$k^{**}$ - Kernel value of the new point with itself, $k^{**}=k(x^*,x^* )$ (a scalar for a single $x^*$)<br/>


### Applying the Conditional 
$$f(x^*)=\mu^* +{{k}^{*}}^{T}  k^{-1}(y-\mu)$$ <br/>
For simplicity, we can define the mean values to be zero<br/>
$$f(x^*) ={k^*}^{T}k^{-1}y=\sum_{i=1}^{N}\alpha_ik(x_i, x^*)$$

### The Most Popular Kernel
Here, we will use the squared exponential kernel, also known as Gaussian kernel or RBF kernel
$$
\kappa(\mathbf{x}_i,\mathbf{x}_j) = \sigma_f^2 \exp\left(-\frac{1}{2l^2}
  (\mathbf{x}_i - \mathbf{x}_j)^T
  (\mathbf{x}_i - \mathbf{x}_j)\right)\tag{10}
$$

In [7]:
# https://juanitorduz.github.io/gaussian_process_reg/
# http://krasserm.github.io/2018/03/19/gaussian-processes/
def kernel(X1, X2, l=1.0, sigma_f=1.0):
    '''
    Isotropic squared exponential (RBF) kernel between two sets of points.

    Args:
        X1: Array of m points (m x d).
        X2: Array of n points (n x d).
        l: Length-scale dividing the squared distances.
        sigma_f: Amplitude; the kernel is scaled by sigma_f ** 2.

    Returns:
        Covariance matrix (m x n).
    '''
    # ||a - b||^2 = ||a||^2 + ||b||^2 - 2 a.b, evaluated for all pairs at once.
    row_norms = np.sum(X1 ** 2, 1).reshape(-1, 1)
    col_norms = np.sum(X2 ** 2, 1)
    cross_terms = np.dot(X1, X2.T)
    pairwise_sq = row_norms + col_norms - 2 * cross_terms
    return sigma_f ** 2 * np.exp(-0.5 / l ** 2 * pairwise_sq)

### The Conditional on a Noisy Dataset
If we have a training dataset with noisy function values $\mathbf{y} = \mathbf{f} + \boldsymbol\epsilon$ where noise $\boldsymbol\epsilon \sim \mathcal{N}(\mathbf{0}, \sigma_y^2 \mathbf{I})$ is independently added to each observation then the predictive distribution is given by

$$
\begin{align*}
p(\mathbf{f}_* \lvert \mathbf{X}_*,\mathbf{X},\mathbf{y}) &= \mathcal{N}(\mathbf{f}_* \lvert \boldsymbol{\mu}_*, \boldsymbol{\Sigma}_*) \\
\boldsymbol{\mu_*} &= \mathbf{K}_*^T \mathbf{K}_y^{-1} \mathbf{y} \\
\boldsymbol{\Sigma_*} &= \mathbf{K}_{**} - \mathbf{K}_*^T \mathbf{K}_y^{-1} \mathbf{K}_*
\end{align*}
$$
where $\mathbf{K}_y = \mathbf{K} + \sigma_y^2\mathbf{I}$.

In [8]:
def posterior_predictive(X_s, X_train, Y_train, l=1.0, sigma_f=1.0, sigma_y=1e-8):
    '''
    Computes the sufficient statistics (mean and covariance) of the GP
    posterior predictive distribution from m training points and n new inputs.

    Args:
        X_s: New input locations (n x d).
        X_train: Training locations (m x d).
        Y_train: Training targets (m x 1).
        l: Kernel length parameter.
        sigma_f: Kernel vertical variation parameter.
        sigma_y: Noise parameter; sigma_y ** 2 is added to the diagonal of
            the training covariance.

    Returns:
        Posterior mean (one row per new input) and covariance matrix (n x n).
    '''
    n_train, n_new = len(X_train), len(X_s)

    # K_y = K + sigma_y^2 I, K_* and K_** (the latter with a small fixed jitter).
    train_cov = kernel(X_train, X_train, l, sigma_f) + sigma_y ** 2 * np.eye(n_train)
    cross_cov = kernel(X_train, X_s, l, sigma_f)
    new_cov = kernel(X_s, X_s, l, sigma_f) + 1e-8 * np.eye(n_new)

    # K_*^T K_y^{-1} appears in both the mean and the covariance.
    projection = cross_cov.T.dot(inv(train_cov))

    posterior_mean = projection.dot(Y_train)
    posterior_cov = new_cov - projection.dot(cross_cov)

    return posterior_mean, posterior_cov

In [9]:
plt.close()


def f(x):
    """Target function of the demo: the cosine of x."""
    return np.cos(x)


# Training inputs and noise level handed to the plotting helper from utils.
x = [-3, -2, -1, 1, 2, 3, 4]
noise = 0.1

gaussian_process(x, f, noise, posterior_predictive, kernel)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

In [10]:
plt.close('all')


def f(x):
    """Target function of the demo: the sine of x."""
    return np.sin(x)


# Noise level and scikit-learn kernel (constant * RBF) handed to the helper from utils.
noise = 0.1
kernal = ConstantKernel(1.0) * RBF(length_scale=1.0)

gaussian_process_interactive(f, noise, kernal)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## GP Force Field 
a straightforward formulation of a vector-valued estimator
takes the form:
$$
\mathbf{\hat{f}}=\begin{bmatrix}\hat{f}_1(x), \dots, \hat{f}_N\end{bmatrix}^{T}
$$
$\mathbf{\hat{f}}: \mathbb{R}^N\rightarrow\mathbb{R}^N$ where each component: ${\hat{f}_i}: \mathbb{R}^N\rightarrow\mathbb{R}$

In [105]:
# plt.close('all')
"""Demonstrates molecular dynamics with constant temperature."""
import os
from ase.lattice.cubic import FaceCenteredCubic
from ase.md.langevin import Langevin
from ase.io.trajectory import Trajectory
from ase import io
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution
from ase import units
import time
use_asap = False
from ase.calculators.emt import EMT
size = 2  # number of unit-cell repetitions along each lattice direction

T = 300  # Kelvin
atom_type = 'Cu'

# Set up a crystal
atoms = FaceCenteredCubic(directions=[[1, 0, 0], [0, 1, 0], [0, 0, 1]],
                          symbol=atom_type,
                          size=(size, size, size),
                          pbc=True)

# Describe the interatomic interactions with the Effective Medium Theory
atoms.calc = EMT()
# Draw the initial momenta from a Maxwell-Boltzmann distribution at 600 K
# (note: twice the thermostat target T below).
# NOTE(review): newer ASE releases prefer the keyword temperature_K=... here
# and in Langevin — confirm against the installed ASE version.
MaxwellBoltzmannDistribution(atoms, 600 * units.kB)

# Run constant-temperature MD with the Langevin thermostat:
# time step 5 fs, target temperature T, friction coefficient 0.002.
dyn = Langevin(atoms, 5 * units.fs, T * units.kB, 0.002)

t0 = time.time()
def printenergy(a=atoms):  # store a reference to atoms in the definition.
    """Print the per-atom potential, kinetic and total energy and the elapsed time."""
    epot = a.get_potential_energy() / len(a)
    ekin = a.get_kinetic_energy() / len(a)
    print('Energy per atom: Epot = %.3feV  Ekin = %.3feV (T=%3.0fK)  '
          'Etot = %.3feV, time elapsed:%3.0f' % (epot, ekin, ekin / (1.5 * units.kB), epot + ekin, time.time()-t0))

# Report the energies every 100 steps.
dyn.attach(printenergy, interval=100)

# Save the configuration after every time step (interval=1).
traj = Trajectory('moldyn.traj', 'w', atoms)
dyn.attach(traj.write, interval=1)
print(atoms)

# Now run the dynamics
os.makedirs('snapshots', exist_ok=True)
printenergy()
steps = 1999
try:
    dyn.run(steps)
finally:
    # Close the writer so later cells that re-open 'moldyn.traj' see every frame.
    traj.close()

Lattice(symbols='Cu32', pbc=True, cell=[7.22, 7.22, 7.22], momenta=..., calculator=EMT(...))
Energy per atom: Epot = -0.006eV  Ekin = 0.094eV (T=729K)  Etot = 0.089eV, time elapsed:  0
Energy per atom: Epot = -0.006eV  Ekin = 0.094eV (T=729K)  Etot = 0.089eV, time elapsed:  0
Energy per atom: Epot = 0.044eV  Ekin = 0.045eV (T=347K)  Etot = 0.089eV, time elapsed:  6
Energy per atom: Epot = 0.041eV  Ekin = 0.046eV (T=355K)  Etot = 0.087eV, time elapsed: 12
Energy per atom: Epot = 0.041eV  Ekin = 0.047eV (T=367K)  Etot = 0.089eV, time elapsed: 18
Energy per atom: Epot = 0.048eV  Ekin = 0.034eV (T=262K)  Etot = 0.082eV, time elapsed: 24
Energy per atom: Epot = 0.027eV  Ekin = 0.052eV (T=406K)  Etot = 0.079eV, time elapsed: 30
Energy per atom: Epot = 0.042eV  Ekin = 0.037eV (T=287K)  Etot = 0.079eV, time elapsed: 36
Energy per atom: Epot = 0.037eV  Ekin = 0.040eV (T=308K)  Etot = 0.077eV, time elapsed: 42
Energy per atom: Epot = 0.041eV  Ekin = 0.037eV (T=287K)  Etot = 0.079eV, time elapsed

True

In [106]:
from ase.io.trajectory import Trajectory
from ase.visualize import ngl
from ase import Atom, Atoms
# Re-open the trajectory file written by the MD cell in read mode and pass it to
# the NGL viewer (called with w=500, h=500); the viewer is the cell's displayed output.
traj = Trajectory('moldyn.traj', 'r', atoms)
ngl.view_ngl(traj, w=500, h=500)

HBox(children=(NGLWidget(max_frame=1999), VBox(children=(Dropdown(description='Show', options=('All', 'Cu'), v…

Instead of mapping to scalar outputs, we can alternatively model the covariance
function as a matrix-valued function $k : \chi \times \chi \rightarrow \mathbb{R}^{N\times N}$ that expresses the interaction among
multiple output components. Together with a vector-valued mean function $\mu : \chi \rightarrow \mathbb{R}^N$,
we can then sample realizations of vector-valued functions from the GP
$$
\mathbf{\hat{f}} ∼ \mathcal{GP}\begin{bmatrix}\mathbf{\mu(x)}, \mathbf{k(x,x^{'})}\end{bmatrix}
$$

### The naive approach:
$$
\mathbf{\hat{f}}=\begin{bmatrix}\hat{f}_1(x), \dots, \hat{f}_N\end{bmatrix}^{T}
$$
$$
\hat{f}_i(x) = \mathcal{GP}\begin{bmatrix}\mathbf{\mu(x)_i}, \mathbf{k(x,x^{'})_i}\end{bmatrix}
$$
$$
\mathbf{\hat{f}}=\begin{bmatrix}\hat{f}_1(x) \\ \dots \\ \hat{f}_N(x)\end{bmatrix}
=\begin{bmatrix}\mathcal{GP}\begin{bmatrix}\mathbf{\mu(x)_1}, \mathbf{k(x,x^{'})_1}\end{bmatrix} \\ 
\dots
\\ \mathcal{GP}\begin{bmatrix}\mathbf{\mu(x)_N}, \mathbf{k(x,x^{'})_N}\end{bmatrix}\end{bmatrix}
$$

In [13]:
traj = Trajectory('moldyn.traj', 'r', atoms)
eps = 0.1
start, every = 1000, 10
x, y = [], []
x_test, y_test = [], []

# Keep every `every`-th frame after `start`; frames up to the midpoint between
# `start` and the end of the trajectory go to the training set, the rest to the test set.
split = (len(traj) + start) / 2
for i, config in enumerate(traj):
    if i <= start or i % every != 0:
        continue
    # Flatten positions and forces to one row per frame.
    positions = config.get_positions().reshape([1, -1])
    forces = config.get_forces().reshape([1, -1])
    if i <= split:
        x.append(positions)
        y.append(forces)
    else:
        x_test.append(positions)
        y_test.append(forces)

x = np.concatenate(x, axis=0)
y = np.concatenate(y, axis=0)
x_test = np.concatenate(x_test, axis=0)
y_test = np.concatenate(y_test, axis=0)


In [14]:
from sklearn.gaussian_process.kernels import ConstantKernel, RBF
# Fit a Gaussian-process regressor with a constant * RBF kernel to the
# flattened positions (x) and forces (y); alpha is set to the squared noise level.
noise = 0.2
kernal= ConstantKernel(1.0) * RBF(length_scale=1.0)
gpr = GaussianProcessRegressor(kernel=kernal, alpha=noise**2)
gpr.fit(x, y)

GaussianProcessRegressor(alpha=0.04000000000000001,
                         kernel=1**2 * RBF(length_scale=1))

In [15]:
def plot_gpr(gpr, x, y, x_test, y_test):
    """Compare predicted and reference force norms on the train and test sets.

    Closes all open figures, then draws two stacked panels sharing the x axis:
    the row-wise norms of predictions and references (top) and the difference
    between predicted and reference norms (bottom). Test samples are plotted
    after the training samples on the x axis.

    Args:
        gpr: Fitted model exposing ``predict``.
        x: Training inputs.
        y: Training targets (one row per sample).
        x_test: Test inputs.
        y_test: Test targets (one row per sample).
    """
    pred_train_norm = np.linalg.norm(gpr.predict(x), axis=1)
    true_train_norm = np.linalg.norm(y, axis=1)
    pred_test_norm = np.linalg.norm(gpr.predict(x_test), axis=1)
    true_test_norm = np.linalg.norm(y_test, axis=1)

    # Sample indices: training samples first, test samples appended after them.
    train_idx = np.arange(len(pred_train_norm))
    test_idx = np.arange(len(pred_test_norm)) + len(pred_train_norm)

    plt.close('all')
    fig, (ax_norm, ax_diff) = plt.subplots(ncols=1, nrows=2, sharex=True)

    ax_norm.plot(train_idx, pred_train_norm, label='predicted on train')
    ax_norm.plot(train_idx, true_train_norm, label='training')
    ax_norm.plot(test_idx, pred_test_norm, label='predicted')
    ax_norm.plot(test_idx, true_test_norm, label='ground truth')
    ax_norm.set_xticks([])
    ax_norm.set_title('The force norm')
    ax_norm.legend()

    ax_diff.plot(train_idx, pred_train_norm - true_train_norm, label='diffrence on train')
    ax_diff.plot(test_idx, pred_test_norm - true_test_norm, label='diffrence on test')
    ax_diff.legend()
    plt.show()


#### some tricks
##### Change the kernel:
##### The Matérn kernel
For our application, we considered a subclass from the parametric
Matérn family (22–24) of (isotropic) kernel functions
$$
k: C_{\nu=n+\frac{1}{2}}(d)=\exp\left(-\frac{\sqrt{2\nu}d}{\sigma}\right)P_{n}(d)
$$
$$
P_n(d)=\sum_{k=0}^{n}{\frac{(n+k)!}{(2n)!}}\begin{pmatrix}n\\k\end{pmatrix}\begin{pmatrix}\frac{2\sqrt{2\nu}d}{\sigma}\end{pmatrix}^{n-k}
$$

In [16]:
from sklearn.gaussian_process.kernels import ConstantKernel, RBF, Matern
# Same regression as before, but with a Matern kernel (nu=2.5) instead of RBF;
# alpha is again the squared noise level. The fit is then plotted with plot_gpr.
noise = 0.2
kernal= Matern(length_scale=1.0, nu=2.5)
gpr = GaussianProcessRegressor(kernel=kernal, alpha=noise**2)
gpr.fit(x, y)
plot_gpr(gpr, x, y, x_test, y_test)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

#### normalize y
$$
\hat{y}_{train} = \frac{y_{train}-E[y_{train}]}{\sqrt{Var[y_{train}]}}
$$

In [17]:
# Matern-kernel regression again, now with normalize_y=True so the regressor
# normalizes the training targets before fitting.
noise = 0.2
kernal= Matern(length_scale=1.0, nu=2.5)
gpr = GaussianProcessRegressor(kernel=kernal, alpha=noise**2, normalize_y=True)
gpr.fit(x, y)
plot_gpr(gpr, x, y, x_test, y_test)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

### Roto-translational invariance
Covariance functions remain valid under any transformation of their domain: if $k(x, x')$ is a kernel function and $\phi$ is an arbitrary map of the inputs, then
$k(\phi(x), \phi(x'))$ is again a kernel function. A rather trivial implication is that all
invariances of that input transformation are inherited, providing yet another opportunity
to characterize the properties of the predictor [92].

The so-called Coulomb matrix representation [7] goes one step further and represents
each pair of nuclei in terms of their Coulomb interaction instead of a simple distance. The
Coulomb energy is the only nuclei-nuclei interaction term in the Hamiltonian and empirically a good starting point for inference about molecular properties [9]. We use a slight
variation of this descriptor for our purpose, whereby atoms of different type interact on a
normalized scale (in this notebook, the plain pairwise distances are used),
$$
D_{ij}=\begin{cases}\lVert{r_i - r_j}\rVert & i > j \\ 0 & i \leq j \end{cases}
$$

In [18]:
from scipy.spatial.distance import pdist
def x_to_d(x):
    """Convert flattened Cartesian coordinates into pairwise-distance descriptors.

    Each row of ``x`` is reshaped to (-1, 3) and replaced by the condensed
    vector of pairwise distances returned by ``pdist``.

    Args:
        x: Iterable of flattened coordinate rows (each of length 3 * n_points).

    Returns:
        Array with one row of pairwise distances per input row.
    """
    return np.stack([pdist(row.reshape([-1, 3])) for row in x], axis=0)

# Replace the raw coordinates by pairwise-distance descriptors and repeat the
# Matern-kernel regression on them (here with noise = 0.5).
d, d_test = x_to_d(x), x_to_d(x_test)
noise = 0.5
kernal= Matern(length_scale=1.0, nu=2.5)
gpr = GaussianProcessRegressor(kernel=kernal, alpha=noise**2, normalize_y=True)
gpr.fit(d, y)
plot_gpr(gpr, d, y, d_test, y_test)

Canvas(toolbar=Toolbar(toolitems=[('Home', 'Reset original view', 'home', 'home'), ('Back', 'Back to previous …

## Encoding physical insight into the model
In this setting, the corresponding RKHS is vector-valued and it has been shown
that the representer theorem continues to hold. Each component of the kernel
function $k_{ij}$ specifies a covariance between a pair of outputs $f_i(x)$ and $f_j(x)$, which
makes it straightforward to impose linear constraints $g(x) = \hat{G}[\mathbf{f(x)}]$ on the GP
prior
$$
\begin{gathered}
\mathbf{\hat{g}(x)} \sim \mathcal{GP}\begin{bmatrix}\mathbf{\hat{G}\mu(x)}, \hat{G}\mathbf{k(x,x^{'})}{\hat{G}'}^{T}\end{bmatrix}
\\
\text{for linear operators } A, B: \\
Cov[Ax,By]=A\,Cov[x,y]\,B^{T}\\
E[Ax] = A\,E[x]
\end{gathered}
$$

Here, we aim to construct a GP that inherits the correct structure of a conservative
force field to ensure integrability, so that the corresponding energy potential can
be recovered from the same model. We start by considering, that the force field
estimator $\mathbf{\hat{f}_{F}(x)}$ and the PES estimator $\hat{f}_{E}(x)$ are related via some operator $\hat{G}$ . To
impose energy conservation, we require that the curl vanishes  for every
input to the transformed energy model:
$$
\nabla\times \hat{G}\begin{bmatrix}\hat{f}_E\end{bmatrix} = \mathbf{0}
$$
As expected, this is satisfied by the derivative operator $\hat{G} = \nabla$ or, in the case of
energies and forces, the negative gradient operator
$$
\mathbf{\hat{f}_{F}(x)}=\hat{G}\begin{bmatrix}\hat{f}_E\end{bmatrix}=-\nabla\hat{f}_E
$$

Since differentiation is a linear operator, the result is another GP with realizations $\mathbf{f_F}: \chi^{3N}\rightarrow \mathbb{R}^{3N}$
$$
\mathbf{\hat{f}_F(x)} ∼ \mathcal{GP}\begin{bmatrix}\mathbf{-\nabla\mu(x)}, \nabla_x\mathbf{k(x,x^{'})}{ \nabla_{x'}}^{T}\end{bmatrix}
$$

$$
\nabla\mathbf{k}{ \nabla}^{T}=Hess_x(k)
$$
$$
\begin{bmatrix}Hess_x(k)\end{bmatrix}_{ij}=\frac{\partial^2 k}{\partial x_i \partial x_j}
$$

This Hessian kernel gives rise to the following gradient domain machine
learning [88, 89] force model as the posterior mean of the corresponding GP:<br>
The trained force field estimator collects the contributions of the
3N partial derivatives of all M training points to compile the prediction.
It takes the form
$$
\mathbf{\hat{f}_F(x)}=\sum_i^M{\sum_j^{3N}{(\mathbf{\alpha_i})_j\frac{\partial}{\partial x_j}\nabla k(\mathbf{x}, \mathbf{x_i})}}
$$
    
Because the trained model is a (fixed) linear combination of kernel functions,
integration only affects the kernel function itself. The corresponding expression for
the energy predictor
$$
\mathbf{\hat{f}_E(x)}=\sum_i^M{\sum_j^{3N}{(\mathbf{\alpha_i})_j\frac{\partial}{\partial x_j} k(\mathbf{x}, \mathbf{x_i})}}
$$

In [107]:
from sgdml.cli import _print_dataset_properties, _print_model_properties
from data_utils import from_traj
# Build a dataset from the MD trajectory with the project helper from_traj
# (called with overwrite=True) and print a summary of its properties.
dataset = from_traj('moldyn.traj', overwrite=True)
# test = np.load('ethanol_dft.npz')
_print_dataset_properties(dataset)

[1;37;40m[INFO][0m Overwriting existing dataset file.
Writing dataset to 'moldyn.npz'...
Number geometries found: 2,000

[1;32;40m[DONE][0m
[1;37mDataset properties[0m
  Name:              moldyn (32 atoms)
  Theory:            unknown
  Size:              2,000 data points
  Lattice:           a    b    c   
                     7.22 0    0   
                     0    7.22 0   
                     0    0    7.22
    Lengths:         a = 7.22, b = 7.22, c = 7.22
    Angles [deg]:    alpha = 90, beta = 90, gamma = 90
  Energies [eV]:
    Range:           -0.182 |--   2.45   --| 2.27     
    Mean:            1.086
    Variance:        0.045
  Forces [eV/Ang]:
    Range:           -2.22 |--   4.57   --| 2.35     
    Mean:            -0.000
    Variance:        0.218
  Fingerprint:       b'9ca0c94ba034022927e2111ca37f1021'

[1;37mExample geometry[0m (no. 911, chosen randomly)
  Copy&paste the string below into Jmol (www.jmol.org), Avogadro (www.avogadro.cc), etc. to
  visualize

In [None]:
import numpy as np
from sgdml.train import GDMLTrain

n_train = 10

# Let construction errors surface here: swallowing them would only lead to a
# confusing NameError on `gdml_train` in the next statement.
gdml_train = GDMLTrain()

# Training task: n_train training points, 1000 validation points drawn from the
# same dataset, kernel length scale sig=10 and regularization lam=1e-15.
task = gdml_train.create_task(dataset, n_train,
                              valid_dataset=dataset, n_valid=1000,
                              sig=10, lam=1e-15)

# A failed training run raises its original exception instead of exiting the
# interpreter (sys.exit would tear down the notebook kernel's cell run).
model = gdml_train.train(task)
np.savez_compressed('my_model.npz', **model)

# NOTE: allow_pickle=True executes pickled content on load; only safe here
# because the file was written by this very cell.
model = np.load('my_model.npz', allow_pickle=True)

_print_model_properties(model)
del gdml_train

[1;32;40m[100%][0m Bi-partite matching
[1;32;40m[DONE][0m Multi-partite matching (permutation synchronization)
[1;32;40m[DONE][0m Symmetry group completion [90m                                                 found 1 symmetries[0m


In [27]:
import numpy as np
from sgdml.predict import GDMLPredict
from sgdml.utils import io
import ase

# Load the trained model, write the current `atoms` object to an XYZ file,
# read it back with sGDML's reader and predict energy and forces for it.
model = np.load('my_model.npz')
ase.io.write('my_lattice.xyz', atoms)
gdml = GDMLPredict(model)

# `io` here is sgdml.utils.io (imported in this cell), not ase.io.
# NOTE(review): the original comment said "4 atoms", but the lattice built
# earlier in this notebook prints as Cu32 — confirm which geometry is meant.
r,_ = io.read_xyz('my_lattice.xyz') # 4 atoms
# NOTE: this rebinds the global name `f` (previously the demo target function).
e,f = gdml.predict(r)

print(r.shape) 
print(e.shape) 
print(f.shape) 

(1, 12)
(1,)
(1, 12)


In [None]:
import numpy as np
from sgdml.predict import GDMLPredict
from sgdml.utils import io
import ase
def plot_gdml(model, traj, start=1000, every=10):
    """Compare sGDML force predictions with reference forces along a trajectory.

    Every ``every``-th frame after index ``start`` is used. Frames up to the
    midpoint between ``start`` and the end of the trajectory form the training
    portion, the remaining frames the test portion. Forces are predicted with
    ``GDMLPredict(model)`` and compared to the forces stored with each frame.

    Closes all open figures, then draws two stacked panels sharing the x axis:
    the force norms (top) and the difference between predicted and reference
    norms (bottom).

    Args:
        model: Trained sGDML model passed to ``GDMLPredict``.
        traj: Sequence of configurations providing ``get_positions`` and
            ``get_forces``; must support ``len``.
        start: Frames with index <= start are skipped.
        every: Only frames whose index is a multiple of ``every`` are used.

    Raises:
        ValueError: If the selection leaves the train or test portion empty.
    """
    gdml = GDMLPredict(model)
    x, y = [], []
    x_test, y_test = [], []
    split = (len(traj) + start) / 2
    for i, config in enumerate(traj):
        if i <= start or i % every != 0:
            continue
        # One flattened row of positions / forces per selected frame.
        positions = config.get_positions().reshape([1, -1])
        forces = config.get_forces().reshape([1, -1])
        if i <= split:
            x.append(positions)
            y.append(forces)
        else:
            x_test.append(positions)
            y_test.append(forces)

    if not x or not x_test:
        raise ValueError('no frames selected for the train or test portion; '
                         'check start/every against the trajectory length')

    x, y = np.concatenate(x, axis=0), np.concatenate(y, axis=0)
    x_test, y_test = np.concatenate(x_test, axis=0), np.concatenate(y_test, axis=0)

    # predict returns (energies, forces); only the forces are compared here.
    _, y_hat_train = gdml.predict(x)
    _, y_hat = gdml.predict(x_test)

    y_hat_train_norm = np.linalg.norm(y_hat_train, axis=1)
    y_train_norm = np.linalg.norm(y, axis=1)
    y_hat_norm = np.linalg.norm(y_hat, axis=1)
    y_test_norm = np.linalg.norm(y_test, axis=1)

    # Sample indices: training samples first, test samples appended after them.
    x1 = np.arange(len(y_hat_train_norm))
    x2 = np.arange(len(y_hat_norm)) + len(y_hat_train_norm)

    plt.close('all')
    fig, axis = plt.subplots(ncols=1, nrows=2, sharex=True)
    axis[0].plot(x1, y_hat_train_norm, label='predicted on train')
    axis[0].plot(x1, y_train_norm, label='training')
    axis[0].plot(x2, y_hat_norm, label='predicted')
    axis[0].plot(x2, y_test_norm, label='ground truth')
    axis[0].set_xticks([])
    axis[0].set_title('The force norm')
    axis[0].legend()

    axis[1].plot(x1, y_hat_train_norm - y_train_norm, label='diffrence on train')
    axis[1].plot(x2, y_hat_norm - y_test_norm, label='diffrence on test')
    axis[1].legend()
    plt.show()
    
# Re-open the MD trajectory for reading and plot the sGDML force predictions.
traj = Trajectory('moldyn.traj', 'r', atoms)
# NOTE: the call below passes the literals 1000 and 10 rather than these two variables.
start, every = 1000, 10
plot_gdml(model, traj, start= 1000, every=10)

In [None]:
from sgdml.intf.ase_calc import SGDMLCalculator

from ase.io import read
from ase.optimize import QuasiNewton
from ase.md.velocitydistribution import (MaxwellBoltzmannDistribution, Stationary, ZeroRotation)
from ase.md.verlet import VelocityVerlet
from ase import units

model_path = 'my_model.npz'
calc = SGDMLCalculator(model_path)

# `mol` is an alias of the global `atoms` object, so attaching the sGDML
# calculator replaces the calculator previously set on `atoms`.
mol = atoms
mol.calc = calc

# do a quick geometry relaxation
# qn = QuasiNewton(mol)
# qn.run(1e-4, 100)

# set the momenta corresponding to T=300K
MaxwellBoltzmannDistribution(mol, 300 * units.kB)
Stationary(mol) # zero linear momentum
ZeroRotation(mol) # zero angular momentum

# run MD with constant energy using the VelocityVerlet algorithm
dyn = VelocityVerlet(mol, 0.2 * units.fs, trajectory='md.traj')  # 0.2 fs time step.

t0 = time.time()
def printenergy(a=mol):  # store a reference to the simulated atoms in the definition.
    """Print the per-atom potential, kinetic and total energy and the elapsed time."""
    epot = a.get_potential_energy() / len(a)
    ekin = a.get_kinetic_energy() / len(a)
    print('Energy per atom: Epot = %.3feV  Ekin = %.3feV (T=%3.0fK)  '
          'Etot = %.3feV, time elapsed:%3.0f' % (epot, ekin, ekin / (1.5 * units.kB), epot + ekin, time.time()-t0))

# Report the energies every 100 steps.
dyn.attach(printenergy, interval=100)

# now run the dynamics
printenergy(mol)
dyn.run(1000)
# Drop the notebook-level name; `mol.calc` still references the calculator.
del calc


### For scalar output
$$
\begin{bmatrix}f^*\end{bmatrix}^{1\times1} = \begin{bmatrix}{k^*}^T\end{bmatrix}^{1\times M}\begin{bmatrix}\begin{bmatrix}K^{-1}\end{bmatrix}^{M \times M}\begin{bmatrix}y\end{bmatrix}^{M\times 1}\end{bmatrix}^{M\times1}=\sum_{i}^{M}\alpha_i k(x_i,x^*)
$$

### For vector output
$$
\begin{bmatrix}\mathbf{f^*}\end{bmatrix}^{3N\times1} ={k^*}^{T}K^{-1}\mathbf{f}=\sum_i^M{\sum_j^{3N}{(\mathbf{\alpha_i})_j\frac{\partial}{\partial x_j}\nabla k(\mathbf{x}, \mathbf{x_i})}}
$$

$$
K=\nabla_{x} k(x,x') \nabla_{x^{'}}^\top = 
\begin{pmatrix}
\begin{pmatrix}\nabla_{x^{(1)}} k(x^{(1)},x^{(1)}) \nabla_{x^{(1)}}^\top\end{pmatrix} & \dots & \begin{pmatrix}\nabla_{x^{(1)}} k(x^{(1)},x^{(M)}) \nabla_{x^{(M)}}^\top\end{pmatrix} \\
\dots & \dots & \dots \\
\begin{pmatrix}\nabla_{x^{(M)}} k(x^{(M)},x^{(1)}) \nabla_{x^{(1)}}^\top\end{pmatrix} & \dots & \begin{pmatrix}\nabla_{x^{(M)}} k(x^{(M)},x^{(M)}) \nabla_{x^{(M)}}^\top\end{pmatrix}
\end{pmatrix}
\\
\nabla k(x^{(i)}, x^{(j)})\nabla ^\top=\begin{bmatrix}\frac{\partial}{\partial x^{(j)}_1 }\nabla k(x^{(i)},x^{(j)}),\dots,\frac{\partial}{\partial x^{(j)}_{3N} }\nabla k(x^{(i)},x^{(j)})\end{bmatrix}^{3N\times 3N}
=\begin{bmatrix}\frac{\partial}{\partial x^{(j)}_1 }\frac{\partial}{\partial x^{(i)}_{1}} k(x^{(i)},x^{(j)}) &\dots & \frac{\partial}{\partial x^{(j)}_{3N} }\frac{\partial}{\partial x^{(i)}_{1}} k(x^{(i)},x^{(j)}) \\
\dots & \dots & \dots \\
\frac{\partial}{\partial x^{(j)}_1}\frac{\partial}{\partial x^{(i)}_{3N}} k(x^{(i)},x^{(j)}) &\dots & \frac{\partial}{\partial x^{(j)}_{3N} }\frac{\partial}{\partial x^{(i)}_{3N}} k(x^{(i)},x^{(j)})
\end{bmatrix}^{3N\times 3N}
\\
\begin{bmatrix}K\end{bmatrix}^{M\cdot3N\times M\cdot3N} \;
\begin{bmatrix}\mathbf{f}\end{bmatrix}^{M\cdot3N\times 1}
\\
$$
$$
{k^*}^{T}
=\nabla_x k(x^{*}, x)\nabla_{x^{'}} ^\top 
=
\begin{bmatrix}
\begin{pmatrix}\nabla_{x^*} k(x^{*}, x^{(1)})\nabla_{x^{(1)}} ^\top\end{pmatrix} & 
\dots & 
\begin{pmatrix}\nabla k(x^{*}, x^{(M)})\nabla_{x^{(M)}} ^\top\end{pmatrix}
\end{bmatrix}^{3N\times M\cdot 3N}
= 
\begin{bmatrix}
\begin{pmatrix}\frac{\partial}{\partial x^{(1)}_1 }\nabla k(x^{*},x^{(1)}),\dots,\frac{\partial}{\partial x^{(1)}_{3N} }\nabla k(x^{*},x^{(1)})\end{pmatrix} & 
\dots & 
\begin{pmatrix}\frac{\partial}{\partial x^{(M)}_1 }\nabla k(x^{*},x^{(M)}),\dots,\frac{\partial}{\partial x^{(M)}_{3N} }\nabla k(x^{*},x^{(M)})\end{pmatrix}
\end{bmatrix}^{3N\times M\cdot 3N}
\\
\\
\begin{bmatrix}\mathbf{f^*}\end{bmatrix}^{3N\times1} =\begin{bmatrix}\begin{bmatrix}{k^*}^{T}\end{bmatrix}^{3N \times M \cdot 3N}\begin{bmatrix}\begin{bmatrix}K^{-1}\end{bmatrix}^{M\cdot 3N \times M\cdot 3N}\begin{bmatrix}\mathbf{f}\end{bmatrix}^{M\cdot 3N \times 1}\end{bmatrix}^{M \cdot 3N \times 1}\end{bmatrix}^{3N\times 1}
\\
\begin{bmatrix}\mathbf{f^*}\end{bmatrix}^{3N\times1} =
{k^*}^{T}
K^{-1}
\mathbf{f}=
\begin{pmatrix}
\begin{pmatrix}\frac{\partial}{\partial x^{(1)}_1 }\nabla k(x^{*},x^{(1)}),\dots,\frac{\partial}{\partial x^{(1)}_{3N} }\nabla k(x^{*},x^{(1)})\end{pmatrix} & 
\dots & 
\begin{pmatrix}\frac{\partial}{\partial x^{(M)}_1 }\nabla k(x^{*},x^{(M)}),\dots,\frac{\partial}{\partial x^{(M)}_{3N} }\nabla k(x^{*},x^{(M)})\end{pmatrix}
\end{pmatrix}
\underbrace{ K^{-1}\mathbf{f}}_{\alpha _{ij}}
$$

### The Matérn kernel
For our application, we considered a subclass from the parametric
Matérn family (22–24) of (isotropic) kernel functions
$$
k: C_{\nu=n+\frac{1}{2}}(d)=exp{-\frac{\sqrt{2\nu}d}{\sigma}}P_{n}(d)
$$
$$
P_n(d)=\sum_{k=0}^{n}{\frac{(n+k)!}{(2n)!}}\begin{pmatrix}n\\k\end{pmatrix}\begin{pmatrix}\frac{2\sqrt{2\nu}d}{\sigma}\end{pmatrix}^{n-k}
$$

### The full kernel

$$
\mathbf{k_F(x, x^{'})}=\nabla k(x, x^{'})\nabla ^\top=\begin{bmatrix}\frac{\partial}{\partial x^{'}_1 }\nabla k(x,x^{'}),\dots,\frac{\partial}{\partial x^{'}_{3N} }\nabla k(x,x^{'})\end{bmatrix}
$$
$$
=\begin{pmatrix}5(\mathbf{x-x^{'}})(\mathbf{x-x^{'}})^\top-\mathbb{1}\sigma(\sigma +\sqrt{5}d)\end{pmatrix}\frac{5}{3\sigma^{4}}\exp\begin{pmatrix}-\frac{\sqrt{5}d}{\sigma}\end{pmatrix}
$$
$$
\mathbf{k_F(x, x^{'})}\in \mathbb{R}^{3N \times 3N}
$$

$$
\mathbf{k_E(x, x^{'})}=k(x, x^{'})\nabla ^\top
=\frac{5}{3\sigma^{3}}(\mathbf{x-x^{'}})^\top(\sigma+\sqrt{5}d)\exp\begin{pmatrix}-\frac{\sqrt{5}d}{\sigma}\end{pmatrix}
$$
$$
\mathbf{k_E(x, x^{'})}\in \mathbb{R}^{1 \times 3N}
$$


### Matérn covariance derivatives
$$
k: C_{\nu=n+\frac{1}{2}}(d)=B(d)P_{n}(d)
\\
B(d) = exp{\begin{pmatrix}-\frac{\sqrt{2\nu}d}{\sigma}\end{pmatrix}}
\\
P_n(d)=\sum_{k=0}^{n}{\frac{(n+k)!}{(2n)!}}\begin{pmatrix}n\\k\end{pmatrix}\begin{pmatrix}\frac{2\sqrt{2\nu}d}{\sigma}\end{pmatrix}^{n-k}
$$

$$
\frac{\partial \kappa}{\partial x_i} = \frac{\partial P_n}{\partial x_i}B + \frac{\partial B}{\partial x_i}P_n
$$

$$
\frac{\partial P_n}{\partial x_i}=\sum_{k=0}^n{\frac{(n+k)!}{(2n)!}\begin{pmatrix}n\\k\end{pmatrix}\frac{(n-k)(x_i -x^{'}_i)}{d^2}\begin{pmatrix}\frac{2\sqrt{2\nu}d}{\sigma}\end{pmatrix}^{n-k}}\\
$$

$$
\frac{\partial B}{\partial x_i} =-\frac{\sqrt{2\nu}(x_i -x^{'}_i)}{\sigma d}exp{-\frac{\sqrt{2\nu}d}{\sigma}}
$$

$$
\frac{\partial^2 \kappa}{\partial x_i \partial x_j} = 
B\frac{\partial^2 P_n}{\partial x_i \partial x_j} +
\frac{\partial B}{\partial x_i}\frac{\partial P_n}{\partial x_j} +
\frac{\partial B}{\partial x_j}\frac{\partial P_n}{\partial x_i} +
P_n\frac{\partial^2 B}{\partial x_i \partial x_j} 
$$

$$
\begin{bmatrix}\frac{\partial^2 P_n}{\partial x_i \partial x_j}\end{bmatrix}_{i\ne j}=
\sum_{k=0}^{n}{\frac{(n+k)!}{(2n)!}\begin{pmatrix}n \\ k \end{pmatrix}
\frac{(n-k-2)(n-k)(x_i -x^{'}_i)(x_j - x^{'}_j)}{d^4}
\begin{pmatrix}\frac{2\sqrt{2\nu}d}{\sigma}\end{pmatrix}^{n-k}}
$$

$$
\begin{bmatrix}\frac{\partial^2 P_n}{\partial x_i \partial x_j}\end{bmatrix}_{i=j}=
\sum_{k=0}^{n}{\frac{(n+k)!}{(2n)!}\begin{pmatrix}n \\ k \end{pmatrix}
\frac{(n-k-2)(n-k)(x_i -x^{'}_i)^2}{d^4}
\begin{pmatrix}\frac{2\sqrt{2\nu}d}{\sigma}\end{pmatrix}^{n-k}} + \sum_{k=0}^n{\frac{(n+k)!}{(2n)!}\begin{pmatrix}n\\k\end{pmatrix}\frac{(n-k)}{d^2}\begin{pmatrix}\frac{2\sqrt{2\nu}d}{\sigma}\end{pmatrix}^{n-k}}\\
$$

$$
\begin{bmatrix}\frac{\partial^2 B}{\partial x_i \partial x_j}\end{bmatrix}_{i\ne j} =
\frac{\sqrt{2\nu}(x_i-x^{'}_i)(x_j-x^{'}_j)(\sqrt{2\nu} d+\sigma)}{\sigma^2d^3}\exp{\begin{pmatrix}-\frac{\sqrt{2\nu}d}{\sigma}\end{pmatrix}}
$$

$$
\begin{bmatrix}\frac{\partial^2 B}{\partial x_i \partial x_j}\end{bmatrix}_{i = j} =
\frac{\sqrt{2\nu}(x_i-x^{'}_i)^2(\sqrt{2\nu} d+\sigma)}{\sigma^2d^3}\exp{\begin{pmatrix}-\frac{\sqrt{2\nu}d}{\sigma}\end{pmatrix}} - \frac{\sqrt{2\nu}}{\sigma d}\exp{\begin{pmatrix}-\frac{\sqrt{2\nu}d}{\sigma}\end{pmatrix}}
$$