In [1]:
# Load the autoreload extension and reload edited modules before each cell runs.
%load_ext autoreload
%autoreload 2
# Turn off jedi for the IPython tab completer.
%config Completer.use_jedi = False

In [67]:
from datetime import datetime
from pathlib import Path
import pickle
import sys

import matplotlib.pyplot as plt
import numpy as np
from pymatgen.core.structure import Molecule
from scipy.interpolate import InterpolatedUnivariateSpline
from tqdm import tqdm

Custom plotting code. You can ignore this cell if you don't know what it is: when the `MPLAdjutant` class is not available, it falls back to a stand-in object that gracefully does nothing.

In [3]:
# Make an optional helper directory under the user's home importable.
sys.path.append(str(Path.home() / "local"))


class NullClass:
    """Stand-in used when `MPLAdjutant` cannot be imported.

    Any attribute that is not explicitly defined resolves to `do_nothing`,
    so arbitrary method calls on the instance are silently ignored.
    """

    def do_nothing(*_args, **_kwargs):
        """Accept any arguments (the instance included) and return None."""
        return None

    def add_colorbar(self, im, **kwargs):
        """Attach a plain matplotlib colorbar to `im`; extra keywords are ignored."""
        return plt.colorbar(im)

    def __getattr__(self, _):
        """Resolve every unknown attribute to the no-op method."""
        return self.do_nothing


try:
    # The whole setup lives inside the `try` so that an ImportError raised at
    # any of these steps falls back to the stand-in.
    from mpl_utils import MPLAdjutant
    adj = MPLAdjutant()
    adj.set_defaults()
except ImportError:
    adj = NullClass()

Append the project root (the parent of the current working directory) to `sys.path` so that the project's own modules can be imported.

In [4]:
# Expose the parent of the current working directory to the import system;
# the project-local imports further down rely on it.
sys.path.append(str(Path.cwd().parent))

# Introduction

This notebook contains all of the processing scripts necessary for constructing FEFF input files, parsing the results, creating the ACSF feature inputs, etc. for the small molecule MD databases.

## Process the extended xyz files to memory

In [49]:
from xas_nne.xyz import process_extended_xyz_file_to_array
from xas_nne.feff import FeffWriter, load_completed_FEFF_results  # noqa

In [59]:
# Extended-XYZ file to process; the path is relative to the working directory.
extended_xyz_path = "../data/molecule_md/Chmiela2017/benzene_old_dft.xyz"

In [60]:
# Parse the file into memory. The returned object is indexed further down by
# "elements" and "coordinates".
res = process_extended_xyz_file_to_array(extended_xyz_path)

Read 8791762 lines from ../data/molecule_md/Chmiela2017/benzene_old_dft.xyz, each block has 12 atoms
Got 627983 snapshots


## Write the FEFF input files for each absorbing site

In [61]:
from pymatgen.core.sites import Site
from pymatgen.core.structure import Molecule

In [64]:
# Candidate absorbing elements; the writer loop keeps only those that occur
# in the loaded structure.
ABSORBERS = [
    "C",
    "N",
    "O",
]

# Spectrum flavour passed to the FEFF writer: True -> XANES, False -> EXAFS.
XANES = True

In [69]:
# Label used in the output directory names. It is derived from the XANES flag
# so that this cell no longer depends on a variable left over in the kernel
# (SPECTRUM_TYPE was never defined anywhere in the notebook).
SPECTRUM_TYPE = "XANES" if XANES else "EXAFS"

# Only write inputs for absorbers that actually occur in the structure.
current_absorbers = [aa for aa in ABSORBERS if aa in res["elements"]]

for ii, snapshot in enumerate(tqdm(res["coordinates"])):
    # One pymatgen Molecule per snapshot; the same element list is paired
    # with every snapshot's coordinates.
    molecule = Molecule.from_sites(
        [Site(species, coords) for species, coords in zip(res["elements"], snapshot)]
    )
    f = FeffWriter(molecule, xanes=XANES, name=f"{ii:08}")
    for absorber in current_absorbers:
        # Output layout: <absorber>-<spectrum type>/<zero-padded snapshot index>
        dname = Path(f"{absorber}-{SPECTRUM_TYPE}") / f"{ii:08}"
        f.write_feff_inputs(str(dname), absorber=absorber)

  0%|▏                                   | 3132/627983 [00:06<21:46, 478.42it/s]


KeyboardInterrupt: 

## Load the FEFF results

In this step, we also compile everything in one spot and `pickle` it so that reloading from disk is fast.

# Construct the ACSF feature vectors

We use the Atom-centered Symmetry Functions (ACSF) feature vectors as inputs for the ML models. See [here](https://singroup.github.io/dscribe/latest/tutorials/descriptors/acsf.html) for the `Dscribe` library implementation docs. This is the original paper:

> Jörg Behler. Atom-centered symmetry functions for constructing high-dimensional neural network potentials. J. Chem. Phys., 134(7):074106, 2011.

We use roughly the same parameters as in the original ænet paper:

> ...

In [None]:
from dscribe.descriptors import ACSF

Read in the results from the outputs of the previous scripts. This is a relatively large `pickle` file (~20 GB) so proceed with caution.

In [None]:
# Location of the pickled results that the remaining cells consume.
path = Path("../data/qm9/XANES-220622-C-N-O.pkl")
print(path.exists())

# NOTE: unpickling can execute arbitrary code; only load files you produced
# yourself. The context manager closes the file handle as soon as loading is
# done instead of leaving it open for the lifetime of the kernel.
with path.open("rb") as pickle_file:
    data = pickle.load(pickle_file)

Set up common grids onto which all of the spectra are interpolated.

In [None]:
# Number of points on every interpolation grid.
N = 200

# One evenly spaced grid per absorbing element, built from its (start, stop)
# bounds (presumably energies in eV — TODO confirm).
grids = {
    element: np.linspace(lower, upper, N)
    for element, (lower, upper) in {
        "O": (528, 582),
        "N": (395, 449),
        "C": (275, 329),
    }.items()
}

## Construct for each absorbing atom type

In [None]:
# Absorbing element processed by the cells below; it also selects the
# interpolation grid and the name of the output file.
CENTRAL_ATOM = "C"   # C, N or O

In [None]:
# Interpolation grid that belongs to the selected absorber.
grid = grids[CENTRAL_ATOM]

# Elements the descriptor has to account for.
species = ["H", "C", "O", "N", "F"]

# Radial cutoff handed to the descriptor (presumably in Angstrom — TODO confirm).
# NOTE(review): newer dscribe releases may expect `r_cut` instead of `rcut`;
# verify against the installed version.
rcut = 6.0

# G2 parameter rows (two values each).
g2_params = [
    [1.0, 0],
    [0.1, 0],
    [0.01, 0],
]

# G4 parameter rows (three values each; presumably [eta, zeta, lambda] as in
# the dscribe ACSF docs — verify).
# NOTE(review): the last group uses 3.0 where the other two use 4.0. Kept
# as-is; confirm whether this is intentional.
g4_params = [
    *[[0.001, second, -1.0] for second in (1.0, 2.0, 4.0)],
    *[[0.01, second, -1.0] for second in (1.0, 2.0, 4.0)],
    *[[0.1, second, -1.0] for second in (1.0, 2.0, 3.0)],
]

acsf = ACSF(
    species=species,
    rcut=rcut,
    g2_params=g2_params,
    g4_params=g4_params,
)

Generate the ACSF vectors.

In [None]:
# NOTE(review): `Atom` and `Atoms` are never imported in this notebook. They
# are presumably `ase.Atom` / `ase.Atoms` — confirm and import them in the
# top import cell.

# Screening rules for unphysical/outlier spectra, keyed by absorber: a spectrum
# is dropped when any of its first `n_points` interpolated values exceeds
# `limit`. Absorbers without an entry (e.g. "N") are not screened.
SCREENING_RULES = {"O": (10, 5.0), "C": (35, 10.0)}
screening_rule = SCREENING_RULES.get(CENTRAL_ATOM)

origin_smiles = []        # SMILES string of the parent molecule, one per kept site
molecule_site_pairs = []  # "<qm9 id>_<site index>" label, one per kept site
acsf_array = []           # ACSF feature vector, one per kept site
spectra = []              # spectrum interpolated onto `grid`, one per kept site

for qm9id, datum in tqdm(data.items()):
    molecule = Molecule.from_dict(datum["molecule"])
    atoms = Atoms([Atom(site.specie.symbol, site.coords) for site in molecule])
    central_atom_indexes = [
        ii for ii, site in enumerate(molecule)
        if site.specie.symbol == CENTRAL_ATOM
    ]

    # Molecules without the selected absorber contribute nothing.
    if not central_atom_indexes:
        continue

    # One descriptor row per absorbing site, in the order of central_atom_indexes.
    tmp_acsf = acsf.create(atoms, positions=central_atom_indexes)

    for idx, ii in enumerate(central_atom_indexes):
        key = f"{ii}_{CENTRAL_ATOM}"
        s = np.array(datum["xanes"][key]["spectrum"])

        try:
            # Columns 0 and 3 are used as abscissa and ordinate (presumably
            # energy and absorption — verify against the FEFF parser).
            spline = InterpolatedUnivariateSpline(s[:, 0], s[:, 3])
        except IndexError:
            # Deliberate best-effort skip: the stored spectrum cannot be
            # indexed as a 2D table with at least four columns.
            continue

        # Named `interpolated` (not `res`) so the trajectory data loaded at
        # the top of the notebook is not overwritten.
        interpolated = spline(grid)

        if screening_rule is not None:
            n_points, limit = screening_rule
            if np.any(interpolated[:n_points] > limit):
                continue

        origin_smiles.append(datum["smiles"])
        spectra.append(interpolated)
        acsf_array.append(tmp_acsf[idx])
        molecule_site_pairs.append(f"{qm9id}_{ii}")

acsf_array = np.array(acsf_array)
spectra = np.array(spectra)

## PCA for debugging

In [None]:
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

In [None]:
# Project the ACSF feature vectors onto their first two principal components.
pca_acsf = PCA(n_components=2)
w_acsf = pca_acsf.fit_transform(acsf_array)

In [None]:
# Project the interpolated spectra onto their first two principal components.
pca_spectra = PCA(n_components=2)
w_spectra = pca_spectra.fit_transform(spectra)

In [None]:
# Scatter of the ACSF vectors in their own PCA plane, coloured by the first
# principal component of the matching spectrum.
fig, ax = plt.subplots(1, 1, figsize=(3, 3))

ax.scatter(w_acsf[:, 0], w_acsf[:, 1], alpha=0.02, c=w_spectra[:, 0], s=1, cmap="rainbow")

# Label the figure so it can be read on its own when the notebook is skimmed.
ax.set_xlabel("ACSF PC 1")
ax.set_ylabel("ACSF PC 2")
ax.set_title("Color: spectrum PC 1")

plt.show()

In [None]:
# Overlay a subsample of the spectra, coloured by the bin that the first ACSF
# principal component of the same site falls into.
fig, ax = plt.subplots(1, 1, figsize=(3, 2))

# Local names instead of N/M so the grid size `N` defined earlier is not
# overwritten by this cell.
stride = 100  # plot only every `stride`-th spectrum
n_bins = 100  # number of bin edges / discrete colours

acsf_pc1 = w_acsf[::stride, 0]
bin_edges = np.linspace(acsf_pc1.min(), acsf_pc1.max(), n_bins)
# np.digitize returns 1-based indices (1..n_bins here); shift to 0-based so
# they address the n_bins colormap entries without skipping the first colour.
color_index = np.digitize(acsf_pc1, bins=bin_edges) - 1
# plt.get_cmap avoids the never-imported `cm` module and the `cm.get_cmap`
# helper that recent Matplotlib releases removed.
cmap = plt.get_cmap("rainbow", n_bins)

for spectrum, idx in zip(spectra[::stride], color_index):
    ax.plot(grid, spectrum, color=cmap(idx), alpha=0.1)

ax.set_xlabel("Energy")
ax.set_ylabel("Interpolated spectrum")

plt.show()

After confirming everything looks good, we save to disk.

In [None]:
# Date stamp (two-digit year, month, day) embedded in the output file name.
now = f"{datetime.now():%y%m%d}"
# Destination of the feature/target pickle for the selected absorber.
fname = f"../data/qm9/XANES-{now}-ACSF-{CENTRAL_ATOM}.pkl"
print(fname)

We take the convention that `"x"` is the input and `"y"` is the output. These are the only two required keys for the ML pipeline. The rest is considered metadata.

In [None]:
# Persist inputs ("x"), targets ("y") and metadata in one pickle. The context
# manager guarantees the file is flushed and closed once the dump finishes,
# even if pickling raises part-way through.
with open(fname, "wb") as pickle_file:
    pickle.dump(
        {"grid": grid, "y": spectra, "x": acsf_array, "names": molecule_site_pairs, "origin_smiles": origin_smiles},
        pickle_file,
        protocol=pickle.HIGHEST_PROTOCOL
    )