Copyright Preferred Networks inc. as contributors to Matlantis contrib project.

# LightPFP: SiO2-P2O5-Al2O3-Na2O glass

This notebook makes a lightPFP model for SiO2-P2O5-Al2O3-Na2O glass system.
The composition of SiO2 varies between 20.69% and 70.69%
The composition of P2O5 varies between 0% and 50%

We follow the paper ["Ab initio molecular dynamics simulation of structural and elastic properties of SiO2–P2O5–Al2O3–Na2O glass"](https://ceramics.onlinelibrary.wiley.com/doi/full/10.1111/jace.18614)

The total time cost:
* LightPFP model generation (section 1 + 2 + 3 + 4): 21 hours
* PFP reference MD (section 5): 12~24 hours, depending on PFP load status
* LightPFP reference MD (section 6): 4 hours

In [None]:
# PFP model version and calculation mode. Both are reused further down when
# the PFP estimator and the active-learning configuration are created.
model_version = "v7.0.0"
calc_mode = "crystal_u0"

## Setup

In [None]:
! pip install light-pfp-client==1.0.0 light-pfp-data==1.0.0 light-pfp-evaluate==1.0.0 light-pfp-autogen==0.1.3

## 1. Initial structure

* Define the functions to generate the random structure of SiO2-P2O5-Al2O3-Na2O glass networks.
* Si, P, Al, Na, and O atoms are put into a simulation box with random positions.
* Compositions:
    * SiO2: 70.69-x %
    * P2O5: x %
    * Al2O3: 13.79 %
    * Na2O: 15.52 %
  

In [None]:
from typing import List
import numpy as np
from pathlib import Path
from pfcc_extras.liquidgenerator.liquid_generator import LiquidGenerator
from ase import Atoms
from ase import units
from ase.io import read
from ase.md.npt import NPT
from ase.md.langevin import Langevin

from IPython.display import clear_output


def get_glass(n_Si, n_P, n_Al, n_Na, density=2.0):
    """Generate a random Si/P/Al/Na/O configuration with ``LiquidGenerator``.

    The oxygen count is derived from the cation counts as 2 O per Si,
    3 O per two Al, 1 O per two Na and 5 O per two P (SiO2, Al2O3, Na2O and
    P2O5 units). Integer division is used, so an odd Al/Na/P count is
    rounded down before the oxygen contribution is computed.

    Args:
        n_Si: Number of Si atoms.
        n_P: Number of P atoms.
        n_Al: Number of Al atoms.
        n_Na: Number of Na atoms.
        density: Forwarded to ``LiquidGenerator`` as ``density``.

    Returns:
        The object returned by ``LiquidGenerator.run(epochs=100)``.
    """
    n_O = 2 * n_Si + 3 * (n_Al // 2) + n_Na // 2 + 5 * (n_P // 2)
    species_counts = (
        ("Si", n_Si),
        ("P", n_P),
        ("Al", n_Al),
        ("Na", n_Na),
        ("O", n_O),
    )
    composition = []
    for symbol, count in species_counts:
        # One single-atom Atoms object per species, repeated ``count`` times.
        composition.extend([Atoms(symbols=[symbol])] * count)
    generator = LiquidGenerator(engine="torch", composition=composition, density=density)
    atoms = generator.run(epochs=100)
    # Remove the generator's notebook output before returning.
    clear_output()
    return atoms
    

def get_random_glass(size="large"):
    """Generate a glass structure with a random SiO2/P2O5 ratio and density.

    The Al and Na counts are fixed; ``x`` SiO2 units (per 58 oxide units) are
    replaced by P2O5 units. ``x`` is drawn from 0..29, so the SiO2 fraction
    spans (41 - x) / 58 = 70.69 % .. 20.69 % and the P2O5 fraction spans
    0 % .. 50 %. The "large" cell holds exactly twice as many atoms as the
    "small" one for the same ``x``.

    Args:
        size: ``"small"`` (58 oxide units) or ``"large"`` (116 oxide units).

    Returns:
        The structure returned by ``get_glass`` for the sampled composition
        and a density drawn uniformly from [2.0, 2.3).

    Raises:
        ValueError: If ``size`` is neither ``"small"`` nor ``"large"``.
    """
    if size not in ("small", "large"):
        raise ValueError(f'size must be "small" or "large", got {size!r}')

    # The same substitution range is used for both sizes. Drawing x from a
    # wider range for the large cell would push the Si count (82 - 2x) to
    # zero or below and break the oxygen stoichiometry in get_glass.
    x = np.random.randint(0, 30)
    scale = 1 if size == "small" else 2
    n_Al = 16 * scale
    n_Na = 18 * scale
    n_Si = (41 - x) * scale
    n_P = 2 * x * scale
    density = np.random.uniform(2.0, 2.3)
    atoms = get_glass(n_Si, n_P, n_Al, n_Na, density)
    return atoms


## 2. Initial structures
* Download SiO2, Al2O3, Na2O and P2O5 crystal structures from Materials Project for initial dataset generation.

In [None]:
from mp_api.client import MPRester
from pymatgen.io.ase import AseAtomsAdaptor
import os
import numpy as np

def download_mp_structures(api_key, formula, limits=3):
    """Fetch structures for ``formula`` from the Materials Project.

    The summary endpoint is queried with ``theoretical=False``. When more
    than ``limits`` entries are returned, only the ``limits`` entries with
    the lowest ``energy_above_hull`` are kept; otherwise every entry is kept
    in the order returned by the API.

    Args:
        api_key: Materials Project API key passed to ``MPRester``.
        formula: Chemical formula to search for, e.g. ``"SiO2"``.
        limits: Maximum number of structures to return.

    Returns:
        A tuple ``(atoms_list, mp_id_list)``: the structures converted with
        ``AseAtomsAdaptor.get_atoms`` and the matching ``material_id`` values.
    """
    with MPRester(api_key) as mpr:
        docs = mpr.summary.search(
            formula=formula,
            theoretical=False,
            fields=["material_id", "structure", "energy_above_hull"],
        )
        if len(docs) <= limits:
            selected = np.arange(len(docs))
        else:
            hull_energies = [doc.energy_above_hull for doc in docs]
            selected = np.argsort(hull_energies)[:limits]

        atoms_list = []
        mp_id_list = []
        for doc in (docs[i] for i in selected):
            atoms_list.append(AseAtomsAdaptor.get_atoms(doc.structure))
            mp_id_list.append(doc.material_id)
        return atoms_list, mp_id_list
    

def expand_atoms(atoms, length=10.0):
    """Repeat ``atoms`` so that each cell edge is roughly ``length`` long.

    The repeat count along each axis is ``length`` divided by the cell edge
    length (first three entries of ``cellpar()``), rounded to the nearest
    integer and never less than 1.

    Args:
        atoms: Structure exposing ``get_cell().cellpar()`` and supporting
            multiplication by a 3-element repeat vector.
        length: Target edge length, in the same unit as the cell parameters.

    Returns:
        ``atoms`` multiplied by the per-axis repeat counts.
    """
    par = atoms.get_cell().cellpar()[:3]
    supercell = np.round(length / par, 0).astype(int)
    # An edge longer than 2 * length would round to 0 repeats and produce an
    # empty structure; keep at least one copy along every axis.
    supercell = np.maximum(supercell, 1)
    return atoms * supercell

In [None]:
api_key = "" # Please input your api_key, https://next-gen.materialsproject.org/api

structure_dir = Path("structures")
structure_dir.mkdir(parents=True, exist_ok=True)

all_structures = []

# Reuse the CIF files of an earlier run when at least three are present;
# otherwise download every oxide again and store the results as CIF.
cached_cifs = list(structure_dir.glob("*.cif"))
if len(cached_cifs) >= 3:
    for cif_path in cached_cifs:
        all_structures.append(read(cif_path))
else:
    for formula in ("SiO2", "Al2O3", "Na2O", "P2O5"):
        atoms_list, mp_id_list = download_mp_structures(api_key, formula)
        all_structures += atoms_list
        for atoms, mp_id in zip(atoms_list, mp_id_list):
            atoms.write(structure_dir / f"{formula}_{mp_id}.cif")

## 2. Initial dataset
* Make dataset with MD sampling
* The initial dataset includes SiO2, Al2O3, Na2O and P2O5 crystals downloaded from MP and 5 additional random structures generated by the `get_random_glass` function.

In [None]:
from light_pfp_data.utils.dataset import H5DatasetWriter

# Directory and file path of the initial training dataset (HDF5). The
# directory is created up front so the dataset writer can create the file.
init_dataset_dir = Path("init_dataset")
init_dataset_dir.mkdir(parents=True, exist_ok=True)
initial_dataset = init_dataset_dir / "init.h5"

In [None]:
from unittest.mock import patch
from pfp_api_client import Estimator, ASECalculator
from light_pfp_data.sample import sample_md


if not initial_dataset.exists():
    print(f"Dataset file {initial_dataset} is created. Start generating initial structures.")
    with patch('builtins.input', return_value='y'):
        dataset = H5DatasetWriter(initial_dataset)

    # Initialize estimator and calculator
    estimator = Estimator(model_version=model_version, calc_mode=calc_mode)
    calc = ASECalculator(estimator)
    
    from concurrent.futures import as_completed, ThreadPoolExecutor
    from tqdm.auto import tqdm
    
    futures = []
    pbar = tqdm(desc="Total progress", total=0, leave=True)
    with ThreadPoolExecutor(max_workers=8) as executor:

        def submit_sampling(structure):
            # Every structure is sampled with the same NVT schedule:
            # 2000 steps at each of 500, 1000 and 1500 K, with a sampling
            # interval of 100.
            return sample_md(
                input_structure=structure,
                calculator=calc,
                dataset=dataset,
                supercell=(1, 1, 1),
                sampling_temp=[500.0, 1000.0, 1500.0],
                sampling_steps=[2000, 2000, 2000],
                sampling_interval=[100, 100, 100],
                ensemble="nvt",
                executor=executor,
                pbar=pbar
            )

        # Crystal structures, expanded to roughly 10 A per cell edge.
        for atoms in all_structures:
            futures += submit_sampling(expand_atoms(atoms))
        # Five random small glass structures.
        for _ in range(5):
            futures += submit_sampling(get_random_glass("small"))
else:
    print(f"Dataset file {initial_dataset} already exists. Skip generating initial dataset.")
    with patch('builtins.input', return_value='y'):
        dataset = H5DatasetWriter(initial_dataset, mode="append") # automatically input 'y' when running in background

In [None]:
# Close the `h5` handle held by the dataset writer (presumably the HDF5 file
# behind init.h5) before the active-learning step reads the dataset.
dataset.h5.close()

## 3. Active learning
* Run active learning
* The structures are quite diverse (5 elements, random initial positions, variable compositions), which gives the LightPFP models a large force MAE. Accordingly, the sampling criterion is changed to "dF_min_coef = 8.0" to avoid over-sampling. (The default value "dF_min=1.0" might be too strict for this situation.)

In [None]:
import pathlib
import logging
from light_pfp_autogen.active_learning import ActiveLearning
from light_pfp_autogen.config import ActiveLearningConfig, TrainConfig, SampleConfig, CommonConfig, MTPConfig


# Show INFO-level log messages in the notebook output.
logging.basicConfig(level=logging.INFO)

# Active-learning setup: PFP reference settings, the initial dataset file,
# the training settings used in each iteration and the sampling thresholds.
active_learning_config = ActiveLearningConfig(
    task_name = "glass-test",
    work_dir = "./autogen_workdir",
    pfp_model_version = model_version,
    pfp_calc_mode = calc_mode,
    # Same file as `initial_dataset` created in the initial-dataset section.
    init_dataset = [
        "init_dataset/init.h5",
    ],
    train_config = TrainConfig(
        common_config = CommonConfig(max_forces=50.0),
        # The small pretrained model is used during active learning; the
        # final training further below switches to the large one.
        mtp_config = MTPConfig(pretrained_model="ALL_ELEMENTS_SMALL_NN_6")
    ),
    sample_config = SampleConfig(
        dE_min_coef = 3.0,
        dE_max_coef = 20.0,
        # Raised to 8.0 to avoid over-sampling (see the markdown note above).
        dF_min_coef = 8.0,
        dF_max = 50.0,
        dS_min_coef = 3.0,
        dS_max_coef = 20.0
    )
)

active_learning = ActiveLearning(active_learning_config)
active_learning.initialize()

* 0 ~ 4 iteration: get a random initial structure and just run NVT MD at 300~1200 K for 2 ps.
* The structure is small.

In [None]:
import numpy as np
from ase import units
from ase.md.langevin import Langevin
from ase.md.npt import NPT
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution
from light_pfp_data.utils.atoms_utils import convert_atoms_to_upper
from light_pfp_autogen.context import DataCollectionContext


class TemperatureControl:
    """MD observer that lowers the thermostat target linearly with step count.

    When called, the target temperature of ``md`` is set to
    ``temperature - cooling_rate * (md.nsteps - nsteps_at_creation)``,
    but never below ``min_temperature``.

    Args:
        md: Dynamics object exposing ``nsteps`` and
            ``set_temperature(temperature_K=...)``.
        temperature: Starting temperature in K.
        cooling_rate: Temperature decrease per MD step, in K/step.
        min_temperature: Lower bound for the target temperature in K. It
            keeps the target from becoming negative when the run lasts
            longer than ``temperature / cooling_rate`` steps.
    """

    def __init__(self, md, temperature, cooling_rate, min_temperature=0.0):
        # cooling rate: K/steps
        self.md = md
        self.cooling_rate = cooling_rate
        # Step counter at creation time, so cooling starts from "now" even
        # when the dynamics object has already been run.
        self.init_step = self.md.nsteps
        self.init_temp = temperature
        self.min_temp = min_temperature

    def __call__(self):
        elapsed_steps = self.md.nsteps - self.init_step
        temp = self.init_temp - elapsed_steps * self.cooling_rate
        self.md.set_temperature(temperature_K=max(temp, self.min_temp))

    
# Iterations up to 4. The range starts at active_learning.iter, so iterations
# already counted there are not repeated (presumably this lets a re-run of
# the cell resume; confirm against the autogen documentation).
for i in range(active_learning.iter, 5):
    print(f"Current active iteration: {i}")
    # 50 independent short trajectories per iteration.
    for _ in range(50):
        atoms = get_random_glass("small")
        # Random temperature between 300 K and 1200 K for this trajectory.
        temperature = np.random.uniform(300.0, 1200.0)
        MaxwellBoltzmannDistribution(atoms, temperature_K=temperature)
        clear_output()
        # NOTE(review): no calculator is assigned explicitly in this cell;
        # presumably the autogen helpers below take care of it -- confirm.
        atoms = convert_atoms_to_upper(atoms)

        # Langevin dynamics, 2000 steps with a units.fs time step (2 ps).
        md = Langevin(atoms, units.fs, temperature_K=temperature, friction=0.1)
        with DataCollectionContext(md=md, interval=100, max_samples=10):
            md.run(steps=2000)

        clear_output()
    # Finish the iteration after all 50 trajectories have been run.
    active_learning.update()

* 5~9 iteration: get a random initial structure, run NVT MD at 2000 K for 5 ps, and then anneal the system from 2000 K to 500 K with a 0.1 K/fs cooling rate (15 ps of NPT MD).
* The structure is large.

In [None]:
# Iterations up to 9: melt at 2000 K, then cool under NPT.
for i in range(active_learning.iter, 10):
    print(f"Current active iteration: {i}")
    # 10 melt-quench trajectories per iteration, each on a new large cell.
    for _ in range(10):
        atoms = get_random_glass("large")
        MaxwellBoltzmannDistribution(atoms, temperature_K=2000.0)
        clear_output()
        atoms = convert_atoms_to_upper(atoms)

        # Stage 1: Langevin dynamics at 2000 K, 5000 steps of units.fs (5 ps).
        md = Langevin(atoms, units.fs, temperature_K=2000.0, friction=0.1)
        with DataCollectionContext(md=md, interval=100, max_samples=25):
            md.run(steps=5000)
        
        # Stage 2: NPT dynamics with an external stress of 1 bar, starting
        # at 2000 K.
        md = NPT(
            atoms, 
            units.fs, 
            temperature_K=2000.0, 
            externalstress=units.bar,
            mask=np.eye(3),
            ttime=20.0 * units.fs,
            pfactor=2e6 * units.GPa * (units.fs**2)
        )
        with DataCollectionContext(md=md, interval=100, max_samples=50):
            # Lower the target by 0.1 K per step, updated every 10 steps;
            # after 15000 steps the target is 2000 - 1500 = 500 K.
            temperature_control = TemperatureControl(
                md, 2000.0, 0.1
            )
            md.attach(temperature_control, interval=10)
            md.run(steps=15000)

        clear_output()
    active_learning.update()

* 10~14 iteration: get a random initial structure, run NVT MD at 2000 K for 5 ps, and then anneal the system from 2000 K to 500 K with a 0.1 K/fs cooling rate. Finally, equilibrate the system at 500 K for another 20 ps.
* The structure is large.

In [None]:
# Iterations up to 14: melt at 2000 K, cool under NPT, then equilibrate.
for i in range(active_learning.iter, 15):
    print(f"Current active iteration: {i}")
    # 5 full melt-quench-equilibrate trajectories per iteration.
    for _ in range(5):
        atoms = get_random_glass("large")
        MaxwellBoltzmannDistribution(atoms, temperature_K=2000.0)
        clear_output()
        atoms = convert_atoms_to_upper(atoms)

        # Stage 1: Langevin dynamics at 2000 K, 5000 steps of units.fs (5 ps).
        md = Langevin(atoms, units.fs, temperature_K=2000.0, friction=0.1)
        with DataCollectionContext(md=md, interval=100, max_samples=25):
            md.run(steps=5000)
        
        # Stage 2: NPT dynamics with an external stress of 1 bar, cooled
        # from 2000 K.
        md = NPT(
            atoms, 
            units.fs, 
            temperature_K=2000.0, 
            externalstress=units.bar,
            mask=np.eye(3),
            ttime=20.0 * units.fs,
            pfactor=2e6 * units.GPa * (units.fs**2)
        )
        with DataCollectionContext(md=md, interval=100, max_samples=50):
            # Lower the target by 0.1 K per step, updated every 10 steps;
            # after 15000 steps the target is 2000 - 1500 = 500 K.
            temperature_control = TemperatureControl(
                md, 2000.0, 0.1
            )
            md.attach(temperature_control, interval=10)
            md.run(steps=15000)

        # Stage 3: NPT dynamics at the final 500 K for 20000 steps (20 ps).
        md = NPT(
            atoms, 
            units.fs, 
            temperature_K=500.0, 
            externalstress=units.bar,
            mask=np.eye(3),
            ttime=20.0 * units.fs,
            pfactor=2e6 * units.GPa * (units.fs**2)
        )
        with DataCollectionContext(md=md, interval=100, max_samples=50):
            md.run(steps=20000)
            
        clear_output()
    active_learning.update()

In [None]:
# Print the MD statistics gathered by the active-learning run.
active_learning.print_md_statistics()

# 4. Post active training
* Train a final LightPFP model with all the datasets collected during active learning.
* Pretrained model: ALL_ELEMENTS_LARGE_NN_6

In [None]:
from light_pfp_autogen.utils import submit_training_job, check_training_job_status, estimate_epoch


# Number of training epochs estimated from the collected datasets.
# NOTE(review): the meaning of the second argument (2) is not visible here;
# check the estimate_epoch documentation.
epoch = estimate_epoch(active_learning.datasets_list, 2)

# Final training settings: same max_forces as during active learning, but
# with the large pretrained model.
train_config_dict = {
    "common_config": {
        "total_epoch": epoch,
        "max_forces": 50.0
    },
    "mtp_config": {
        "pretrained_model": "ALL_ELEMENTS_LARGE_NN_6"
    },
}


train_config = TrainConfig.from_dict(
    train_config_dict
)

# Submit the training job on every dataset collected by active learning.
# The returned model_id is what the later LightPFP sections ask for.
model_id = submit_training_job(
    train_config,
    active_learning.datasets_list,
    "glass-test-3-final-large",
)

# Status is queried once here; re-run this cell to refresh it.
status = check_training_job_status(model_id)
print(f"Training job {model_id} status: {status}")

# 5. Run MD with PFP

**I suggest to run this section in a different notebook because it is independent from the main task**

* To evaluate the LightPFP model, we generate glass structures with 5 different compositions using PFP.
* The melt-quenching method is used.

In [None]:
from pfp_api_client import Estimator, ASECalculator

def md_simulation(atoms, name):
    """Run the melt-quench MD protocol with PFP and save the trajectory.

    Protocol (time step ``units.fs``):
      1. Langevin dynamics at 2000 K for 5000 steps.
      2. NPT dynamics for 15000 steps while ``TemperatureControl`` lowers the
         target temperature by 0.1 K per step (2000 K down to 500 K).
      3. NPT dynamics at 500 K for 20000 steps.

    A frame is written to ``pfp_md/{name}.traj`` every 1000 steps.

    Args:
        atoms: Initial structure; a 3x3x3 supercell of it is simulated.
        name: Base name of the output trajectory file.
    """
    # Local import: the cells before this definition import only `read` from
    # ase.io, so a global `Trajectory` would not exist yet when this runs.
    from ase.io import Trajectory

    calc = ASECalculator(Estimator(model_version="v7.0.0", calc_mode="crystal_u0"))
    atoms = atoms * (3, 3, 3)
    atoms.calc = calc
    MaxwellBoltzmannDistribution(atoms, temperature_K=2000.0)

    def make_npt(temperature_K):
        # Both NPT stages share every setting except the target temperature.
        return NPT(
            atoms,
            units.fs,
            temperature_K=temperature_K,
            externalstress=units.bar,
            mask=np.eye(3),
            ttime=20.0 * units.fs,
            pfactor=2e6 * units.GPa * (units.fs**2),
        )

    # The context manager closes the trajectory file even if a stage fails.
    with Trajectory(f"pfp_md/{name}.traj", "w", atoms=atoms) as traj:
        # Stage 1: melt.
        md = Langevin(atoms, units.fs, temperature_K=2000.0, friction=0.1)
        md.attach(traj, interval=1000)
        md.run(steps=5000)

        # Stage 2: quench.
        md = make_npt(2000.0)
        temperature_control = TemperatureControl(md, 2000.0, 0.1)
        md.attach(temperature_control, interval=10)
        md.attach(traj, interval=1000)
        md.run(steps=15000)

        # Stage 3: equilibrate at the final temperature.
        md = make_npt(500.0)
        md.attach(traj, interval=1000)
        md.run(steps=20000)

**Note: 5 MD tasks run in parallel, which might consume a lot of tokens**

In [None]:
from joblib.parallel import delayed, Parallel


md_init_dir = Path("md_init_struc")
md_init_dir.mkdir(exist_ok=True)

# Atom counts per composition, ordered as (n_Si, n_P, n_Al, n_Na).
composition_list = [
    [82, 0, 32, 36], 
    [78, 8, 32, 36], 
    [74, 16, 32, 36], 
    [60, 44, 32, 36], 
    [36, 92, 32, 36], 
]

md_init_params = []

for n_Si, n_P, n_Al, n_Na in composition_list:
    name = f"{n_Si}SiO2-{n_Al//2}Al2O3-{n_Na//2}Na2O-{n_P//2}P2O5"
    xyz_file = md_init_dir / f"{name}.xyz"
    # Generate the starting structure only once; later runs reuse the file.
    if not xyz_file.is_file():
        atoms = get_glass(n_Si, n_P, n_Al, n_Na)
        atoms.write(xyz_file)
    else:
        atoms = read(xyz_file)
    md_init_params.append((atoms, name))


md_dir = Path("pfp_md")
md_dir.mkdir(exist_ok=True)

# One worker per composition.
Parallel(n_jobs=5)(
    delayed(md_simulation)(atoms, name) for atoms, name in md_init_params
)

# 6. Run MD with LightPFP

* Generate the glass with LightPFP. The compositions and MD protocol are the same as above.

In [None]:
import numpy as np
from pathlib import Path
from ase import Atoms
from ase import units
from ase.io import read, Trajectory
from ase.md.langevin import Langevin
from ase.md.npt import NPT
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution
from light_pfp_client.estimator import Estimator
from light_pfp_client.ase_calculator import ASECalculator
from pfcc_extras.liquidgenerator.liquid_generator import LiquidGenerator
from IPython.display import clear_output


def md_simulation(atoms, name):
    """Run the melt-quench MD protocol on ``atoms`` and save the trajectory.

    ``atoms`` must already carry a calculator. Protocol (time step
    ``units.fs``):
      1. Langevin dynamics at 2000 K for 5000 steps.
      2. NPT dynamics for 15000 steps while ``TemperatureControl`` lowers the
         target temperature by 0.1 K per step (2000 K down to 500 K).
      3. NPT dynamics at 500 K for 20000 steps.

    A frame is written to ``light_pfp_md_large/{name}.traj`` every 1000
    steps.

    Args:
        atoms: Structure to simulate; it is modified in place by the MD.
        name: Base name of the output trajectory file.
    """
    MaxwellBoltzmannDistribution(atoms, temperature_K=2000.0)

    def make_npt(temperature_K):
        # Both NPT stages share every setting except the target temperature.
        return NPT(
            atoms,
            units.fs,
            temperature_K=temperature_K,
            externalstress=units.bar,
            mask=np.eye(3),
            ttime=20.0 * units.fs,
            pfactor=2e6 * units.GPa * (units.fs**2),
        )

    # The context manager closes the trajectory file even if a stage fails.
    with Trajectory(f"light_pfp_md_large/{name}.traj", "w", atoms=atoms) as traj:
        # Stage 1: melt.
        md = Langevin(atoms, units.fs, temperature_K=2000.0, friction=0.1)
        md.attach(traj, interval=1000)
        md.run(steps=5000)

        # Stage 2: quench.
        md = make_npt(2000.0)
        temperature_control = TemperatureControl(md, 2000.0, 0.1)
        md.attach(temperature_control, interval=10)
        md.attach(traj, interval=1000)
        md.run(steps=15000)

        # Stage 3: equilibrate at the final temperature.
        md = make_npt(500.0)
        md.attach(traj, interval=1000)
        md.run(steps=20000)

In [None]:
md_init_dir = Path("md_init_struc")
md_init_dir.mkdir(exist_ok=True)

# Atom counts per composition, ordered as (n_Si, n_P, n_Al, n_Na).
composition_list = [
    [82, 0, 32, 36], 
    [78, 8, 32, 36], 
    [74, 16, 32, 36], 
    [60, 44, 32, 36], 
    [36, 92, 32, 36], 
]

md_init_params = []

for n_Si, n_P, n_Al, n_Na in composition_list:
    name = f"{n_Si}SiO2-{n_Al//2}Al2O3-{n_Na//2}Na2O-{n_P//2}P2O5"
    xyz_file = md_init_dir / f"{name}.xyz"
    # Reuse the starting structure if an earlier run already wrote it, so the
    # same initial configuration can be shared between runs.
    if not xyz_file.is_file():
        atoms = get_glass(n_Si, n_P, n_Al, n_Na)
        atoms.write(xyz_file)
    else:
        atoms = read(xyz_file)
    md_init_params.append((atoms, name))

In [None]:
# Output directory for the LightPFP trajectories written by md_simulation.
md_dir = Path("light_pfp_md_large")
md_dir.mkdir(exist_ok=True)

# Placeholder: must be filled with the id of the final trained model before
# running. Note that this assignment overwrites any `model_id` variable left
# over from the training section.
model_id = "" # The model id of final training
calc = ASECalculator(Estimator(model_id=model_id))

# The compositions are simulated one after another with a shared calculator.
for atoms, name in md_init_params:
    # 3x3x3 supercell of the initial structure.
    atoms = atoms*(3,3,3)
    atoms.calc = calc
    md_simulation(atoms, name)

# 7. Evaluate

In [None]:
from pathlib import Path

eval_dir = Path("evaluate")
eval_dir.mkdir(exist_ok=True)

# Atom counts per composition, ordered as (n_Si, n_P, n_Al, n_Na).
composition_list = [
    [82, 0, 32, 36], 
    [78, 8, 32, 36], 
    [74, 16, 32, 36], 
    [60, 44, 32, 36], 
    [36, 92, 32, 36], 
]

# Trajectory base names, and the SiO2 / (SiO2 + P2O5) unit ratio used as the
# x axis of the density plot. Two P atoms make up one P2O5 unit.
name_list = []
SiO2_P2O5_ratio = []
for n_Si, n_P, n_Al, n_Na in composition_list:
    name_list.append(f"{n_Si}SiO2-{n_Al//2}Al2O3-{n_Na//2}Na2O-{n_P//2}P2O5")
    SiO2_P2O5_ratio.append((n_Si) / (n_Si + n_P//2))

## Density

* Let's compare the density of glass structures generated by PFP and LightPFP.
* My result is as below.

<img src="assets/density.png" width="500">

* The density agrees well with the PFP results:
    * Average error: 0.014g/cm^3
    * Max error: 0.028 g/cm^3

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from ase import units
from ase.io import Trajectory

def get_density(atoms):
    """Return the mass density of ``atoms``.

    The total mass is divided by ``units.kg`` and by the cell volume, then
    scaled by 1e27. The density plot further down labels this value as g/cm3.
    """
    total_mass = atoms.get_masses().sum() / units.kg
    return total_mass / atoms.get_volume() * 1e27

def get_density_traj(traj, last_n_frames=10):
    """Return the mean ``get_density`` over the last frames of ``traj``.

    Args:
        traj: Sliceable sequence of structures (e.g. an opened trajectory).
        last_n_frames: Number of trailing frames to average over.
    """
    tail_frames = traj[-last_n_frames:]
    densities = [get_density(frame) for frame in tail_frames]
    return np.mean(densities)


In [None]:
# Mean density over the last 10 saved frames of each trajectory, in the same
# order as name_list: PFP reference first, then LightPFP.
pfp_density = [get_density_traj(Trajectory(f"pfp_md/{name}.traj")) for name in name_list]
lpfp_density = [get_density_traj(Trajectory(f"light_pfp_md_large/{name}.traj")) for name in name_list]

In [None]:
# Density versus SiO2 / (SiO2 + P2O5) ratio for PFP and LightPFP; the figure
# is saved as density.png in the evaluation directory.
plt.figure()
plt.plot(SiO2_P2O5_ratio, pfp_density, label="PFP", marker="o")
plt.plot(SiO2_P2O5_ratio, lpfp_density, label="LightPFP", marker="o")
plt.xlabel("n_SiO2 / (n_SiO2 + n_P2O5)")
plt.ylabel("density (g/cm3)")
plt.legend()
plt.savefig(eval_dir / "density.png")

## RDF

* The radial distribution functions of the 5 glass structures are calculated and compared.
* My results are shown below. The LightPFP results agree well with the PFP results.

**Composition 1: 82SiO2-16Al2O3-18Na2O-0P2O5**

<img src="assets/rdf_82SiO2-16Al2O3-18Na2O-0P2O5.png" width="500">


**Composition 2: 78SiO2-16Al2O3-18Na2O-4P2O5**

<img src="assets/rdf_78SiO2-16Al2O3-18Na2O-4P2O5.png" width="500">

**Composition 3: 74SiO2-16Al2O3-18Na2O-8P2O5**

<img src="assets/rdf_74SiO2-16Al2O3-18Na2O-8P2O5.png" width="500">

**Composition 4: 60SiO2-16Al2O3-18Na2O-22P2O5**

<img src="assets/rdf_60SiO2-16Al2O3-18Na2O-22P2O5.png" width="500">

**Composition 5: 36SiO2-16Al2O3-18Na2O-46P2O5**

<img src="assets/rdf_36SiO2-16Al2O3-18Na2O-46P2O5.png" width="500">

In [None]:
# Compare the RDFs from the last 10 saved frames of the PFP and LightPFP
# trajectories, one figure per composition.
# NOTE(review): `plot_rdf` is neither defined nor imported anywhere in the
# visible notebook; import it from the package that provides it before
# running this cell.
# NOTE(review): the value 300 passed in the first argument looks like a
# temperature label, while the MD above ends at 500 K -- confirm.
for name in name_list:
    plot_rdf(
        [300],
        [Trajectory(f"pfp_md/{name}.traj")[-10:]],
        [Trajectory(f"light_pfp_md_large/{name}.traj")[-10:]],
        eval_dir / f"rdf_{name}.png"
    )

## Elastic modulus

* The elastic tensor of the glass is calculated with PFP and LightPFP.
* Because it is a little slow, I only calculated one composition, i.e. 74SiO2-16Al2O3-18Na2O-8P2O5.
* My result is


In [None]:
from ase.io import Trajectory
from matlantis_features.utils.calculators.pfp_api_calculator import pfp_estimator_fn
from light_pfp_client.estimator_fn import light_pfp_estimator_fn
from light_pfp_evaluate import evaluate_elastic

# Placeholder: fill in the id of the final trained model before running.
model_id = "" # Please input the model_id of final training
# Estimator factories for the PFP reference and for the LightPFP model.
estimator_fn_pfp = pfp_estimator_fn(model_version=model_version, calc_mode=calc_mode)
estimator_fn_light_pfp = light_pfp_estimator_fn(model_id=model_id)

# Only one composition is evaluated. Both estimators are applied to the same
# structure: the last frame of the PFP trajectory.
name = "74SiO2-16Al2O3-18Na2O-8P2O5"
atoms = Trajectory(f"pfp_md/{name}.traj")[-1]
evaluate_elastic(atoms, estimator_fn_pfp, estimator_fn_light_pfp, eval_dir / f"elastic_{name}.txt")