# LightPFP: Interfaces between anatase TiO2 and FCC Ni

**Note: Please run `pfp_md.ipynb` at the same time. `pfp_md.ipynb` generates several MD trajectories for the validation of the LightPFP models. As these tasks are completely independent of this notebook, please run both notebooks in parallel.**

In [None]:
# PFP model version; passed to the PFP Estimator and to the active-learning config later in this notebook.
model_version="v7.0.0"
# PFP calculation mode; used together with model_version wherever PFP is evaluated.
calc_mode="crystal_u0"
# Task name handed to the active-learning configuration.
task_name = "ni-tio2"

## 1. Initial structures

The initial structures for dataset generation include both the perfect crystal structures of Ni and TiO2 and interface structures generated with a random cut-and-concatenate method.

* Crystal Ni and TiO2 are downloaded from the Materials Project database.
  * Anatase TiO2: mp-390
  * Ni: mp-23
* Interface structures are generated with the following method

  <img src='./inputs/illustration.png' width=400>

In [None]:
# Download from mp

from mp_api.client import MPRester
from pymatgen.io.ase import AseAtomsAdaptor


def download_structure(mp_id, api_key):
    """
    Download a structure from Materials Project and convert it to ASE atoms.

    The conventional unit cell is requested (``conventional_unit_cell=True``)
    and the resulting pymatgen structure is converted with
    ``AseAtomsAdaptor.get_atoms``.

    Parameters:
        mp_id (str): The Materials Project ID of the structure, e.g. "mp-390".
        api_key (str): Your Materials Project API key.

    Returns:
        atoms (ase.Atoms or None): The structure as an ASE ``Atoms`` object,
            or ``None`` if the download or the conversion failed. In that case
            the error is printed instead of raised, so callers should check
            the return value before using it.
    """
    with MPRester(api_key) as m:
        try:
            # Get the structure by MP-ID and convert it to ASE.
            structure = m.get_structure_by_material_id(mp_id, conventional_unit_cell=True)
            atoms = AseAtomsAdaptor.get_atoms(structure)
        except Exception as e:
            # Best-effort behaviour is kept: report the problem and hand back
            # an explicit None rather than falling off the end of the function.
            print(f"Error downloading structure for MP-ID {mp_id}: {e}")
            return None
        print(f"Successfully downloaded structure for MP-ID: {mp_id}")
        return atoms

In [None]:
api_key = "" # Your Materials Project API KEY
# Fetch anatase TiO2 (mp-390) and Ni (mp-23) as ASE atoms.
# download_structure prints the error and returns None when a download fails,
# so check the printed output before running the following cells.
anatase_tio2 = download_structure("mp-390", api_key)
ni = download_structure("mp-23", api_key)

In [None]:
from utils import make_cubic_cut
from ase.filters import FrechetCellFilter
from ase.optimize.fire import FIRE
from pfp_api_client import ASECalculator, Estimator


def make_any_gb(atoms_1, atoms_2, length, cut1=2.0, cut2=3.0, margin=1.5):
    """
    Build a crude interface (grain-boundary-like) structure by stacking two cuts.

    1. 'make_cubic_cut' produces one block from 'atoms_1' (with 'cut1') and one
       from 'atoms_2' (with 'cut2'); per the original description these are
       cubes cut from large supercells with random orientation.
    2. The first block is shifted so that its lowest atom is at z = 0.
    3. The second block is placed above the first one along z, leaving a gap
       of 'margin' between the highest atom of the first block and the lowest
       atom of the second block.
    4. The cell is set to [length, length, highest z + margin].
    """
    lower = make_cubic_cut(atoms_1, length, cut1)
    upper = make_cubic_cut(atoms_2, length, cut2)

    # Put the bottom of the lower block on the z = 0 plane.
    lower.positions[:, 2] -= lower.positions[:, 2].min()
    interface_z = lower.positions[:, 2].max()

    # Lift the upper block so that it starts 'margin' above the lower block.
    shift = interface_z + margin - upper.positions[:, 2].min()
    upper.positions[:, 2] += shift
    cell_height = upper.positions[:, 2].max() + margin

    stacked = lower + upper
    stacked.set_cell([length, length, cell_height])
    return stacked


def make_any_gb_w_opt(atoms_1, atoms_2, length, cut1=2.0, cut2=2.0, margin=1.5, steps=100, fmax=5.0):
    """
    Generate an interface structure with 'make_any_gb' and pre-relax it with PFP.

    A PFP calculator built from the notebook-level 'model_version' and
    'calc_mode' is attached to the structure, which is then optimized with FIRE
    through a FrechetCellFilter for at most 'steps' steps or until the 'fmax'
    criterion is met. The structure is returned with the calculator attached.
    """
    interface = make_any_gb(atoms_1, atoms_2, length, cut1, cut2, margin)

    estimator = Estimator(model_version=model_version, calc_mode=calc_mode)
    interface.calc = ASECalculator(estimator)

    # The filter exposes the cell degrees of freedom to the optimizer as well.
    optimizer = FIRE(FrechetCellFilter(interface))
    optimizer.run(steps=steps, fmax=fmax)
    return interface

In [None]:
# Generate 10 pre-relaxed TiO2/Ni interface structures (length=15); each call builds a new structure.
init_gb_structures = [make_any_gb_w_opt(anatase_tio2, ni, 15) for _ in range(10)]

## 2. Initial training dataset

* MD and rattle sampling methods are used for dataset collection.
* The initial structures are the crystal and interface structures generated in the section above.

In [None]:
from pathlib import Path
from concurrent.futures import as_completed, ThreadPoolExecutor
import numpy as np
from h5py import File
from tqdm.auto import tqdm

from pfp_api_client import Estimator, ASECalculator
from light_pfp_data.utils.dataset import H5DatasetWriter
from light_pfp_data.sample.crystal import sample_md, sample_rattle, sample_compress, sample_deformed, sample_vacancy, sample_surface, sample_displaced
from utils import expand_supercell


# Create a folder for the initial dataset if it does not exist.
init_dataset_dir = Path("init_dataset")
init_dataset_dir.mkdir(parents=True, exist_ok=True)

# Define the initial dataset file path.
initial_dataset = init_dataset_dir / "init.h5"

# Check for existence of dataset file; if exists, load it. Otherwise, sample to generate new data.
if initial_dataset.exists():
    print(f"Dataset file {initial_dataset} already exists. Skipping dataset generation.")
    # NOTE(review): here the writer receives an h5py File object, while the branch below passes
    # the path itself. The handle is only opened so that the close call at the end works.
    dataset = H5DatasetWriter(File(initial_dataset))
else:
    print(f"Dataset file {initial_dataset} does not exist. Starting initial dataset sampling.")
    dataset = H5DatasetWriter(initial_dataset)
    estimator = Estimator(model_version=model_version, calc_mode=calc_mode)
    calc = ASECalculator(estimator)
    
    # For multithreaded execution: every sample_* call gets the executor and returns futures
    # that are accumulated in this list.
    futures = []
    pbar = tqdm(desc="Total progress", total=0, leave=True)
    
    # "init_gb_structures" comes from the interface generation cell above.
    # It contains the pre-relaxed TiO2/Ni interface structures.
    with ThreadPoolExecutor(max_workers=16) as executor:
        for atoms in init_gb_structures:
            # MD sampling with NVT ensemble at high temperatures for robust coverage.
            # Temperatures: 1000K, 3000K, 5000K
            # For each temperature, run 5000 steps and sample one structure every 100 steps.
            futures += sample_md(
                input_structure=atoms,
                calculator=calc,
                dataset=dataset,
                supercell=(1,1,1),
                sampling_temp=[1000.0, 3000.0, 5000.0],
                sampling_steps=[5000, 5000, 5000],
                sampling_interval=[100, 100, 100],
                ensemble="nvt",
                executor=executor,
                pbar=pbar
            )
            # MD sampling with NPT ensemble.
            # Temperatures: 1000K, 3000K, 5000K; Pressure: 1.0 (bar according to the original
            # note - confirm the unit against the sample_md documentation).
            futures += sample_md(
                input_structure=atoms,
                calculator=calc,
                dataset=dataset,
                supercell=(1,1,1),
                sampling_temp=[1000.0, 3000.0, 5000.0],
                sampling_pressure=[1.0, 1.0, 1.0],
                sampling_steps=[5000, 5000, 5000],
                sampling_interval=[100, 100, 100],
                ensemble="npt",
                executor=executor,
                pbar=pbar
            )
            # Rattle sampling: 10 samples with stdev=0.1.
            futures += sample_rattle(
                input_structure=atoms,
                calculator=calc,
                dataset=dataset,
                stdev=0.1,
                n_sample=10,
                supercell=(1, 1, 1),
                executor=executor,
                pbar=pbar
            )
        for atoms in [anatase_tio2, ni]:
            # Bulk crystals: expand the unit cell with expand_supercell before sampling.
            # MD sampling with NVT ensemble at high temperatures for robust coverage.
            # Temperatures: 1000K, 3000K, 5000K
            # For each temperature, run 2000 steps and sample one structure every 100 steps.
            atoms = expand_supercell(atoms)
            futures += sample_md(
                input_structure=atoms,
                calculator=calc,
                dataset=dataset,
                supercell=(1,1,1),
                sampling_temp=[1000.0, 3000.0, 5000.0],
                sampling_steps=[2000, 2000, 2000],
                sampling_interval=[100, 100, 100],
                ensemble="nvt",
                executor=executor,
                pbar=pbar
            )
            # MD sampling with NPT ensemble.
            # Temperatures: 500K, 1500K, 2500K; Pressure: 1.0 (bar according to the original
            # note - confirm the unit against the sample_md documentation).
            futures += sample_md(
                input_structure=atoms,
                calculator=calc,
                dataset=dataset,
                supercell=(1,1,1),
                sampling_temp=[500.0, 1500.0, 2500.0],
                sampling_pressure=[1.0, 1.0, 1.0],
                sampling_steps=[2000, 2000, 2000],
                sampling_interval=[100, 100, 100],
                ensemble="npt",
                executor=executor,
                pbar=pbar
            )
            # Rattle sampling: 10 samples with stdev=0.1.
            futures += sample_rattle(
                input_structure=atoms,
                calculator=calc,
                dataset=dataset,
                stdev=0.1,
                n_sample=10,
                supercell=(1, 1, 1),
                executor=executor,
                pbar=pbar
            )
    
    
    # Wait for all sampling tasks to complete.
    # Calling result() also re-raises any exception that occurred inside a sampling task.
    for future in as_completed(futures):
        _ = future.result()

# Close the dataset file.
dataset.h5.close()

## 3. Active learning

* Initialize the active learning task

In [None]:
import logging
from light_pfp_autogen.active_learning import ActiveLearning
from light_pfp_autogen.config import ActiveLearningConfig, TrainConfig, SampleConfig, CommonConfig, MTPConfig


# Configure logging to show active learning process details.
logging.basicConfig(level=logging.INFO)

# Active-learning settings. The PFP model version / calculation mode and the task name are the
# values defined in the first cell; the initial dataset is the file written by the sampling cell.
# NOTE(review): the meaning and units of training_time and of the dE/dF/dS thresholds in
# SampleConfig are defined by light_pfp_autogen - confirm against its documentation before tuning.
active_learning_config = ActiveLearningConfig(
    task_name=task_name,
    work_dir="autogen_workdir",
    pfp_model_version=model_version,
    pfp_calc_mode=calc_mode,
    init_dataset=["init_dataset/init.h5"],
    training_time=1.0,
    train_config=TrainConfig(
        common_config=CommonConfig(max_forces=50.0),
        mtp_config=MTPConfig(pretrained_model="ALL_ELEMENTS_SMALL_NN_6")
    ),
    sample_config=SampleConfig(
        dE_min_coef=3.0,
        dE_max_coef=20.0,
        dF_min_coef=8.0,
        dF_max=50.0,
        dS_min_coef=3.0,
        dS_max_coef=20.0,
    )
)

# Initialize the active learning task with the specified configuration.
active_learning = ActiveLearning(active_learning_config)
active_learning.initialize()

* Define the MD script for active learning

In [None]:
import numpy as np
from ase import units
from ase.md.langevin import Langevin
from ase.md.nvtberendsen import NVTBerendsen
from ase.md.npt import NPT
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution
from ase.optimize.fire import FIRE
from ase.filters import FrechetCellFilter
from IPython.display import clear_output
from light_pfp_autogen.context import DataCollectionContext
from ase import Atoms


def convert_atoms_to_upper(atoms: Atoms) -> Atoms:
    """
    Return a rotated copy of `atoms` whose cell matrix is upper triangular.

    The input object is not modified. Atoms and cell are rotated together, and
    the three entries below the diagonal of the cell matrix are finally forced
    to exactly 0.0 to remove numerical noise left by the rotations.
    """
    result = atoms.copy()

    # Rotation 1: bring cell vector "c" onto the z-axis.
    result.rotate(result.cell[2], (0, 0, 1), rotate_cell=True)

    # Rotation 2: turn about z until cell vector "b" lies in the yz-plane.
    # Cell vector "a" may end up pointing anywhere.
    b_vector = result.cell[1]
    azimuth = np.rad2deg(np.arctan2(b_vector[1], b_vector[0]))
    result.rotate(90.0 - azimuth, "z", rotate_cell=True)

    # Clean-up: the entries below the diagonal must be exactly zero.
    cell_matrix = result.get_cell()[:]
    below_diagonal = ((1, 0), (2, 0), (2, 1))
    if any(cell_matrix[index] != 0.0 for index in below_diagonal):
        for index in below_diagonal:
            cell_matrix[index] = 0.0
        result.set_cell(cell_matrix)
    return result


def active_learning_protocol(atoms, temperature, steps_npt, steps_nvt):
    """
    MD and optimization workflow that is run repeatedly during active learning.

    Stages (each wrapped in a DataCollectionContext with interval=100):
    1. Langevin (NVT) MD at `temperature` K, 2000 steps.
    2. NPT MD at `temperature` K, `steps_npt` steps.
    3. Langevin (NVT) MD at `temperature + 500` K, `steps_nvt` steps.
    4. FIRE structure optimization, 200 steps.

    Parameters:
        atoms: The initial structure. A rotated copy with an upper-triangular
            cell is simulated; the object passed in is not modified.
        temperature: The temperature of the MD simulation in K.
        steps_npt: The number of steps for the NPT MD at `temperature` K.
        steps_nvt: The number of steps for the NVT MD at `temperature+500` K.
    """
    timestep = 1.0 * units.fs

    def run_with_collection(driver, **run_kwargs):
        # Run one stage while the active-learning data collection is active.
        with DataCollectionContext(md=driver, interval=100):
            driver.run(**run_kwargs)

    # The cell is made upper triangular before the NPT stage is set up.
    system = convert_atoms_to_upper(atoms)

    # Draw the initial velocities from a Maxwell-Boltzmann distribution.
    MaxwellBoltzmannDistribution(system, temperature_K=temperature)

    # Stage 1: short NVT equilibration.
    equilibration = Langevin(system, timestep, temperature_K=temperature, friction=0.1)
    run_with_collection(equilibration, steps=2000)

    # Stage 2: longer NPT run to generate diverse training structures.
    npt_run = NPT(
        system,
        timestep,
        temperature_K=temperature,
        externalstress=1.0 * units.bar,
        mask=np.eye(3),
        ttime=20.0 * units.fs,
        pfactor=2e6 * units.GPa * (units.fs**2),
    )
    run_with_collection(npt_run, steps=steps_npt)

    # Stage 3: NVT run at an elevated temperature.
    hot_run = Langevin(system, timestep, temperature_K=temperature+500.0, friction=0.1)
    run_with_collection(hot_run, steps=steps_nvt)

    # Stage 4: optimization; with fmax=0.0 the step limit is the effective stop criterion.
    relaxation = FIRE(system)
    run_with_collection(relaxation, fmax=0.0, steps=200)

    clear_output()


In [None]:
# Iteration 0~4: Small structures (15x15x30 A)
# The loop starts at active_learning.iter, presumably so that a restarted notebook resumes
# where it stopped - confirm against the ActiveLearning documentation.
for i in range(active_learning.iter, 5):
    print(f"Active learning iteration: {i} (small structure)")
    # Repeat GB structure generation and MD/OPT 10 times, each at a random temperature
    # drawn uniformly between 500 and 2000 K.
    for _ in range(10):
        temperature = np.random.uniform(500, 2000)
        atoms = make_any_gb_w_opt(anatase_tio2, ni, 15)
        print(f"Running MD for GB: size = {len(atoms)}, temperature = {temperature:.1f} K")
        active_learning_protocol(atoms, temperature, steps_npt=15000, steps_nvt=5000)
    active_learning.update()


# Iteration 5~9: Large structures (20x20x40 A)
for i in range(active_learning.iter, 10):
    print(f"Active learning iteration: {i} (large structure)")
    # Repeat GB structure generation and MD/OPT 5 times, with twice as many MD steps
    # as for the small structures.
    for _ in range(5):
        temperature = np.random.uniform(500, 2000)
        atoms = make_any_gb_w_opt(anatase_tio2, ni, 20)
        print(f"Running MD for GB: size = {len(atoms)}, temperature = {temperature:.1f} K")
        active_learning_protocol(atoms, temperature, steps_npt=30000, steps_nvt=10000)
    active_learning.update()

active_learning.print_md_statistics()

## 4. Final stage training

* Train the model for more epochs with the initial dataset and all datasets generated during active learning.
* We submit two jobs at the same time: one for a small model and one for a large model.

In [None]:
from light_pfp_autogen.utils import submit_training_job, check_training_job_status, estimate_epoch

# Number of epochs for the final training, estimated from all collected datasets.
# NOTE(review): the meaning of the second argument (4) is defined by estimate_epoch - confirm.
epoch = estimate_epoch(active_learning.datasets_list, 4)

# Train small model
# Same max_forces and pretrained model as in the active-learning TrainConfig, plus total_epoch.
train_config_dict = {
    "common_config": {
        "total_epoch": epoch,
        "max_forces": 50.0
    },
    "mtp_config": {
        "pretrained_model": "ALL_ELEMENTS_SMALL_NN_6"
    },
}

training_config = TrainConfig.from_dict(
    train_config_dict
)

# The returned model id is used later to wait for the job and to load the trained model.
model_id_small = submit_training_job(
    training_config,
    active_learning.datasets_list,
    "ni_tio2",
)

# Train large model
# Identical settings except for the pretrained model.
train_config_dict = {
    "common_config": {
        "total_epoch": epoch,
        "max_forces": 50.0
    },
    "mtp_config": {
        "pretrained_model": "ALL_ELEMENTS_LARGE_NN_6"
    },
}

training_config = TrainConfig.from_dict(
    train_config_dict
)

model_id_large = submit_training_job(
    training_config,
    active_learning.datasets_list,
    "ni_tio2_large",
)

## 5. Run MD with LightPFP

* Define the MD script

In [None]:
import numpy as np
from time import perf_counter
from ase import units
from ase.io import Trajectory
from ase.md.langevin import Langevin
from ase.md.nvtberendsen import NVTBerendsen
from ase.md.npt import NPT
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution


class PrintDyn:
    """
    Progress reporter meant to be attached to a dynamics object.

    Each call prints one line with the step number, total energy, potential
    energy, density, temperature and the wall-clock time elapsed since this
    reporter was created. If `logfile` is given, the file is created (or
    truncated) with a header line and every printed line is appended to it.
    """

    def __init__(self, dyn, logfile=None):
        self.dyn = dyn
        self.st = perf_counter()  # reference time for the elapsed-time column
        self.logfile = logfile
        if logfile is None:
            return
        with open(logfile, "w") as fd:
            fd.write("# step E_tot E_pot density T elapsed_time\n")

    def __call__(self):
        dyn = self.dyn
        atoms = dyn.atoms
        # Columns in the same order as the header written by __init__.
        columns = (
            f"{dyn.get_number_of_steps(): 6d}",
            f"{atoms.get_total_energy():.3f}",
            f"{atoms.get_potential_energy():.3f}",
            f"{atoms.get_masses().sum() / units.kg / atoms.get_volume() * 1e27:.5f}",
            f"{atoms.get_temperature():.1f}",
            f"{perf_counter() - self.st:.2f}",
        )
        line = " ".join(columns)
        print(line)
        if self.logfile is None:
            return
        with open(self.logfile, "a") as fd:
            fd.write(line+"\n")
                

def md_protocol(atoms, temperature, steps_nvt, steps_npt, traj):
    """
    Run a two-stage MD task and write the trajectory to `traj`.

    1. Langevin (NVT) MD, `steps_nvt` steps
    2. NPT MD, `steps_npt` steps

    Both stages print a progress line and write a trajectory frame every
    100 steps. The trajectory file is closed when the function returns, also
    when one of the MD stages raises.

    Parameters:
        atoms: The initial structure, with a calculator attached. It is
            modified in place by the MD runs.
        temperature: The temperature of MD simulation.
        steps_nvt: The number of steps for the NVT MD at `temperature` K.
        steps_npt: The number of steps for the NPT MD at `temperature` K.
        traj: The path to save MD trajectory.
    """
    # Use a separate name for the writer so the `traj` argument keeps meaning "path".
    writer = Trajectory(traj, "w", atoms=atoms)
    try:
        # Initialize atomic velocities according to Maxwell-Boltzmann distribution.
        MaxwellBoltzmannDistribution(atoms, temperature_K=temperature)

        # Run a short NVT MD simulation (Langevin dynamics) to equilibrate the structure.
        md = Langevin(atoms, 1.0 * units.fs, temperature_K=temperature, friction=0.1)
        print_info = PrintDyn(md)
        md.attach(print_info, interval=100)
        md.attach(writer.write, interval=100)
        md.run(steps=steps_nvt)

        # Then run the longer NPT MD simulation.
        # A new PrintDyn is created, so the elapsed-time column restarts for this stage.
        md = NPT(
            atoms,
            1.0 * units.fs,
            temperature_K=temperature,
            externalstress=1.0 * units.bar,
            mask=np.eye(3),
            ttime=20.0 * units.fs,
            pfactor=2e6 * units.GPa * (units.fs**2)
        )
        print_info = PrintDyn(md)
        md.attach(print_info, interval=100)
        md.attach(writer.write, interval=100)
        md.run(steps=steps_npt)
    finally:
        # Without this the trajectory file handle stays open after every task.
        writer.close()


### 5.1 MD with small LightPFP model

* 6 interface structures generated by Pymatgen are used as the initial structures for MD.
  * Low Miller index surfaces are used here, and Pymatgen searched for structures with low strain.
  * These interface structures are not included in the training datasets.
* MD tasks are run at 1200, 1400, and 1600 K.

In [None]:
from pathlib import Path
from ase.io import read
from light_pfp_client import ASECalculator, Estimator
from light_pfp_data.utils.atoms_utils import convert_atoms_to_upper


# Per the original note, this call blocks until the training of the SMALL model is finished.
status = check_training_job_status(model_id_small)
print(f"Training job {model_id_small} status: {status}")

# Output directory for the trajectories of the small model.
workdir = "light_pfp_md_small"
md_dir = Path(workdir)
md_dir.mkdir(exist_ok=True)

# LightPFP calculator backed by the trained small model.
calc = ASECalculator(Estimator(model_id=model_id_small))

# Six interface structures, each simulated at three temperatures.
for task in [f"anatase_TiO2_Ni_{index}" for index in range(6)]:
    for temperature in (1200, 1400, 1600):
        # Re-read the structure for every run so each MD starts from the same configuration.
        atoms = convert_atoms_to_upper(read(f"inputs/{task}.xyz"))
        atoms.calc = calc
        traj_path = md_dir/f"light_pfp_md_{task}_{temperature}.traj"
        md_protocol(atoms, temperature, 5000, 100000, traj_path)

### 5.2 MD with large LightPFP model

* The same MD tasks are run as above, except that the large LightPFP model is used.

In [None]:
from pathlib import Path
from ase.io import read
from light_pfp_client import ASECalculator, Estimator
from light_pfp_data.utils.atoms_utils import convert_atoms_to_upper


# Check (and, per the note in the small-model cell, wait for) the training job of the LARGE model.
status = check_training_job_status(model_id_large)
print(f"Training job {model_id_large} status: {status}")

# Output directory for the trajectories of the large model.
workdir = "light_pfp_md_large"

md_dir = Path(workdir)
md_dir.mkdir(exist_ok=True)

# LightPFP calculator backed by the trained large model.
calc = ASECalculator(Estimator(model_id=model_id_large))

for task in ["anatase_TiO2_Ni_0", "anatase_TiO2_Ni_1", "anatase_TiO2_Ni_2", "anatase_TiO2_Ni_3", "anatase_TiO2_Ni_4", "anatase_TiO2_Ni_5"]: 
    for temperature in [1200, 1400, 1600]:
        # Read from "inputs/", the same directory the small-model cell uses, so that both
        # models start from identical structures (this cell previously read "structures/").
        atoms = read(f"inputs/{task}.xyz")
        atoms = convert_atoms_to_upper(atoms)
        atoms.calc = calc
        md_protocol(atoms, temperature, 5000, 100000, md_dir/f"light_pfp_md_{task}_{temperature}.traj")

**After finishing this notebook and `pfp_md.ipynb` (which generates the PFP MD trajectories used for the comparison and validation of the LightPFP models), please go ahead and run `validate.ipynb`, where the "density", "radial distribution function" and "diffusion behavior" are compared between the LightPFP trajectories and the PFP trajectories.**