Copyright Preferred Networks inc. as contributors to Matlantis contrib project.

# LightPFP: CO2 diffusion in MOF-5

We follow the paper ["Storage and Diffusion of Carbon Dioxide in the Metal Organic Framework MOF-5─A Semi-empirical Molecular Dynamics Study"](https://pubs.acs.org/doi/full/10.1021/acs.jpcb.3c04155)


In [None]:
# PFP model version and calculation mode. Both values are passed to the PFP
# Estimator used for the initial dataset and to the active-learning config.
model_version = "v7.0.0"
calc_mode = "crystal_u0_plus_d3"

## Setup

In [None]:
# Install the LightPFP client, data, evaluate and autogen packages at pinned versions.
! pip install light-pfp-client==1.0.0 light-pfp-data==1.0.0 light-pfp-evaluate==1.0.0 light-pfp-autogen==0.1.3

## 1. Initial structure

* We define a helper function that adds a CO2 gas molecule to the MOF at a random position.

In [None]:
import numpy as np
from pathlib import Path
from ase.io import read
from ase.build import molecule

def add_CO2(atoms, min_dist=2.5, max_tries=20):
    """Return a copy of ``atoms`` with one CO2 molecule added at a random position.

    The molecule is shifted by a uniformly random fractional offset of the cell
    and accepted only if every one of its atoms is farther than ``min_dist``
    (minimum-image convention) from every atom already present in ``atoms``.
    The input structure itself is not modified.

    Args:
        atoms: Host structure; the new molecule is appended after its atoms.
        min_dist: Minimum allowed distance between a CO2 atom and any existing
            atom, in the same length unit as the atomic positions.
        max_tries: Maximum number of random placements to attempt.

    Returns:
        A new structure containing ``atoms`` followed by the CO2 atoms.

    Raises:
        ValueError: If no acceptable position is found within ``max_tries``
            attempts.
    """
    n_atoms = len(atoms)
    host_indices = np.arange(n_atoms, dtype=int)
    cell = atoms.get_cell()[:]

    for _ in range(max_tries):
        co2 = molecule("CO2")
        # One random fractional coordinate, converted to a Cartesian shift.
        co2.positions += np.dot(np.random.random([1, 3]), cell)

        atoms_trial = atoms.copy()
        atoms_trial += co2

        # The new atoms occupy the last len(co2) indices of the trial structure.
        new_indices = range(n_atoms, n_atoms + len(co2))
        # An empty host has nothing to collide with (and .min() of an empty
        # distance array would fail), so accept the first placement.
        if n_atoms == 0 or all(
            atoms_trial.get_distances(idx, host_indices, mic=True).min() > min_dist
            for idx in new_indices
        ):
            return atoms_trial

    raise ValueError(
        f"Could not place a CO2 molecule farther than {min_dist} from all "
        f"existing atoms within {max_tries} random trials."
    )


def add_multi_CO2(atoms, n):
    """Return a copy of ``atoms`` loaded with ``n`` CO2 molecules.

    The molecules are inserted one after another with ``add_CO2``, so each new
    molecule is also checked against the molecules added before it. The input
    structure is left untouched.
    """
    loaded = atoms.copy()
    for _insertion in range(n):
        loaded = add_CO2(loaded)
    return loaded

* Generate the initial structures:
* a single MOF unit cell loaded with 1, 4, 8 and 16 CO2 molecules.

In [None]:
# Load the empty MOF framework from the CIF file.
mof = read("init_structure/MOF5.cif")

# Build four loaded variants of the unit cell (1, 4, 8 and 16 CO2 molecules);
# each one is generated independently, starting from the empty framework.
mof_1co2, mof_4co2, mof_8co2, mof_16co2 = (
    add_multi_CO2(mof, n_co2) for n_co2 in (1, 4, 8, 16)
)

## 2.  Initial dataset
* MD sampling

In [None]:
from light_pfp_data.utils.dataset import H5DatasetWriter

# Directory and HDF5 file that hold the initial training dataset.
init_dataset_dir = Path("init_dataset")
init_dataset_dir.mkdir(parents=True, exist_ok=True)  # no error if it already exists
initial_dataset = init_dataset_dir / "init.h5"

In [None]:
from unittest.mock import patch
from pfp_api_client import Estimator, ASECalculator
from light_pfp_data.sample import sample_md


if initial_dataset.exists():
    # Re-use the existing file: open it in append mode and skip the sampling.
    print(f"Dataset file {initial_dataset} already exists. Skip generating initial dataset.")
    with patch('builtins.input', return_value='y'):
        dataset = H5DatasetWriter(initial_dataset, mode="append") # automatically input 'y' when running in background
else:
    from concurrent.futures import as_completed, ThreadPoolExecutor
    from tqdm.auto import tqdm

    print(f"Dataset file {initial_dataset} is created. Start generating initial structures.")
    with patch('builtins.input', return_value='y'):
        dataset = H5DatasetWriter(initial_dataset)

    # Initialize estimator and calculator
    estimator = Estimator(model_version=model_version, calc_mode=calc_mode)
    calc = ASECalculator(estimator)

    futures = []
    # NOTE(review): the bar starts with total=0; presumably sample_md raises the
    # total as it schedules jobs - confirm against the light_pfp_data docs.
    pbar = tqdm(desc="Total progress", total=0, leave=True)
    try:
        with ThreadPoolExecutor(max_workers=8) as executor:
            # Schedule MD sampling for the empty framework and each loaded structure.
            for atoms in [mof, mof_1co2, mof_4co2, mof_8co2, mof_16co2]:
                futures += sample_md(
                    input_structure=atoms,
                    calculator=calc,
                    dataset=dataset,
                    supercell=(1, 1, 1),
                    sampling_temp=[300.0, 500.0, 1000.0, 1500.0],
                    sampling_steps=[5000, 5000, 2000, 2000],
                    sampling_interval=[100, 100, 100, 100],
                    ensemble="nvt",
                    executor=executor,
                    pbar=pbar
                )

            for f in as_completed(futures):
                # result() re-raises any exception that occurred in the worker.
                _ = f.result()
    finally:
        # Always release the progress bar, also when a sampling job failed.
        pbar.close()

In [None]:
# Close the file handle held by the dataset writer opened in the previous cell.
dataset.h5.close()

## 3. Active learning

In [None]:
import pathlib
import logging
from IPython.display import clear_output
from light_pfp_autogen.active_learning import ActiveLearning
from light_pfp_autogen.config import ActiveLearningConfig, TrainConfig, SampleConfig, CommonConfig, MTPConfig


# Show INFO-level log messages in the notebook output.
logging.basicConfig(level=logging.INFO)

# Active-learning settings: the PFP model version / calculation mode defined at
# the top of the notebook, the initial dataset, and the training configuration.
active_learning_config = ActiveLearningConfig(
    task_name = "mof-co2-test-1",
    work_dir = "./autogen_workdir_1",
    pfp_model_version = model_version,
    pfp_calc_mode = calc_mode,
    init_dataset = [
        "init_dataset/init.h5",  # same path as `initial_dataset` created in section 2
    ],
    train_config = TrainConfig(
        common_config = CommonConfig(max_forces=50.0),
        mtp_config = MTPConfig(pretrained_model="ORGANIC_SMALL_NN")
    )
)

# Create the active-learning driver and initialize it before the MD loops below.
active_learning = ActiveLearning(active_learning_config)
active_learning.initialize()

In [None]:
import numpy as np
from ase import units
from ase.md.npt import NPT
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution
from light_pfp_autogen.context import DataCollectionContext


# Active-learning iterations on the unit cell. The loop starts at
# active_learning.iter and stops once iteration index 3 is reached.
for i in range(active_learning.iter, 3):
    print(f"Current active iteration: {i}")
    for _ in range(5):
        # np.random.randint excludes the upper bound, so 17 is needed to sample
        # loadings from 0 up to and including 16 CO2 molecules, the largest
        # loading used in the initial dataset and in the production runs.
        n_CO2 = np.random.randint(0, 17)
        atoms = add_multi_CO2(mof, n_CO2)
        temperature = np.random.uniform(200.0, 500.0)
        MaxwellBoltzmannDistribution(atoms, temperature_K=temperature)

        # NPT MD with a 1 fs time step at 1 bar and the sampled temperature.
        md = NPT(
            atoms,
            units.fs,
            temperature_K=temperature,
            externalstress=units.bar,
            mask=np.eye(3),
            ttime=20.0 * units.fs,
            pfactor=2e6 * units.GPa * (units.fs**2)
        )
        # NOTE(review): no calculator is attached to `atoms` in this cell;
        # presumably DataCollectionContext provides it and collects the
        # training structures during the run - confirm against its docs.
        with DataCollectionContext(md=md, interval=100, max_samples=50):
            md.run(steps=20000)

        clear_output()
    # Called once per iteration, after its five MD runs.
    active_learning.update()

In [None]:
# Active-learning iterations on a 2x2x2 supercell. The loop starts at
# active_learning.iter and stops once iteration index 6 is reached.
for i in range(active_learning.iter, 6):
    print(f"Current active iteration: {i}")
    for _ in range(5):
        # np.random.randint excludes the upper bound, so 17 is needed to sample
        # loadings from 0 up to and including 16 CO2 molecules per unit cell,
        # the largest loading used in the initial dataset and in production.
        n_CO2 = np.random.randint(0, 17)
        # Load one unit cell first, then repeat it; every cell of the supercell
        # therefore carries the same CO2 arrangement at the start.
        atoms = add_multi_CO2(mof, n_CO2) * (2,2,2)
        temperature = np.random.uniform(200.0, 500.0)
        MaxwellBoltzmannDistribution(atoms, temperature_K=temperature)

        # NPT MD with a 1 fs time step at 1 bar and the sampled temperature.
        md = NPT(
            atoms,
            units.fs,
            temperature_K=temperature,
            externalstress=units.bar,
            mask=np.eye(3),
            ttime=20.0 * units.fs,
            pfactor=2e6 * units.GPa * (units.fs**2)
        )
        # NOTE(review): no calculator is attached to `atoms` in this cell;
        # presumably DataCollectionContext provides it and collects the
        # training structures during the run - confirm against its docs.
        with DataCollectionContext(md=md, interval=100, max_samples=50):
            md.run(steps=20000)

        clear_output()
    # Called once per iteration, after its five MD runs.
    active_learning.update()

In [None]:
# Print the MD statistics reported by the active-learning driver.
active_learning.print_md_statistics()

## 4. Post training

In [None]:
from light_pfp_autogen.utils import submit_training_job, check_training_job_status, estimate_epoch


# Choose the number of training epochs from the datasets gathered so far.
# NOTE(review): the meaning of the second argument (2.0) is not visible in this
# notebook - confirm against the estimate_epoch documentation.
epoch = estimate_epoch(active_learning.datasets_list, 2.0)

# Same max_forces and pretrained model as in the active-learning config,
# plus the estimated total epoch count.
train_config_dict = {
    "common_config": {
        "total_epoch": epoch,
        "max_forces": 50.0
    },
    "mtp_config": {
        "pretrained_model": "ORGANIC_SMALL_NN"
    },
}


train_config = TrainConfig.from_dict(
    train_config_dict
)

# Submit the final training job on all active-learning datasets; the returned
# model_id is used later to build the LightPFP calculator.
model_id = submit_training_job(
    train_config,
    active_learning.datasets_list,
    "mof-co2-test-1-small-final",
)

# The status is queried once, right after submission; this cell does not wait
# for the job to finish.
status = check_training_job_status(model_id)
print(f"Training job {model_id} status: {status}")

## 5. PFP MD simulation for evaluation
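* n_CO2: 8
* temperature: 300, 340, 400 K
* 1 fs/step, 250,000 steps
* 2x2x2 supercell
* The cells in this section are commented out; uncomment and run them to produce the `pfp_md/*.traj` files that Section 7 reads.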

In [None]:
# def md_simulation(atoms, temperature, name):
#     calc = ASECalculator(Estimator(model_version="v7.0.0", calc_mode="crystal_u0_plus_d3"))
#     atoms.calc = calc
#     MaxwellBoltzmannDistribution(atoms, temperature_K=temperature)    
#     traj = Trajectory(f"pfp_md/{name}.traj", "w", atoms = atoms)

#     md = NPT(
#         atoms, 
#         units.fs, 
#         temperature_K=temperature, 
#         externalstress=units.bar,
#         mask=np.eye(3),
#         ttime=20.0 * units.fs,
#         pfactor=2e6 * units.GPa * (units.fs**2)
#     )
#     md.attach(traj, interval=10000)
#     md.run(steps=250000)

In [None]:
# md_dir = Path("pfp_md")
# md_dir.mkdir(exist_ok=True)

# mof = read("init_structure/MOF5.cif")

# n_CO2_list = [8]
# temperature_list = [300, 340, 400]
# param_list = []

# for n_CO2 in n_CO2_list:
#     for temperature in temperature_list:
#         name = f"MOF_{n_CO2}CO2_{temperature}K"
#         atoms = add_multi_CO2(mof, n_CO2) * (2, 2, 2)
#         param_list.append((atoms, temperature, name))

In [None]:
# from joblib.parallel import delayed, Parallel

# Parallel(n_jobs=3)(delayed(md_simulation)(*params) for params in param_list)

## 6. Light PFP MD simulation for evaluation

In [None]:
import numpy as np
from pathlib import Path
from ase import Atoms
from ase import units
from ase.io import read, Trajectory
from ase.md.npt import NPT
from ase.md.velocitydistribution import MaxwellBoltzmannDistribution
from light_pfp_client.estimator import Estimator
from light_pfp_client.ase_calculator import ASECalculator

In [None]:
def md_simulation(atoms, temperature, name, steps, out_dir="light_pfp_md", interval=10000):
    """Run an NPT MD simulation and save snapshots to ``<out_dir>/<name>.traj``.

    The calculator must already be attached to ``atoms`` by the caller; this
    function does not set one. ``atoms`` is modified in place: velocities are
    drawn from a Maxwell-Boltzmann distribution and the structure is then
    propagated by the MD run.

    Args:
        atoms: Structure to simulate, with a calculator attached.
        temperature: Temperature in K, used both for the initial velocities
            and as the target temperature of the NPT dynamics.
        name: Base name of the trajectory file (without the ``.traj`` suffix).
        steps: Number of MD steps to run (1 fs per step).
        out_dir: Directory that receives the trajectory file; created if it
            does not exist. Defaults to ``"light_pfp_md"``.
        interval: Number of MD steps between saved snapshots.
    """
    out_path = Path(out_dir)
    out_path.mkdir(parents=True, exist_ok=True)

    MaxwellBoltzmannDistribution(atoms, temperature_K=temperature)

    # 1 fs time step, 1 bar external stress.
    md = NPT(
        atoms,
        units.fs,
        temperature_K=temperature,
        externalstress=units.bar,
        mask=np.eye(3),
        ttime=20.0 * units.fs,
        pfactor=2e6 * units.GPa * (units.fs**2)
    )

    # The context manager closes the trajectory file even if the run fails,
    # so the snapshots written so far are not left in an open file handle.
    with Trajectory(str(out_path / f"{name}.traj"), "w", atoms=atoms) as traj:
        md.attach(traj, interval=interval)
        md.run(steps=steps)

In [None]:
# md_simulation saves its trajectories under "light_pfp_md", so make sure the
# directory exists before the runs start.
md_dir = Path("light_pfp_md")
md_dir.mkdir(exist_ok=True)

# Estimator / ASECalculator here are the light_pfp_client classes imported in
# this section; model_id is the model submitted for training in section 4.
calc = ASECalculator(Estimator(model_id=model_id))

mof = read("init_structure/MOF5.cif")

n_CO2_list = [8]
temperature_list = [300, 340, 400]

for n_CO2 in n_CO2_list:
    for temperature in temperature_list:
        name = f"MOF_{n_CO2}CO2_{temperature}K"
        # Load one unit cell with CO2, then repeat it into a 2x2x2 supercell.
        atoms = add_multi_CO2(mof, n_CO2) * (2, 2, 2)
        atoms.calc = calc
        # 250,000 steps at 1 fs per step.
        md_simulation(atoms, temperature, name, steps=250000)

## 7. Evaluation

* My results:

**Comparison of mean square displacement**

<img src="assets/msd.png" width="500">


**Comparison of diffusion coefficient**

<img src="assets/diffusivity.png" width="500">


**CO2 diffusion coefficient (m^2/s)**
| temperature | PFP | LightPFP |
| - | - | - |
| 300 | 2.519 | 2.567 | 
| 340 | 3.185 | 3.070 |
| 400 | 3.736 | 3.920 |

In [None]:
from pathlib import Path

# Folder that receives the figures produced in this section.
eval_dir = Path("evaluate")
eval_dir.mkdir(exist_ok=True)

# Evaluation temperatures and the matching trajectory base names (8 CO2 runs).
temperature_list = [300, 340, 400]
name_list = [f"MOF_8CO2_{temperature}K" for temperature in temperature_list]

In [None]:
from ase.io import Trajectory
import numpy as np


def get_CO2(atoms, n_framework_atoms=424, n_cells=8):
    """Select the guest (CO2) atoms from a supercell of a loaded unit cell.

    ``atoms`` is expected to consist of ``n_cells`` consecutive copies of one
    unit cell, each copy listing its ``n_framework_atoms`` framework atoms
    first and the guest atoms after them. That is the layout produced in this
    notebook by appending CO2 to the MOF and then repeating the cell.

    Args:
        atoms: Supercell structure; must support ``len()`` and indexing with a
            list of integer indices.
        n_framework_atoms: Number of framework atoms per unit cell that precede
            the guest atoms. The default 424 is the value this notebook uses
            for its MOF-5 cell; adjust it for a different framework.
        n_cells: Number of unit cells in the supercell (8 for 2x2x2).

    Returns:
        ``atoms`` restricted to the guest atoms, ordered cell by cell.

    Raises:
        ValueError: If ``len(atoms)`` is not a multiple of ``n_cells``; the
            per-cell layout assumed above would then not hold.
    """
    n_atoms_cell, remainder = divmod(len(atoms), n_cells)
    if remainder:
        raise ValueError(
            f"Number of atoms ({len(atoms)}) is not a multiple of "
            f"n_cells ({n_cells})."
        )
    ind = [
        cell * n_atoms_cell + i
        for cell in range(n_cells)
        for i in range(n_framework_atoms, n_atoms_cell)
    ]
    return atoms[ind]

def get_msd(traj):
    """Return the mean square displacement of the CO2 atoms for each frame lag.

    Element ``k`` of the returned list is the MSD for a lag of ``k + 1``
    frames, averaged over all CO2 atoms and over every pair of frames
    separated by that lag. A lag of zero frames is not included.
    """
    positions = np.array([get_CO2(frame).get_positions() for frame in traj])
    n_frames = len(positions)

    msd = []
    for lag in range(1, n_frames):
        # Displacement of every atom between all frame pairs `lag` frames apart.
        displacement = positions[lag:] - positions[:-lag]
        msd.append(np.mean(np.sum(displacement**2, axis=2)))
    return msd

def get_diffusion_coef(msd, time_interval):
    """Estimate the diffusion coefficient from a linear fit of the MSD.

    The MSD is fitted with a straight line against time and the slope is
    divided by 6 (Einstein relation in three dimensions, MSD = 6 D t). Only the
    slope is used, so a constant offset of the time axis has no effect.

    Args:
        msd: MSD values at equally spaced lag times. The unit conversion below
            assumes Angstrom^2, as plotted elsewhere in this notebook.
        time_interval: Time between consecutive MSD values. The unit
            conversion below assumes femtoseconds.

    Returns:
        The diffusion coefficient in m^2/s, given the input units above.
    """
    time = np.arange(len(msd)) * time_interval
    slope = np.polyfit(time, msd, 1)[0]
    # Angstrom^2/fs -> m^2/s: 1e-20 m^2 / 1e-15 s = 1e-5.
    D = slope / 6 * 1e-5  # m^2/s
    return D

In [None]:
# MSD curves, keyed as msd_dict[method][n_CO2][temperature].
msd_dict = {}

# The loop variable is called `method` rather than `calc`: this cell runs at
# notebook top level, and a loop variable named `calc` would overwrite the
# calculator object of the same name with a string.
for method in ["pfp", "light_pfp"]:
    msd_dict[method] = {}
    for n in [8]:
        msd_dict[method][n] = {}
        for temperature in [300, 340, 400]:
            traj_name = f"{method}_md/MOF_{n}CO2_{temperature}K.traj"
            # Close each trajectory file as soon as its MSD has been computed.
            with Trajectory(traj_name) as traj:
                msd_dict[method][n][temperature] = get_msd(traj)

In [None]:
import matplotlib.pyplot as plt

# Plot the first 10 MSD values of both methods: solid lines for PFP, dashed
# lines for LightPFP, one colour per temperature.
for n in [8]:
    for temperature, c in zip([300, 340, 400], ["r", "b", "g"]):
        for method, label, ls in [("pfp", "PFP", "-"), ("light_pfp", "LightPFP", "--")]:
            msd = msd_dict[method][n][temperature][:10]
            # get_msd starts at a lag of one frame, and frames are saved every
            # 10 ps (10000 steps of 1 fs), so the first value belongs at 10 ps.
            time = np.arange(1, len(msd) + 1) * 10
            plt.plot(time, msd, label=f"{label}, {temperature}K", c=c, ls=ls)

plt.legend()
plt.xlabel("time (ps)")
plt.ylabel("MSD (A^2)")
plt.savefig(eval_dir / "msd.png")

In [None]:
# Diffusion coefficients, keyed as D_dict[method][n_CO2][temperature].
D_dict = {}

# The loop variable is called `method` rather than `calc`: this cell runs at
# notebook top level, and a loop variable named `calc` would overwrite the
# calculator object of the same name with a string.
for method in ["pfp", "light_pfp"]:
    D_dict[method] = {}
    for n in [8]:
        D_dict[method][n] = {}
        for temperature in [300, 340, 400]:
            msd = msd_dict[method][n][temperature]
            # Fit the first 10 MSD values; frames are 10000 fs apart.
            D_dict[method][n][temperature] = get_diffusion_coef(msd[:10], time_interval=10000)

In [None]:
# Temperatures and the matching diffusion coefficients of the 8-CO2 runs.
categories = list(D_dict['pfp'][8].keys())
pfp_values = list(D_dict['pfp'][8].values())
light_pfp_values = list(D_dict['light_pfp'][8].values())

# Grouped bar chart: the LightPFP bars sit one bar width to the right of PFP.
bar_width = 0.35
r1 = np.arange(len(categories))
r2 = [x + bar_width for x in r1]

plt.figure(figsize=(10, 6))
plt.bar(r1, pfp_values, color='b', width=bar_width, edgecolor='grey', label='pfp')
plt.bar(r2, light_pfp_values, color='r', width=bar_width, edgecolor='grey', label='light_pfp')

plt.xlabel('Temperature (K)', fontweight='bold')
# Centre each tick between the two bars of its group.
plt.xticks([r + bar_width/2 for r in range(len(categories))], categories)
plt.ylabel('Diffusion coefficient (m2/s)', fontweight='bold')
plt.title('Diffusion coefficient of CO2 in MOF-5')

plt.legend()
# Save before plt.show(): once the figure has been shown, a following savefig
# typically writes an empty image.
plt.savefig(eval_dir/"diffusivity.png")
plt.show()

## 8. Long MD simulation with LightPFP

In [None]:
# Production trajectories get their own directory.
md_dir = Path("production")
md_dir.mkdir(exist_ok=True)

calc = ASECalculator(Estimator(model_id=model_id))

mof = read("init_structure/MOF5.cif")

n_CO2_list = [8, 1, 4, 12, 16]
temperature_list = [300, 340, 400, 320, 360, 380]

# NOTE(review): md_simulation requires a step count, and the intended length of
# the production runs is not recorded in this notebook. The evaluation run
# length (250,000 steps) is used as a placeholder - set the desired value here.
production_steps = 250000

for n_CO2 in n_CO2_list:
    for temperature in temperature_list:
        name = f"MOF_{n_CO2}CO2_{temperature}K"
        atoms = add_multi_CO2(mof, n_CO2) * (2, 2, 2)
        atoms.calc = calc

        # The MD is set up here instead of calling
        # md_simulation(atoms, temperature, name, steps): that call would save
        # to light_pfp_md/<name>.traj and overwrite the evaluation trajectories
        # that share the same names. The MD settings are the same.
        MaxwellBoltzmannDistribution(atoms, temperature_K=temperature)
        md = NPT(
            atoms,
            units.fs,
            temperature_K=temperature,
            externalstress=units.bar,
            mask=np.eye(3),
            ttime=20.0 * units.fs,
            pfactor=2e6 * units.GPa * (units.fs**2)
        )
        # The context manager closes the trajectory file after each run.
        with Trajectory(str(md_dir / f"{name}.traj"), "w", atoms=atoms) as traj:
            md.attach(traj, interval=10000)
            md.run(steps=production_steps)