In [None]:
import os
import dotenv
from tqdm import tqdm
import numpy as np
import pandas as pd
from mp_api.client import MPRester


# Pull in any variables defined in a local .env file, then read the
# Materials Project API key from the process environment.
# `mp_api_key` is None when MP_API_KEY is not set.
dotenv.load_dotenv()
mp_api_key = os.environ.get("MP_API_KEY")

In [None]:
# Filters passed to the Materials Project summary search:
#   - num_sites:          between 0 and 120 sites per structure
#   - energy_above_hull:  between 0 and 0.1 (units as defined by the API;
#                         presumably eV/atom -- confirm against the MP docs)
#   - theoretical:        False, i.e. only entries not flagged as theoretical
search_filters = {
    "num_sites": [0, 120],
    "energy_above_hull": [0, 0.1],
    "theoretical": False,
}

# Only these fields are requested for each returned summary document.
requested_fields = [
    "material_id",
    "structure",
    "energy_above_hull",
    "band_gap",
    "theoretical",
]

# The client is used as a context manager so it is closed after the query.
with MPRester(mp_api_key) as mpr:
    docs = mpr.materials.summary.search(fields=requested_fields, **search_filters)
print(f"Found {len(docs)} materials")

In [None]:
# Element symbols treated as "gases" by this notebook. A structure whose
# composition consists of exactly one of these elements is skipped when the
# dataset rows are built.
# NOTE(review): "Fr" (francium) is an alkali metal rather than a gas --
# confirm it is meant to be in this list.
excluded_gas_list = "H He N O F Ne Cl Ar Kr Xe Rn Fr Og".split()

In [None]:
# Build one record per retained material. A document is dropped when
#   (a) it is a single-element structure whose element is in
#       `excluded_gas_list`, or
#   (b) its longest lattice vector exceeds 50 (length units as reported by
#       the structure's lattice; presumably Angstrom -- confirm).
# Documents dropped by rule (b) are printed so they can be inspected.
data = []
for doc in tqdm(docs):
    structure = doc.structure
    # Sanity check mirroring the num_sites upper bound used in the query.
    assert structure.num_sites <= 120
    symbols = [element.symbol for element in structure.composition.elements]

    is_excluded_elemental = len(symbols) == 1 and symbols[0] in excluded_gas_list
    if is_excluded_elemental:
        continue

    cell_lengths = structure.lattice.abc
    if max(cell_lengths) > 50:
        print(structure.formula, cell_lengths)
        continue

    # The structure is stored as CIF text alongside the scalar properties.
    data.append(
        {
            "material_id": doc.material_id,
            "energy_above_hull": doc.energy_above_hull,
            "band_gap": doc.band_gap,
            "cif": structure.to(fmt="cif"),
        }
    )

In [None]:
# Assemble the records into a frame, keep one row per material_id, and
# shuffle deterministically (seed 42) with a fresh 0..n-1 index.
df_mp = (
    pd.DataFrame(data)
    .drop_duplicates(subset="material_id")
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)

# 80% train / 10% validation / remainder test, taken in shuffled order.
# Sizes are truncated to integers, so the test split absorbs any leftover rows.
train_size = int(len(df_mp) * 0.8)
val_size = int(len(df_mp) * 0.1)
val_end = train_size + val_size

df_mp_train = df_mp.iloc[:train_size]
df_mp_val = df_mp.iloc[train_size:val_end]
df_mp_test = df_mp.iloc[val_end:]
print(f"Train: {len(df_mp_train)}, Val: {len(df_mp_val)}, Test: {len(df_mp_test)}")

# Write each split to a CSV in the current working directory, without the index.
for csv_name, split_frame in (
    ("train.csv", df_mp_train),
    ("val.csv", df_mp_val),
    ("test.csv", df_mp_test),
):
    split_frame.to_csv(csv_name, index=False)