In [2]:
from __future__ import annotations
import os
from tqdm import tqdm
import warnings

from monty.serialization import loadfn
from pymongo import MongoClient

# Ignore every warning category for the rest of the session. Presumably this is
# to keep the notebook output uncluttered; note that it also hides deprecation
# warnings emitted by the libraries imported above.
warnings.simplefilter('ignore')

In [3]:
# Connect using MongoClient's default arguments (no host/port is given here).
client = MongoClient()
# Handle to the "matpes" database. make_db() below reads this module-level name
# and writes one collection per functional into it.
db = client["matpes"]

In [4]:
def _entry_to_record(matpesid, entry):
    """
    Convert one raw training-data entry into a MongoDB-ready document.

    Args:
        matpesid (str): Key of the entry in the source file; stored as `matpesid`.
        entry (dict): Raw entry. Must contain `structure` (an object exposing
            `composition`, `len()` and `as_dict()`; presumably a pymatgen
            Structure decoded by `loadfn`) and `formation_energy`.

    Returns:
        dict: A new document. `entry` itself is not modified.

    Raises:
        KeyError: If `structure` or `formation_energy` is missing from `entry`.
    """
    # Dict union builds a fresh dict, so the caller's entry is left untouched.
    record = {"matpesid": matpesid} | entry
    structure = record["structure"]
    comp = structure.composition

    record["natoms"] = len(structure)
    # Sorted because the source is a set: without it the stored array order
    # would be arbitrary and could differ between rebuilds of the database.
    record["elements"] = sorted(comp.chemical_system_set)
    record["nelements"] = len(record["elements"])
    record["chemsys"] = comp.chemical_system
    record["formula"] = comp.reduced_formula
    record["composition"] = {el.symbol: amt for el, amt in comp.items()}
    # Replace the structure object by its plain-dict form so it can be stored.
    record["structure"] = structure.as_dict()

    # Stored under a more explicit name; the value itself is not changed.
    record["formation_energy_per_atom"] = record.pop("formation_energy")
    return record


def make_db(functional, path=None):
    """
    Create and populate a MongoDB collection with the training data of one functional.

    The function reads a JSON file containing training data for the given `functional`,
    augments every entry with composition-derived fields, replaces the contents of the
    collection named after `functional` with the processed entries, and creates indexes
    on the key query fields.

    Args:
        functional (str): The name of the functional. It is used both to build the
            default file name and as the MongoDB collection name. The file name is
            built verbatim from this string, so the case must match the file on disk.
        path (str | os.PathLike | None): Optional location of the data file. `~` is
            expanded. When omitted, the file
            `~/Desktop/2024_11_18_MatPES-20240214-{functional}-training-data.json.gz`
            is used, which is specific to the original author's machine.

    Raises:
        FileNotFoundError: Expected when the data file does not exist (propagated
            from `loadfn`).
        KeyError: If an entry lacks `structure` or `formation_energy`.

    Example:
        To create the collection and indexes for the functional 'PBE', run:

        ```python
        make_db("PBE")
        ```

    Data Pipeline:
    -------------
    1. Load Data:
        - Reads the (gzipped JSON) file with `loadfn`; the result is expected to be a
          mapping of identifier -> entry.

    2. Extract & Process Fields:
        - Each entry is copied and extended with the fields listed below. Any other
          keys present in the source entry are kept unchanged.

    3. Store Data in MongoDB:
        - Deletes all existing records in the collection corresponding to `functional`.
        - Inserts the processed records (skipped when the file contains no entries,
          because `insert_many` rejects an empty list).

    4. Create Indexes:
        - One index each on `natoms`, `elements`, `nelements`, `chemsys`, `formula`
          and `matpesid`.

    MongoDB Collection Structure:
    -----------------------------
    Each record in the MongoDB collection has the following fields:

    - matpesid: str
        Identifier of the entry (its key in the source file).
    - natoms: int
        Number of atoms in the structure.
    - elements: List[str]
        Distinct chemical elements in the structure, sorted alphabetically.
    - nelements: int
        Number of distinct chemical elements.
    - chemsys: str
        Chemical system string as given by the composition (e.g., 'H-O').
    - formula: str
        The reduced chemical formula of the material (e.g., 'H2O').
    - composition: Dict[str, float]
        Element-symbol-to-amount mapping of the structure (e.g., `{"H": 2, "O": 1}`).
    - formation_energy_per_atom: float
        The value of the source entry's `formation_energy` field, renamed.
    - structure: dict
        The structure in dictionary format (`structure.as_dict()`).

    Notes:
    ------
    - A valid MongoDB connection must be available as the module-level `db` before
      calling this function.
    - Existing records are deleted before the new ones are inserted, so a failure
      during insertion leaves the collection empty or partially filled.

    """
    if path is None:
        path = f"~/Desktop/2024_11_18_MatPES-20240214-{functional}-training-data.json.gz"
    raw = loadfn(os.path.expanduser(path))

    # tqdm only wraps the iterable to show a progress bar.
    data = [_entry_to_record(k, v) for k, v in tqdm(raw.items())]

    # Get collection from DB and clear old data
    collection = db[functional]
    collection.delete_many({})

    # insert_many raises on an empty document list; an empty source file should
    # simply result in an empty (but indexed) collection.
    if data:
        collection.insert_many(data)

    # Create indexes for optimized query performance
    for field in ("natoms", "elements", "nelements", "chemsys", "formula", "matpesid"):
        collection.create_index(field)


In [None]:
# Build (or rebuild) the "PBE" collection from the PBE training-data file.
make_db("PBE")

 62%|█████████████████████████████████████████████████▎                             | 271161/434712 [01:07<01:16, 2149.16it/s]

In [None]:
# Build (or rebuild) the "r2SCAN" collection from the r2SCAN training-data file.
make_db("r2SCAN")