In [1]:
from pymatgen.entries.mol_entry import MoleculeEntry, MoleculeEntryError
from atomate.qchem.database import QChemCalcDb
from monty.serialization import dumpfn, loadfn

from rxnrep.dataset.electrolyte_utils import check_species, check_connectivity, \
    check_bond_species, check_bond_length, check_num_bonds, check_bad_rdkit_molecule, \
    remove_high_energy_entries

from typing import List

In [2]:
def get_db_num_entries(db_file):
    """
    Count all documents stored in the database collection.

    Args:
        db_file: path to the database file handed to `QChemCalcDb.from_db_file`.

    Returns:
        The value of `count_documents({})` on the collection, i.e. the count
        obtained with an empty (match-everything) filter.
    """
    collection = QChemCalcDb.from_db_file(db_file, admin=True).collection
    return collection.count_documents({})

In [3]:
def query_db_entries(db_file,
                     num_entries: int = None,
                     environment: str = None):
    """
    Query the molecule document database to pull the molecules from molecule builder.

    Args:
        db_file: path to a json file storing credentials of the database
        num_entries (int): the number of entries to query, if `None`, get all.
        environment: query value for the environment key in the db. e.g. `smd_thf`.
            if `None`, ignore it.

    Returns:
        A list of molecule document entries.
    """

    # A cursor limit of 0 means `no limit`, which is how `None` is mapped to `get all`.
    limit = 0 if num_entries is None else num_entries
    query = {} if environment is None else {"environment": environment}

    db = QChemCalcDb.from_db_file(db_file, admin=True)
    cursor = db.collection.find(query, no_cursor_timeout=True).limit(limit)

    # The cursor is opened with `no_cursor_timeout=True`, so it must be closed
    # explicitly; do it in `finally` so that a failure while iterating does not
    # leave it open.
    try:
        entries = list(cursor)
    finally:
        cursor.close()

    return entries

In [4]:
def filter_mol_entries(entries:List[MoleculeEntry], verbose=False)->List[MoleculeEntry]:
    """
    Filter out some `bad` molecules.

    Each entry is run through an ordered series of checks; every check returns a
    `(fail, comment)` pair and an entry is dropped at the first check that fails.
    The entries that pass all checks are then handed to
    `remove_high_energy_entries`.

    Args:
        entries: molecule entries to filter.
        verbose: if `True`, print the index and the comment of each dropped entry.
            The two summary counts around `remove_high_energy_entries` are
            printed regardless.

    Returns:
        The entries returned by `remove_high_energy_entries`.
    """

    not_allowed_species = ['P']

    # Checks are applied in this order, stopping at the first failure:
    checks = (
        # mols having atoms not connected to others
        check_connectivity,
        # mols with specific species
        lambda mol: check_species(mol, species=not_allowed_species),
        # mols with specific bond between species, e.g. Li-H
        check_bond_species,
        # mols with larger bond length
        check_bond_length,
        # mols with unexpected number of bonds (e.g. more than 4 bonds for carbon),
        # excluding metal species
        check_num_bonds,
        # mols that cannot be converted to rdkit mol
        check_bad_rdkit_molecule,
    )

    kept = []
    for idx, mol in enumerate(entries):
        for check in checks:
            failed, reason = check(mol)
            if failed:
                if verbose:
                    print(idx, reason)
                break
        else:
            # no check failed for this entry
            kept.append(mol)

    # for molecules with the same isomorphism and charge, remove the ones with higher free energy
    print('Number of entries before removing isomorphic ones:', len(kept))
    kept = remove_high_energy_entries(kept)
    print('Number of entries after removing isomorphic ones:', len(kept))

    return kept

### Query db to get molecule documents 

In [5]:
# Path to the database file passed to the query helpers below.
# NOTE: this is an absolute, machine-specific path; adjust it before running elsewhere.
db_file = "/Users/mjwen/Applications/db_access/sam_db/sam_db_mol_builder.json"
# Last expression of the cell: displays the total number of documents in the collection.
get_db_num_entries(db_file)

16718

In [6]:
# Number of documents to pull. Use an integer (e.g. `num_entries = 2000`) to query
# only a subset; `None` queries everything.
num_entries = None
# No `environment` is given, so the query is not restricted by environment.
mol_docs = query_db_entries(db_file, num_entries)

### Convert to molecule entries, filter, and dump to file

In [7]:
# Convert each molecule document into a MoleculeEntry. Documents that raise
# MoleculeEntryError are skipped and counted instead of aborting the loop.
num_failed = 0
mol_entries = []
for doc in mol_docs:
    try:
        entry = MoleculeEntry.from_molecule_document(doc)
        mol_entries.append(entry)
    except MoleculeEntryError:
        num_failed += 1
print('Number of mol doc failed to be converted to mol entry:', num_failed)

# Filter the converted entries with filter_mol_entries; verbose=False suppresses
# the per-entry messages, so only the summary counts are printed.
print('Number of mol entries before filting:', len(mol_entries))
mol_entries = filter_mol_entries(mol_entries, verbose=False)
print('Number of mol entries after filting:', len(mol_entries))

# Write the filtered entries to file with dumpfn. The file name embeds
# `num_entries`, so with `num_entries = None` it ends in `_nNone.json`.
# NOTE: this is an absolute, machine-specific path; adjust it before running elsewhere.
fname = f'/Users/mjwen/Documents/Dataset/rxnrep/mol_entries_n{num_entries}.json'
dumpfn(mol_entries, fname)

Number of mol doc failed to be converted to mol entry: 64
Number of mol entries before filting: 16654


RDKit ERROR: [15:41:19] Explicit valence for atom # 3 C, 5, is greater than permitted
RDKit ERROR: [15:41:52] Explicit valence for atom # 2 C, 5, is greater than permitted
RDKit ERROR: [15:42:09] Explicit valence for atom # 3 C, 5, is greater than permitted


Number of entries before removing isomorphic ones: 13116
Number of entries after removing isomorphic ones: 7288
Number of mol entries after filting: 7288
