In [1]:
from pymatgen.entries.mol_entry import MoleculeEntry
from atomate.qchem.database import QChemCalcDb
from monty.serialization import dumpfn, loadfn

from rxnrep.dataset.electrolyte_utils import check_species, \
    check_bond_species, check_bond_length, check_connectivity, check_bad_rdkit_molecule

In [2]:
def get_db_num_entries(db_file):
    """
    Count the documents stored in the database collection.

    Args:
        db_file: path to a json file storing credentials of the database

    Returns:
        The number of documents in the collection; the empty filter used for
        counting matches every document.
    """
    collection = QChemCalcDb.from_db_file(db_file, admin=True).collection
    match_all = {}
    return collection.count_documents(match_all)

In [3]:
def query_db_entries(db_file,
                     num_entries: int = None,
                     environment: str = None):
    """
    Query the molecule document database to pull the molecules from molecule builder.

    Args:
        db_file: path to a json file storing credentials of the database
        num_entries (int): the number of entries to query, if `None`, get all.
        environment: query value for the environment key in the db. e.g. `smd_thf`. if `None`, ignore it.

    Returns:
        A list of molecule document entries.

    Raises:
        Any exception raised while fetching documents is propagated unchanged;
        the cursor is closed before it propagates.
    """

    # When no cap is requested, 0 is passed as the limit (pymongo treats a
    # limit of 0 as "no limit").
    limit = 0 if num_entries is None else num_entries
    query = {} if environment is None else {"environment": environment}

    db = QChemCalcDb.from_db_file(db_file, admin=True)
    cursor = db.collection.find(query, no_cursor_timeout=True).limit(limit)

    # The cursor is opened with `no_cursor_timeout=True`, so it must be closed
    # explicitly -- including when iteration fails part-way through.
    try:
        return list(cursor)
    finally:
        cursor.close()

In [4]:
def filter_mol_entries(entries, verbose=False):
    """
    Keep only the molecule entries that pass every check.

    An entry is dropped as soon as one check reports a failure; the remaining
    checks are not run for it. The checks are applied in this order: species,
    bond species, bond length, connectivity, and bad rdkit molecule.

    Args:
        entries: iterable of molecule entries to screen.
        verbose: if `True`, print the position of each dropped entry together
            with the comment returned by the failing check.

    Returns:
        A list of the entries that passed all checks, in their original order.
    """

    banned_species = ['P']

    # Ordered pipeline of checks; each one returns a `(fail, comment)` pair.
    checks = (
        lambda mol: check_species(mol, species=banned_species),
        check_bond_species,
        check_bond_length,
        check_connectivity,
        check_bad_rdkit_molecule,
    )

    kept = []
    for idx, mol in enumerate(entries):
        for check in checks:
            failed, reason = check(mol)
            if failed:
                if verbose:
                    print(idx, reason)
                break
        else:
            # No check failed (the inner loop finished without `break`).
            kept.append(mol)

    return kept

### Query the database to get molecule documents

In [5]:
# Path to the json file storing the database credentials (machine-specific absolute path).
db_file = "/Users/mjwen/Applications/db_access/sam_db/sam_db_mol_builder.json"
# Last expression of the cell: displays the total number of documents in the collection.
get_db_num_entries(db_file)

16718

In [6]:
# Pull at most 200 molecule documents; no `environment` filter is applied.
mol_docs = query_db_entries(db_file, num_entries=200)

### Dump to file

In [7]:
# Destination file for the serialized entries (machine-specific absolute path).
fname = '/Users/mjwen/Documents/Dataset/rxnrep/mol_entries_n200.json'
# Convert each queried molecule document into a `MoleculeEntry`.
mol_entries = [MoleculeEntry.from_molecule_document(d) for d in mol_docs]

# Drop the entries rejected by `filter_mol_entries`, reporting the count before and after.
# `mol_entries` is rebound here, so from this point on it holds only the entries that passed.
print('Number of mol entries before filting:', len(mol_entries))
mol_entries = filter_mol_entries(mol_entries, verbose=False)
print('Number of mol entries after filting:', len(mol_entries))

# Serialize the filtered entries to `fname` with monty's `dumpfn`.
dumpfn(mol_entries, fname)

Number of mol entries before filting: 200
Number of mol entries after filting: 172
