Skip to content

Commit

Permalink
Db changes (#214)
Browse files Browse the repository at this point in the history
* Deprecate ValueMongoDb arguments. (#189)

* Deprecated lru_cache_size argument.

* updated tests

* Added mongodb to CI

* Removed unused variable.

* made the test database name more unique

* Deprecate MoleculeMongoDb arguments. (#190)

* Deprecate ConstructedMoleculeMongoDb arguments. (#191)

* put() should update all matching entries. (#192)

* put() should update all matching entries. (#193)

* put() should update all matching entries. (#194)

* Removed unused mock MongoClient. (#195)

* Added a test to make sure caching is working (#196)

* Added a test to make sure caching is working. (#197)

* Added a test to make sure caching is working. (#198)

* Added utilities for tracking MongoDB state. (#199)

* Added a test to make sure database is updating. (#200)

* Added a test to make sure database is updating. (#201)

* Added a test to make sure database is updating. (#202)
  • Loading branch information
lukasturcani committed Jul 29, 2020
1 parent ebf876d commit e5c267a
Show file tree
Hide file tree
Showing 23 changed files with 1,639 additions and 121 deletions.
2 changes: 2 additions & 0 deletions .travis.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,5 +15,7 @@ install:
- conda env create -f ./tests/environment.yml
- conda activate stk_test

services: mongodb

script:
- pytest
201 changes: 176 additions & 25 deletions src/stk/databases/mongo_db/constructed_molecule.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

from functools import lru_cache
import warnings

from stk.serialization import (
ConstructedMoleculeJsonizer,
Expand Down Expand Up @@ -191,9 +192,14 @@ def __init__(
molecule_collection='molecules',
constructed_molecule_collection='constructed_molecules',
position_matrix_collection='position_matrices',
building_block_position_matrix_collection=(
'building_block_position_matrices'
),
jsonizer=ConstructedMoleculeJsonizer(),
dejsonizer=ConstructedMoleculeDejsonizer(),
lru_cache_size=128,
lru_cache_size='',
put_lru_cache_size=128,
get_lru_cache_size=128,
indices=('InChIKey', ),
):
"""
Expand Down Expand Up @@ -221,6 +227,11 @@ def __init__(
matrices of the molecules put into and retrieved from
the database.
building_block_position_matrix_collection : :class:`str`
The name of the collection, which stores the position
matrices of the building blocks of the constructed
molecules put into and retrieved from the database.
jsonizer : :class:`.ConstructedMoleculeJsonizer`
Used to create the JSON representations of molecules
stored in the database.
Expand All @@ -230,9 +241,26 @@ def __init__(
JSON representations.
lru_cache_size : :class:`int`, optional
This argument is deprecated and will be removed in any
version of :mod:`stk` released on, or after, 01/01/21.
Use the `put_lru_cache_size` and `get_lru_cache_size`
arguments instead.
A RAM-based least recently used cache is used to avoid
reading and writing to the database repeatedly. This sets
the number of molecules which fit into the LRU cache. If
the number of values which fit into the LRU cache. If
``None``, the cache size will be unlimited.
put_lru_cache_size : :class:`int`, optional
A RAM-based least recently used cache is used to avoid
writing to the database repeatedly. This sets
the number of values which fit into the LRU cache. If
``None``, the cache size will be unlimited.
get_lru_cache_size : :class:`int`, optional
A RAM-based least recently used cache is used to avoid
reading from the database repeatedly. This sets
the number of values which fit into the LRU cache. If
``None``, the cache size will be unlimited.
indices : :class:`tuple` of :class:`str`, optional
Expand All @@ -241,17 +269,31 @@ def __init__(
"""

if lru_cache_size != '':
warnings.warn(
'The lru_cache_size argument is deprecated and will '
'be removed in any version of stk released on, or '
'after, 01/01/21. Use the put_lru_cache_size and '
'get_lru_cache_size arguments instead.',
FutureWarning,
)
put_lru_cache_size = lru_cache_size
get_lru_cache_size = lru_cache_size

database = mongo_client[database]
self._molecules = database[molecule_collection]
self._constructed_molecules = database[
constructed_molecule_collection
]
self._position_matrices = database[position_matrix_collection]
self._building_block_position_matrices = database[
building_block_position_matrix_collection
]
self._jsonizer = jsonizer
self._dejsonizer = dejsonizer

self._get = lru_cache(maxsize=lru_cache_size)(self._get)
self._put = lru_cache(maxsize=lru_cache_size)(self._put)
self._get = lru_cache(maxsize=get_lru_cache_size)(self._get)
self._put = lru_cache(maxsize=put_lru_cache_size)(self._put)

for index in indices:
# Do not create the same index twice.
Expand All @@ -268,6 +310,16 @@ def __init__(
):
self._position_matrices.create_index(index)

if (
f'{index}_1'
not in
self._building_block_position_matrices
.index_information()
):
self._building_block_position_matrices.create_index(
index,
)

def put(self, molecule):
molecule = molecule.with_canonical_atom_ordering()
json = self._jsonizer.to_json(molecule)
Expand Down Expand Up @@ -300,32 +352,130 @@ def make_hashable(json):
))
return self._put(HashableDict(json))

@staticmethod
def _get_query(json):
keys = dict(json['matrix'])
keys.pop('m')

query = {'$or': []}
for key, value in keys.items():
query['$or'].append({key: value})
return query

def _put(self, json):
# insert_one() corrupts the state of the dict it is passed
# as an argument (it adds various items to it).
# Using insert_one(json['molecule']) would mean that the json
# in the lru_cache is modified with some extra items added by
# insert_one(). This means that the next time _put() is used
# with a clean json, it will not match the one in the cache,
# because the one in the cache has the extra items added by
# insert_one(). To prevent this use
# insert_one(dict(json['molecule'])), which means that a copy
# is modified by insert_one and the json in the cache is
# not changed.

self._position_matrices.insert_one(dict(json['matrix']))
self._molecules.insert_one(dict(json['molecule']))
self._constructed_molecules.insert_one(
document=dict(json['constructedMolecule']),
query = self._get_query(json)
self._molecules.update_many(
filter=query,
update={
'$set': json['molecule'],
},
upsert=True,
)
self._position_matrices.update_many(
filter=query,
update={
'$set': json['matrix'],
},
upsert=True,
)

self._add_building_block_keys_from_database(
query=query,
building_block_keys=json['constructedMolecule']['BB'],
)

self._constructed_molecules.update_many(
filter=query,
update={
'$set': json['constructedMolecule'],
},
upsert=True,
)
for building_block_json in json['buildingBlocks']:
self._molecules.insert_one(
document=dict(building_block_json['molecule']),
building_block_query = self._get_query(building_block_json)
self._molecules.update_many(
filter=building_block_query,
update={
'$set': building_block_json['molecule'],
},
upsert=True,
)
self._position_matrices.insert_one(
document=dict(building_block_json['matrix']),
self._building_block_position_matrices.update_many(
filter=building_block_query,
update={
'$set': building_block_json['matrix'],
},
upsert=True,
)

def _add_building_block_keys_from_database(
self,
query,
building_block_keys,
):
"""
Add previously deposited keys to `building_block_keys`.
Checks the constructed molecule collection to find all
constructed molecule entries which match `query`. All matches
should merely be duplicate entries for the same constructed
molecule.
Each entry for the constructed molecule will have a
:class:`list` of building blocks, which were used to
construct the constructed molecule.
A building block is represented in this :class:`list` through
a :class:`dict`, which maps the name of a molecular key
(like "SMILES" or "InChIKey") to the appropriate value for that
building block.
The database may have multiple different dictionaries for
the same building block, because each dictionary may hold
different molecular keys. These differing dictionaries will be
spread across the constructed molecule entries.
The various different dictionaries, which all represent
the same building block, are merged by this method. The merged
dictionary is the one held in `building_block_keys`, which is
updated in-place.
This means that when `building_block_keys` is used to replace
an entry in the database, it does not remove any building
block keys already there.
Parameters
----------
query : :class:`dict`
A query which matches entries, corresponding to a
single constructed molecule.
building_block_keys : :class:`list` of :class:`dict`
Each :class:`dict` represents a building block of the
constructed molecule matched by `query`. The :class:`dict`
holds the name of a molecular key and its value for that
particular building block. Key-value pairs for building
block molecular keys already found in the database are
added to the dictionaries by this method.
Returns
-------
None : :class:`NoneType`
"""

database_building_block_keys = (
molecule_entry['BB']
for molecule_entry
in self._constructed_molecules.find(query)
)
for entry_building_block_keys in database_building_block_keys:
for keys1, keys2 in zip(
building_block_keys,
entry_building_block_keys,
):
keys1.update(keys2)

def get(self, key):
# lru_cache requires that the parameters to the cached function
# are hashable objects.
Expand Down Expand Up @@ -386,5 +536,6 @@ def _get(self, key):
def _get_building_block(self, key):
return {
'molecule': self._molecules.find_one(key),
'matrix': self._position_matrices.find_one(key),
'matrix':
self._building_block_position_matrices.find_one(key),
}
74 changes: 56 additions & 18 deletions src/stk/databases/mongo_db/molecule.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
"""

from functools import lru_cache
import warnings

from stk.serialization import (
MoleculeJsonizer,
Expand Down Expand Up @@ -187,7 +188,9 @@ def __init__(
position_matrix_collection='position_matrices',
jsonizer=MoleculeJsonizer(),
dejsonizer=MoleculeDejsonizer(),
lru_cache_size=128,
lru_cache_size='',
put_lru_cache_size=128,
get_lru_cache_size=128,
indices=('InChIKey', ),
):
"""
Expand Down Expand Up @@ -219,9 +222,26 @@ def __init__(
JSON representations.
lru_cache_size : :class:`int`, optional
This argument is deprecated and will be removed in any
version of :mod:`stk` released on, or after, 01/01/21.
Use the `put_lru_cache_size` and `get_lru_cache_size`
arguments instead.
A RAM-based least recently used cache is used to avoid
reading and writing to the database repeatedly. This sets
the number of molecules which fit into the LRU cache. If
the number of values which fit into the LRU cache. If
``None``, the cache size will be unlimited.
put_lru_cache_size : :class:`int`, optional
A RAM-based least recently used cache is used to avoid
writing to the database repeatedly. This sets
the number of values which fit into the LRU cache. If
``None``, the cache size will be unlimited.
get_lru_cache_size : :class:`int`, optional
A RAM-based least recently used cache is used to avoid
reading from the database repeatedly. This sets
the number of values which fit into the LRU cache. If
``None``, the cache size will be unlimited.
indices : :class:`tuple` of :class:`str`, optional
Expand All @@ -230,14 +250,25 @@ def __init__(
"""

if lru_cache_size != '':
warnings.warn(
'The lru_cache_size argument is deprecated and will '
'be removed in any version of stk released on, or '
'after, 01/01/21. Use the put_lru_cache_size and '
'get_lru_cache_size arguments instead.',
FutureWarning,
)
put_lru_cache_size = lru_cache_size
get_lru_cache_size = lru_cache_size

database = mongo_client[database]
self._molecules = database[molecule_collection]
self._position_matrices = database[position_matrix_collection]
self._jsonizer = jsonizer
self._dejsonizer = dejsonizer

self._get = lru_cache(maxsize=lru_cache_size)(self._get)
self._put = lru_cache(maxsize=lru_cache_size)(self._put)
self._get = lru_cache(maxsize=get_lru_cache_size)(self._get)
self._put = lru_cache(maxsize=put_lru_cache_size)(self._put)

for index in indices:
# Do not create the same index twice.
Expand All @@ -262,20 +293,27 @@ def put(self, molecule):
return self._put(HashableDict(json))

def _put(self, json):
# insert_one() corrupts the state of the dict it is passed
# as an argument (it adds various items to it).
# Using insert_one(json['molecule']) would mean that the json
# in the lru_cache is modified with some extra items added by
# insert_one(). This means that the next time _put() is used
# with a clean json, it will not match the one in the cache,
# because the one in the cache has the extra items added by
# insert_one(). To prevent this use
# insert_one(dict(json['molecule'])), which means that a copy
# is modified by insert_one and the json in the cache is
# not changed.

self._molecules.insert_one(dict(json['molecule']))
self._position_matrices.insert_one(dict(json['matrix']))
keys = dict(json['matrix'])
keys.pop('m')

query = {'$or': []}
for key, value in keys.items():
query['$or'].append({key: value})

self._molecules.update_many(
filter=query,
update={
'$set': json['molecule'],
},
upsert=True,
)
self._position_matrices.update_many(
filter=query,
update={
'$set': json['matrix'],
},
upsert=True,
)

def get(self, key):
# lru_cache requires that the parameters to the cached function
Expand Down
Loading

0 comments on commit e5c267a

Please sign in to comment.