Db changes (#214)

* Deprecate ValueMongoDb arguments. (#189) * Deprecated lru_cache_size argument. * updated tests * Added mongodb to CI * Removed unused variable. * made the test database name more unique * Deprecate MoleculeMongoDb arguments. (#190) * Deprecate ConstructedMoleculeMongoDb arguments. (#191) * put() should update all matching entries. (#192) * put() should update all matching entries. (#193) * put() should update all matching entries. (#194) * Removed unused mock MongoClient. (#195) * Added a test to make sure caching is working (#196) * Added a test to make sure caching is working. (#197) * Added a test to make sure caching is working. (#198) * Added utilities for tracking MongoDB state. (#199) * Added a test to make sure database is updating. (#200) * Added a test to make sure database is updating. (#201) * Added a test to make sure database is updating. (#202)
lukasturcani · Jul 29, 2020 · e5c267a · e5c267a
1 parent ebf876d
commit e5c267a
Show file tree

Hide file tree

Showing 23 changed files with 1,639 additions and 121 deletions.
diff --git a/.travis.yml b/.travis.yml
@@ -15,5 +15,7 @@ install:
     - conda env create -f ./tests/environment.yml
     - conda activate stk_test
 
+services: mongodb
+
 script:
     - pytest
diff --git a/src/stk/databases/mongo_db/constructed_molecule.py b/src/stk/databases/mongo_db/constructed_molecule.py
@@ -5,6 +5,7 @@
 """
 
 from functools import lru_cache
+import warnings
 
 from stk.serialization import (
     ConstructedMoleculeJsonizer,
@@ -191,9 +192,14 @@ def __init__(
         molecule_collection='molecules',
         constructed_molecule_collection='constructed_molecules',
         position_matrix_collection='position_matrices',
+        building_block_position_matrix_collection=(
+            'building_block_position_matrices'
+        ),
         jsonizer=ConstructedMoleculeJsonizer(),
         dejsonizer=ConstructedMoleculeDejsonizer(),
-        lru_cache_size=128,
+        lru_cache_size='',
+        put_lru_cache_size=128,
+        get_lru_cache_size=128,
         indices=('InChIKey', ),
     ):
         """
@@ -221,6 +227,11 @@ def __init__(
             matrices of the molecules put into and retrieved from
             the database.
 
+        building_block_position_matrix_collection : :class:`str`
+            The name of the collection, which stores the position
+            matrices of the building blocks of the constructed
+            molecules put into and retrieved from the database.
+
         jsonizer : :class:`.ConstructedMoleculeJsonizer`
             Used to create the JSON representations of molecules
             stored in the database.
@@ -230,9 +241,26 @@ def __init__(
             JSON representations.
 
         lru_cache_size : :class:`int`, optional
+            This argument is deprecated and will be removed in any
+            version of :mod:`stk` released on, or after, 01/01/21.
+            Use the `put_lru_cache_size` and `get_lru_cache_size`
+            arguments instead.
+
             A RAM-based least recently used cache is used to avoid
             reading and writing to the database repeatedly. This sets
-            the number of molecules which fit into the LRU cache. If
+            the number of values which fit into the LRU cache. If
+            ``None``, the cache size will be unlimited.
+
+        put_lru_cache_size : :class:`int`, optional
+            A RAM-based least recently used cache is used to avoid
+            writing to the database repeatedly. This sets
+            the number of values which fit into the LRU cache. If
+            ``None``, the cache size will be unlimited.
+
+        get_lru_cache_size : :class:`int`, optional
+            A RAM-based least recently used cache is used to avoid
+            reading from the database repeatedly. This sets
+            the number of values which fit into the LRU cache. If
             ``None``, the cache size will be unlimited.
 
         indices : :class:`tuple` of :class:`str`, optional
@@ -241,17 +269,31 @@ def __init__(
 
         """
 
+        if lru_cache_size != '':
+            warnings.warn(
+                'The lru_cache_size argument is deprecated and will '
+                'be removed in any version of stk released on, or '
+                'after, 01/01/21. Use the put_lru_cache_size and '
+                'get_lru_cache_size arguments instead.',
+                FutureWarning,
+            )
+            put_lru_cache_size = lru_cache_size
+            get_lru_cache_size = lru_cache_size
+
         database = mongo_client[database]
         self._molecules = database[molecule_collection]
         self._constructed_molecules = database[
             constructed_molecule_collection
         ]
         self._position_matrices = database[position_matrix_collection]
+        self._building_block_position_matrices = database[
+            building_block_position_matrix_collection
+        ]
         self._jsonizer = jsonizer
         self._dejsonizer = dejsonizer
 
-        self._get = lru_cache(maxsize=lru_cache_size)(self._get)
-        self._put = lru_cache(maxsize=lru_cache_size)(self._put)
+        self._get = lru_cache(maxsize=get_lru_cache_size)(self._get)
+        self._put = lru_cache(maxsize=put_lru_cache_size)(self._put)
 
         for index in indices:
             # Do not create the same index twice.
@@ -268,6 +310,16 @@ def __init__(
             ):
                 self._position_matrices.create_index(index)
 
+            if (
+                f'{index}_1'
+                not in
+                self._building_block_position_matrices
+                .index_information()
+            ):
+                self._building_block_position_matrices.create_index(
+                    index,
+                )
+
     def put(self, molecule):
         molecule = molecule.with_canonical_atom_ordering()
         json = self._jsonizer.to_json(molecule)
@@ -300,32 +352,130 @@ def make_hashable(json):
         ))
         return self._put(HashableDict(json))
 
+    @staticmethod
+    def _get_query(json):
+        """
+        Create a query matching any key-value pair of a matrix JSON.
+
+        Parameters
+        ----------
+        json : :class:`dict`
+            Must hold a ``'matrix'`` item, which is copied with
+            :class:`dict` and must contain an ``'m'`` item. Every
+            other item of ``json['matrix']`` becomes one clause of
+            the returned query.
+
+        Returns
+        -------
+        :class:`dict`
+            A query of the form ``{'$or': [{key: value}, ...]}``,
+            holding one clause for each item of ``json['matrix']``
+            other than ``'m'``.
+
+        Raises
+        ------
+        :class:`KeyError`
+            If ``json['matrix']`` does not hold an ``'m'`` item.
+
+        """
+
+        # Work on a copy, so that json['matrix'] itself keeps 'm'.
+        keys = dict(json['matrix'])
+        keys.pop('m')
+
+        # NOTE(review): if 'm' is the only item, the '$or' list stays
+        # empty - confirm callers never produce that, as MongoDB is
+        # expected to reject a query with an empty '$or'.
+        query = {'$or': []}
+        for key, value in keys.items():
+            query['$or'].append({key: value})
+        return query
+
     def _put(self, json):
-        # insert_one() corrupts the state of the dict it is passed
-        # as an argument (it adds various items to it).
-        # Using insert_one(json['molecule']) would mean that the json
-        # in the lru_cache is modified with some extra items added by
-        # insert_one(). This means that the next time _put() is used
-        # with a clean json, it will not match the one in the cache,
-        # because the one in the cache has the extra items added by
-        # insert_one(). To prevent this use
-        # insert_one(dict(json['molecule'])), which means that a copy
-        # is modified by insert_one and the json in the cache is
-        # not changed.
-
-        self._position_matrices.insert_one(dict(json['matrix']))
-        self._molecules.insert_one(dict(json['molecule']))
-        self._constructed_molecules.insert_one(
-            document=dict(json['constructedMolecule']),
+        # Every write below is an update_many() with upsert=True, so
+        # that all entries matching the query are updated and, when
+        # nothing matches, a new entry is inserted instead.
+        query = self._get_query(json)
+        self._molecules.update_many(
+            filter=query,
+            update={
+                '$set': json['molecule'],
+            },
+            upsert=True,
+        )
+        self._position_matrices.update_many(
+            filter=query,
+            update={
+                '$set': json['matrix'],
+            },
+            upsert=True,
+        )
+
+        # Merge building block keys already in the database into
+        # json['constructedMolecule']['BB'], before it is written.
+        # NOTE(review): this updates a part of `json` in-place, and
+        # `json` is also the argument lru_cache uses as the key for
+        # _put() - confirm this does not prevent later cache hits.
+        self._add_building_block_keys_from_database(
+            query=query,
+            building_block_keys=json['constructedMolecule']['BB'],
+        )
+
+        self._constructed_molecules.update_many(
+            filter=query,
+            update={
+                '$set': json['constructedMolecule'],
+            },
+            upsert=True,
         )
         for building_block_json in json['buildingBlocks']:
-            self._molecules.insert_one(
-                document=dict(building_block_json['molecule']),
+            # Each building block is matched with a query made from
+            # its own matrix JSON, not the constructed molecule's.
+            building_block_query = self._get_query(building_block_json)
+            self._molecules.update_many(
+                filter=building_block_query,
+                update={
+                    '$set': building_block_json['molecule'],
+                },
+                upsert=True,
             )
-            self._position_matrices.insert_one(
-                document=dict(building_block_json['matrix']),
+            # Building block position matrices go into their own
+            # collection, separate from self._position_matrices.
+            self._building_block_position_matrices.update_many(
+                filter=building_block_query,
+                update={
+                    '$set': building_block_json['matrix'],
+                },
+                upsert=True,
             )
 
+    def _add_building_block_keys_from_database(
+        self,
+        query,
+        building_block_keys,
+    ):
+        """
+        Add previously deposited keys to `building_block_keys`.
+
+        Checks the constructed molecule collection to find all
+        constructed molecule entries which match `query`. All matches
+        should merely be duplicate entries for the same constructed
+        molecule.
+
+        Each entry for the constructed molecule will have a
+        :class:`list` of building blocks, which were used to
+        construct the constructed molecule.
+
+        A building block is represented in this :class:`list` through
+        a :class:`dict`, which maps the name of a molecular key
+        (like "SMILES" or "InChIKey") to the appropriate value for that
+        building block.
+
+        The database may have multiple different dictionaries for
+        the same building block, because each dictionary may hold
+        different molecular keys. These differing dictionaries will be
+        spread across the constructed molecule entries.
+
+        The various different dictionaries, which all represent
+        the same building block, are merged by this method. The merged
+        dictionary is the one held in `building_block_keys`, which is
+        updated in-place.
+
+        This means that when `building_block_keys` is used to replace
+        an entry in the database, it does not remove any building
+        block keys already there.
+
+        Parameters
+        ----------
+        query : :class:`dict`
+            A query which matches entries, corresponding to a
+            single constructed molecule.
+
+        building_block_keys : :class:`list` of :class:`dict`
+            Each :class:`dict` represents a building block of the
+            constructed molecule matched by `query`. The :class:`dict`
+            holds the name of a molecular key and its value for that
+            particular building block. Key-value pairs for building
+            block molecular keys already found in the database are
+            added to the dictionaries by this method.
+
+        Returns
+        -------
+        None : :class:`NoneType`
+
+        """
+
+        # The "BB" item of every constructed molecule entry which
+        # matches `query`.
+        database_building_block_keys = (
+            molecule_entry['BB']
+            for molecule_entry
+            in self._constructed_molecules.find(query)
+        )
+        for entry_building_block_keys in database_building_block_keys:
+            # Dictionaries are paired up by position, which assumes
+            # both sequences hold the building blocks in the same
+            # order - TODO confirm. zip() stops at the shorter one.
+            for keys1, keys2 in zip(
+                building_block_keys,
+                entry_building_block_keys,
+            ):
+                # On a shared key, the value from the database
+                # replaces the one in `building_block_keys`.
+                keys1.update(keys2)
+
     def get(self, key):
         # lru_cache requires that the parameters to the cached function
         # are hashable objects.
@@ -386,5 +536,6 @@ def _get(self, key):
     def _get_building_block(self, key):
+        # Look up the entries matching `key` for a building block. Its
+        # position matrix is read from the building block position
+        # matrix collection, not from self._position_matrices.
         return {
             'molecule': self._molecules.find_one(key),
-            'matrix': self._position_matrices.find_one(key),
+            'matrix':
+                self._building_block_position_matrices.find_one(key),
         }
diff --git a/src/stk/databases/mongo_db/molecule.py b/src/stk/databases/mongo_db/molecule.py
@@ -5,6 +5,7 @@
 """
 
 from functools import lru_cache
+import warnings
 
 from stk.serialization import (
     MoleculeJsonizer,
@@ -187,7 +188,9 @@ def __init__(
         position_matrix_collection='position_matrices',
         jsonizer=MoleculeJsonizer(),
         dejsonizer=MoleculeDejsonizer(),
-        lru_cache_size=128,
+        lru_cache_size='',
+        put_lru_cache_size=128,
+        get_lru_cache_size=128,
         indices=('InChIKey', ),
     ):
         """
@@ -219,9 +222,26 @@ def __init__(
             JSON representations.
 
         lru_cache_size : :class:`int`, optional
+            This argument is deprecated and will be removed in any
+            version of :mod:`stk` released on, or after, 01/01/21.
+            Use the `put_lru_cache_size` and `get_lru_cache_size`
+            arguments instead.
+
             A RAM-based least recently used cache is used to avoid
             reading and writing to the database repeatedly. This sets
-            the number of molecules which fit into the LRU cache. If
+            the number of values which fit into the LRU cache. If
+            ``None``, the cache size will be unlimited.
+
+        put_lru_cache_size : :class:`int`, optional
+            A RAM-based least recently used cache is used to avoid
+            writing to the database repeatedly. This sets
+            the number of values which fit into the LRU cache. If
+            ``None``, the cache size will be unlimited.
+
+        get_lru_cache_size : :class:`int`, optional
+            A RAM-based least recently used cache is used to avoid
+            reading from the database repeatedly. This sets
+            the number of values which fit into the LRU cache. If
             ``None``, the cache size will be unlimited.
 
         indices : :class:`tuple` of :class:`str`, optional
@@ -230,14 +250,25 @@ def __init__(
 
         """
 
+        if lru_cache_size != '':
+            warnings.warn(
+                'The lru_cache_size argument is deprecated and will '
+                'be removed in any version of stk released on, or '
+                'after, 01/01/21. Use the put_lru_cache_size and '
+                'get_lru_cache_size arguments instead.',
+                FutureWarning,
+            )
+            put_lru_cache_size = lru_cache_size
+            get_lru_cache_size = lru_cache_size
+
         database = mongo_client[database]
         self._molecules = database[molecule_collection]
         self._position_matrices = database[position_matrix_collection]
         self._jsonizer = jsonizer
         self._dejsonizer = dejsonizer
 
-        self._get = lru_cache(maxsize=lru_cache_size)(self._get)
-        self._put = lru_cache(maxsize=lru_cache_size)(self._put)
+        self._get = lru_cache(maxsize=get_lru_cache_size)(self._get)
+        self._put = lru_cache(maxsize=put_lru_cache_size)(self._put)
 
         for index in indices:
             # Do not create the same index twice.
@@ -262,20 +293,27 @@ def put(self, molecule):
         return self._put(HashableDict(json))
 
     def _put(self, json):
-        # insert_one() corrupts the state of the dict it is passed
-        # as an argument (it adds various items to it).
-        # Using insert_one(json['molecule']) would mean that the json
-        # in the lru_cache is modified with some extra items added by
-        # insert_one(). This means that the next time _put() is used
-        # with a clean json, it will not match the one in the cache,
-        # because the one in the cache has the extra items added by
-        # insert_one(). To prevent this use
-        # insert_one(dict(json['molecule'])), which means that a copy
-        # is modified by insert_one and the json in the cache is
-        # not changed.
-
-        self._molecules.insert_one(dict(json['molecule']))
-        self._position_matrices.insert_one(dict(json['matrix']))
+        # Every item of json['matrix'], apart from 'm', becomes one
+        # clause of the '$or' query, so an entry matches when it
+        # shares at least one of those key-value pairs. The copy
+        # keeps json['matrix'] itself unchanged.
+        keys = dict(json['matrix'])
+        keys.pop('m')
+
+        query = {'$or': []}
+        for key, value in keys.items():
+            query['$or'].append({key: value})
+
+        # update_many() with upsert=True updates all entries matching
+        # the query and, when nothing matches, inserts a new entry.
+        self._molecules.update_many(
+            filter=query,
+            update={
+                '$set': json['molecule'],
+            },
+            upsert=True,
+        )
+        self._position_matrices.update_many(
+            filter=query,
+            update={
+                '$set': json['matrix'],
+            },
+            upsert=True,
+        )
 
     def get(self, key):
         # lru_cache requires that the parameters to the cached function