lsst · TallJimbo · Nov 11, 2020 · Nov 10, 2020
diff --git a/python/lsst/daf/butler/registry/_registry.py b/python/lsst/daf/butler/registry/_registry.py
@@ -412,7 +412,8 @@ def deleteOpaqueData(self, tableName: str, **where: Any) -> None:
         """
         self._opaque[tableName].delete(**where)
 
-    def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED) -> None:
+    def registerCollection(self, name: str, type: CollectionType = CollectionType.TAGGED,
+                           doc: Optional[str] = None) -> None:
         """Add a new collection if one with the given name does not exist.
 
         Parameters
@@ -421,13 +422,15 @@ def registerCollection(self, name: str, type: CollectionType = CollectionType.TA
             The name of the collection to create.
         type : `CollectionType`
             Enum value indicating the type of collection to create.
+        doc : `str`, optional
+            Documentation string for the collection.
 
         Notes
         -----
         This method cannot be called within transactions, as it needs to be
         able to perform its own transaction to be concurrent.
         """
-        self._collections.register(name, type)
+        self._collections.register(name, type, doc=doc)
 
     def getCollectionType(self, name: str) -> CollectionType:
         """Return an enumeration value indicating the type of the given
@@ -450,20 +453,22 @@ def getCollectionType(self, name: str) -> CollectionType:
         """
         return self._collections.find(name).type
 
-    def registerRun(self, name: str) -> None:
+    def registerRun(self, name: str, doc: Optional[str] = None) -> None:
         """Add a new run if one with the given name does not exist.
 
         Parameters
         ----------
         name : `str`
             The name of the run to create.
+        doc : `str`, optional
+            Documentation string for the collection.
 
         Notes
         -----
         This method cannot be called within transactions, as it needs to be
         able to perform its own transaction to be concurrent.
         """
-        self._collections.register(name, CollectionType.RUN)
+        self._collections.register(name, CollectionType.RUN, doc=doc)
 
     @transactional
     def removeCollection(self, name: str) -> None:
@@ -554,6 +559,35 @@ def setCollectionChain(self, parent: str, children: Any) -> None:
         if children != record.children:
             record.update(self._collections, children)
 
+    def getCollectionDocumentation(self, collection: str) -> Optional[str]:
+        """Retrieve the documentation string for a collection.
+
+        Parameters
+        ----------
+        name : `str`
+            Name of the collection.
+
+        Returns
+        -------
+        docs : `str` or `None`
+            Docstring for the collection with the given name.
+        """
+        return self._collections.getDocumentation(self._collections.find(collection).key)
+
+    def setCollectionDocumentation(self, collection: str, doc: Optional[str]) -> None:
+        """Set the documentation string for a collection.
+
+        Parameters
+        ----------
+        name : `str`
+            Name of the collection.
+        docs : `str` or `None`
+            Docstring for the collection with the given name; will replace any
+            existing docstring.  Passing `None` will remove any existing
+            docstring.
+        """
+        self._collections.setDocumentation(self._collections.find(collection).key, doc)
+
     def registerDatasetType(self, datasetType: DatasetType) -> bool:
         """
         Add a new `DatasetType` to the Registry.

diff --git a/python/lsst/daf/butler/registry/collections/_base.py b/python/lsst/daf/butler/registry/collections/_base.py
@@ -360,14 +360,15 @@ def refresh(self) -> None:
         for chain in chains:
             chain.refresh(self)
 
-    def register(self, name: str, type: CollectionType) -> CollectionRecord:
+    def register(self, name: str, type: CollectionType, doc: Optional[str] = None) -> CollectionRecord:
         # Docstring inherited from CollectionManager.
         record = self._getByName(name)
         if record is None:
             row, _ = self._db.sync(
                 self._tables.collection,
                 keys={"name": name},
                 compared={"type": int(type)},
+                extra={"doc": doc},
                 returning=[self._collectionIdName],
             )
             assert row is not None
@@ -425,6 +426,21 @@ def __getitem__(self, key: Any) -> CollectionRecord:
     def __iter__(self) -> Iterator[CollectionRecord]:
         yield from self._records.values()
 
+    def getDocumentation(self, key: Any) -> Optional[str]:
+        # Docstring inherited from CollectionManager.
+        sql = sqlalchemy.sql.select(
+            [self._tables.collection.columns.doc]
+        ).select_from(
+            self._tables.collection
+        ).where(
+            self._tables.collection.columns[self._collectionIdName] == key
+        )
+        return self._db.query(sql).scalar()
+
+    def setDocumentation(self, key: Any, doc: Optional[str]) -> None:
+        # Docstring inherited from CollectionManager.
+        self._db.update(self._tables.collection, {self._collectionIdName: "key"}, {"key": key, "doc": doc})
+
     def _setRecordCache(self, records: Iterable[CollectionRecord]) -> None:
         """Set internal record cache to contain given records,
         old cached records will be removed.

diff --git a/python/lsst/daf/butler/registry/collections/nameKey.py b/python/lsst/daf/butler/registry/collections/nameKey.py
@@ -49,7 +49,7 @@
 
 
 # This has to be updated on every schema change
-_VERSION = VersionTuple(1, 0, 0)
+_VERSION = VersionTuple(2, 0, 0)
 
 
 def _makeTableSpecs(tsRepr: Type[TimespanDatabaseRepresentation]) -> CollectionTablesTuple:
@@ -58,6 +58,7 @@ def _makeTableSpecs(tsRepr: Type[TimespanDatabaseRepresentation]) -> CollectionT
             fields=[
                 _KEY_FIELD_SPEC,
                 ddl.FieldSpec("type", dtype=sqlalchemy.SmallInteger, nullable=False),
+                ddl.FieldSpec("doc", dtype=sqlalchemy.Text, nullable=True),
             ],
         ),
         run=makeRunTableSpec("name", sqlalchemy.String, tsRepr),

diff --git a/python/lsst/daf/butler/registry/collections/synthIntKey.py b/python/lsst/daf/butler/registry/collections/synthIntKey.py
@@ -51,7 +51,7 @@
 
 
 # This has to be updated on every schema change
-_VERSION = VersionTuple(1, 0, 0)
+_VERSION = VersionTuple(2, 0, 0)
 
 
 def _makeTableSpecs(tsRepr: Type[TimespanDatabaseRepresentation]) -> CollectionTablesTuple:
@@ -61,6 +61,7 @@ def _makeTableSpecs(tsRepr: Type[TimespanDatabaseRepresentation]) -> CollectionT
                 _KEY_FIELD_SPEC,
                 ddl.FieldSpec("name", dtype=sqlalchemy.String, length=64, nullable=False),
                 ddl.FieldSpec("type", dtype=sqlalchemy.SmallInteger, nullable=False),
+                ddl.FieldSpec("doc", dtype=sqlalchemy.Text, nullable=True),
             ],
             unique=[("name",)],
         ),

diff --git a/python/lsst/daf/butler/registry/interfaces/_collections.py b/python/lsst/daf/butler/registry/interfaces/_collections.py
@@ -404,7 +404,7 @@ def refresh(self) -> None:
         raise NotImplementedError()
 
     @abstractmethod
-    def register(self, name: str, type: CollectionType) -> CollectionRecord:
+    def register(self, name: str, type: CollectionType, doc: Optional[str] = None) -> CollectionRecord:
         """Ensure that a collection of the given name and type are present
         in the layer this manager is associated with.
 
@@ -414,6 +414,9 @@ def register(self, name: str, type: CollectionType) -> CollectionRecord:
             Name of the collection.
         type : `CollectionType`
             Enumeration value indicating the type of collection.
+        doc : `str`, optional
+            Documentation string for the collection.  Ignored if the collection
+            already exists.
 
         Returns
         -------
@@ -532,3 +535,32 @@ def __iter__(self) -> Iterator[CollectionRecord]:
             The record for a managed collection.
         """
         raise NotImplementedError()
+
+    @abstractmethod
+    def getDocumentation(self, key: Any) -> Optional[str]:
+        """Retrieve the documentation string for a collection.
+
+        Parameters
+        ----------
+        key
+            Internal primary key value for the collection.
+
+        Returns
+        -------
+        docs : `str` or `None`
+            Docstring for the collection with the given key.
+        """
+        raise NotImplementedError()
+
+    @abstractmethod
+    def setDocumentation(self, key: Any, doc: Optional[str]) -> None:
+        """Set the documentation string for a collection.
+
+        Parameters
+        ----------
+        key
+            Internal primary key value for the collection.
+        docs : `str`, optional
+            Docstring for the collection with the given key.
+        """
+        raise NotImplementedError()
diff --git a/python/lsst/daf/butler/registry/tests/_registry.py b/python/lsst/daf/butler/registry/tests/_registry.py
@@ -446,6 +446,11 @@ def testCollections(self):
         self.loadData(registry, "datasets.yaml")
         run1 = "imported_g"
         run2 = "imported_r"
+        # Test setting a collection docstring after it has been created.
+        registry.setCollectionDocumentation(run1, "doc for run1")
+        self.assertEqual(registry.getCollectionDocumentation(run1), "doc for run1")
+        registry.setCollectionDocumentation(run1, None)
+        self.assertIsNone(registry.getCollectionDocumentation(run1))
         datasetType = "bias"
         # Find some datasets via their run's collection.
         dataId1 = {"instrument": "Cam1", "detector": 1}
@@ -456,7 +461,8 @@ def testCollections(self):
         self.assertIsNotNone(ref2)
         # Associate those into a new collection,then look for them there.
         tag1 = "tag1"
-        registry.registerCollection(tag1, type=CollectionType.TAGGED)
+        registry.registerCollection(tag1, type=CollectionType.TAGGED, doc="doc for tag1")
+        self.assertEqual(registry.getCollectionDocumentation(tag1), "doc for tag1")
         registry.associate(tag1, [ref1, ref2])
         self.assertEqual(registry.findDataset(datasetType, dataId1, collections=tag1), ref1)
         self.assertEqual(registry.findDataset(datasetType, dataId2, collections=tag1), ref2)

diff --git a/python/lsst/daf/butler/transfers/_context.py b/python/lsst/daf/butler/transfers/_context.py
@@ -237,7 +237,8 @@ def _finish(self) -> None:
             for run in datasetsByRun.keys():
                 self._collections[run] = self._registry._collections.find(run)
         for collectionName in self._computeSortedCollections():
-            self._backend.saveCollection(self._collections[collectionName])
+            doc = self._registry.getCollectionDocumentation(collectionName)
+            self._backend.saveCollection(self._collections[collectionName], doc)
         # Sort the dataset types and runs before exporting to ensure
         # reproducible order in export file.
         for datasetType in sorted(self._datasets.keys()):

diff --git a/python/lsst/daf/butler/transfers/_interfaces.py b/python/lsst/daf/butler/transfers/_interfaces.py
@@ -73,7 +73,7 @@ def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -
         raise NotImplementedError()
 
     @abstractmethod
-    def saveCollection(self, record: CollectionRecord) -> None:
+    def saveCollection(self, record: CollectionRecord, doc: Optional[str]) -> None:
         """Export a collection.
 
         This only exports the collection's own state, not its associations with
@@ -83,6 +83,8 @@ def saveCollection(self, record: CollectionRecord) -> None:
         ----------
         record: `CollectionRecord`
             Object representing the collection to export.
+        doc : `str` or `None`
+            Documentation string for the collection.
         """
         raise NotImplementedError()
 

diff --git a/python/lsst/daf/butler/transfers/_yaml.py b/python/lsst/daf/butler/transfers/_yaml.py
@@ -68,7 +68,7 @@
 from ._interfaces import RepoExportBackend, RepoImportBackend
 
 
-EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 0)
+EXPORT_FORMAT_VERSION = VersionTuple(1, 0, 1)
 """Export format version.
 
 Files with a different major version or a newer minor version cannot be read by
@@ -98,13 +98,15 @@ def saveDimensionData(self, element: DimensionElement, *data: DimensionRecord) -
             "records": data_dicts,
         })
 
-    def saveCollection(self, record: CollectionRecord) -> None:
+    def saveCollection(self, record: CollectionRecord, doc: Optional[str]) -> None:
         # Docstring inherited from RepoExportBackend.saveCollections.
         data: Dict[str, Any] = {
             "type": "collection",
             "collection_type": record.type.name,
             "name": record.name,
         }
+        if doc is not None:
+            data["doc"] = doc
         if isinstance(record, RunRecord):
             data["host"] = record.host
             data["timespan_begin"] = record.timespan.begin
@@ -219,6 +221,7 @@ def __init__(self, stream: IO, registry: Registry):
         self.runs: Dict[str, Tuple[Optional[str], Timespan]] = {}
         self.chains: Dict[str, List[str]] = {}
         self.collections: Dict[str, CollectionType] = {}
+        self.collectionDocs: Dict[str, str] = {}
         self.datasetTypes: NamedValueSet[DatasetType] = NamedValueSet()
         self.dimensions: Mapping[DimensionElement, List[DimensionRecord]] = defaultdict(list)
         self.tagAssociations: Dict[str, List[int]] = defaultdict(list)
@@ -264,6 +267,9 @@ def __init__(self, stream: IO, registry: Registry):
                     self.chains[data["name"]] = children
                 else:
                     self.collections[data["name"]] = collectionType
+                doc = data.get("doc")
+                if doc is not None:
+                    self.collectionDocs[data["name"]] = doc
             elif data["type"] == "run":
                 # Also support old form of saving a run with no extra info.
                 self.runs[data["name"]] = (None, Timespan(None, None))
@@ -310,12 +316,12 @@ def register(self) -> None:
         for datasetType in self.datasetTypes:
             self.registry.registerDatasetType(datasetType)
         for run in self.runs:
-            self.registry.registerRun(run)
+            self.registry.registerRun(run, doc=self.collectionDocs.get(run))
             # No way to add extra run info to registry yet.
         for collection, collection_type in self.collections.items():
-            self.registry.registerCollection(collection, collection_type)
+            self.registry.registerCollection(collection, collection_type, doc=self.collectionDocs.get(run))
         for chain, children in self.chains.items():
-            self.registry.registerCollection(chain, CollectionType.CHAINED)
+            self.registry.registerCollection(chain, CollectionType.CHAINED, doc=self.collectionDocs.get(run))
             self.registry.setCollectionChain(chain, children)
 
     def load(self, datastore: Optional[Datastore], *,