Study sets (#316)
* Start adding study collections, including some test artifacts

* Deal with study handle object versus string

* Version bump

* Add script to deal with collection tags

* Correct identification of database record

* Version bumps

* Allow uploading of preloaded images to save dev time

* Update pull policy to allow prebuilt remote data images

* Modify promotion mechanism to a whitelist

* Modify promotion mechanism to a whitelist, including api study name

* Add check for table existence

* Clean up cli for new publish/unpublish feature

* update changelog
jimmymathews authored May 3, 2024
1 parent 8318ce3 commit d1d4ef8
Showing 40 changed files with 855 additions and 56 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,10 @@
# v0.23.0
Implements a dataset collection concept using study name suffixes (tags/tokens/labels); a sketch of the naming convention follows this list:
- The tabular import workflow uses the value of the key `Study collection` in `study.json`.
- The API endpoint `study-names` hides collection-tagged datasets by default.
- Other API handlers are unchanged; they work as-is with fully-qualified study names.
- `spt db collection ... --publish / --unpublish` is provided to manage collection visibility.
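A minimal sketch of the suffix convention, assuming the ` collection: ` separator and tag format seen in this commit's test fixtures (the regular expression and function here are illustrative, not the actual `StudyCollectionNaming` implementation):

```python
import re

# Assumed convention, inferred from test fixtures such as
# "Melanoma intralesional IL2 collection: abc-123":
# a fully-qualified study name is "<base name> collection: <tag>".
TAG_PATTERN = re.compile(r'(?P<base>.+) collection: (?P<tag>[A-Za-z0-9-]+)')

def strip_extract_token(study_name: str) -> tuple[str, str | None]:
    """Split a study name into (base name, tag); tag is None when untagged."""
    match = TAG_PATTERN.fullmatch(study_name)
    if match is None:
        return study_name, None
    return match.group('base'), match.group('tag')

assert strip_extract_token('Melanoma intralesional IL2 collection: abc-123') == \
    ('Melanoma intralesional IL2', 'abc-123')
assert strip_extract_token('Melanoma intralesional IL2') == \
    ('Melanoma intralesional IL2', None)
```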

# v0.17.5
Organize workflow configuration options into a workflow configuration file.
This breaks the API for tabular import and similar.
23 changes: 21 additions & 2 deletions Makefile
@@ -78,7 +78,7 @@ SINGLETON_TEST_TARGETS := $(foreach submodule,$(SUBMODULES),singleton-test-$(submodule)
DLI := force-rebuild-data-loaded-image

# Define PHONY targets
.PHONY: help release-package check-for-pypi-credentials print-source-files build-and-push-docker-images ${DOCKER_PUSH_TARGETS} build-docker-images test module-tests ${MODULE_TEST_TARGETS} ${UNIT_TEST_TARGETS} clean clean-files docker-compositions-rm clean-network-environment
.PHONY: help release-package check-for-pypi-credentials print-source-files build-and-push-docker-images ${DOCKER_PUSH_TARGETS} build-docker-images test module-tests ${MODULE_TEST_TARGETS} ${UNIT_TEST_TARGETS} clean clean-files docker-compositions-rm clean-network-environment generic-spt-push-target data-loaded-images-push-target

# Submodule-specific variables
export DB_SOURCE_LOCATION_ABSOLUTE := ${PWD}/${SOURCE_LOCATION}/db
@@ -167,7 +167,7 @@ pyproject.toml: pyproject.toml.unversioned ${BUILD_SCRIPTS_LOCATION_ABSOLUTE}/cr
print-source-files:
>@echo "${PACKAGE_SOURCE_FILES}" | tr ' ' '\n'

build-and-push-docker-images: ${DOCKER_PUSH_TARGETS} generic-spt-push-target
build-and-push-docker-images: ${DOCKER_PUSH_TARGETS} generic-spt-push-target data-loaded-images-push-target

build-and-push-docker-images-dev: ${DOCKER_PUSH_DEV_TARGETS}

@@ -222,6 +222,25 @@ generic-spt-push-target: build-docker-images check-for-docker-credentials
exit_code=$$(( exit_code1 + exit_code2 )); echo "$$exit_code" > status_code
>@${MESSAGE} end "Pushed." "Not pushed."

data-loaded-images-push-target:
>@${MESSAGE} start "Pushing preloaded data Docker containers"
>@repository_name_prefix=${DOCKER_ORG_NAME}/${DOCKER_REPO_PREFIX}-db-preloaded ; \
codes=0 ; \
for suffix in 1 2 1and2 1small 1smallnointensity; \
do \
existing=$$repository_name_prefix-$$suffix:latest ; \
tag=$$repository_name_prefix-$$suffix:${VERSION} ; \
docker tag $$existing $$tag ; \
docker push $$existing ; \
exitcode="$$?" ; \
codes=$$(( codes + exitcode )) ; \
docker push $$tag ; \
exitcode="$$?" ; \
codes=$$(( codes + exitcode )) ; \
done; \
echo "$$codes" > status_code
>@${MESSAGE} end "Pushed." "Not pushed."

check-for-docker-credentials:
>@${MESSAGE} start "Checking for Docker credentials in ~/.docker/config.json"
>@${PYTHON} ${BUILD_SCRIPTS_LOCATION_ABSOLUTE}/check_for_credentials.py docker ; status="$$?"; echo "$$status" > status_code; if [[ "$$status" == "0" ]]; then touch check-for-docker-credentials; fi;
1 change: 0 additions & 1 deletion build/apiserver/compose.yaml
@@ -10,7 +10,6 @@ services:
- isolated_temporary_test
environment:
POSTGRES_PASSWORD: postgres
pull_policy: never

testing-fast-counts-server:
image: nadeemlab/spt-ondemand
7 changes: 7 additions & 0 deletions build/build_scripts/.graph.small.config
@@ -0,0 +1,7 @@
[general]
db_config_file_path = build/db/.spt_db.config.local
study_name = Melanoma intralesional IL2 collection: abc-123

[upload-importances]
plugin_used = cg-gnn
datetime_of_run = 2023-10-02 10:46 AM
9 changes: 9 additions & 0 deletions build/build_scripts/.workflow.small.config
@@ -0,0 +1,9 @@
[general]
db_config_file = build/db/.spt_db.config.local
executor = local

[tabular import]
input_path = test/test_data/adi_preprocessed_tables/datasetYYY

[database visitor]
study_name = Melanoma intralesional IL2 collection: abc-123
12 changes: 9 additions & 3 deletions build/build_scripts/import_test_dataset1small.sh
@@ -1,16 +1,22 @@

FM=test/test_data/adi_preprocessed_tables/dataset1/file_manifest.tsv
cp $FM file_manifest.tsv.bak
(cat file_manifest.tsv.bak | grep -vE '(0_2|0_3|6_2|6_3|6_4)') > $FM
(cat file_manifest.tsv.bak | grep -vE '(0_2|0_3|6_2|6_3|6_4)' | sed 's/Melanoma intralesional IL2/Melanoma intralesional IL2 collection: abc-123/g') > $FM
STUDY_JSON=test/test_data/adi_preprocessed_tables/dataset1/study.json
STUDY_SMALL_JSON=test/test_data/adi_preprocessed_tables/dataset1/study.small.json
cp $STUDY_JSON study.json.bak
cp $STUDY_SMALL_JSON $STUDY_JSON

cat build/build_scripts/.workflow.config | sed 's/YYY/1/g' > .workflow.config
cat build/build_scripts/.workflow.small.config | sed 's/YYY/1/g' > .workflow.config
spt workflow configure --workflow='tabular import' --config-file=.workflow.config
nextflow run .

cp file_manifest.tsv.bak $FM
rm file_manifest.tsv.bak
cp study.json.bak $STUDY_JSON
rm study.json.bak

spt graphs upload-importances --config_path=build/build_scripts/.graph.config --importances_csv_path test/test_data/gnn_importances/3.csv
spt graphs upload-importances --config_path=build/build_scripts/.graph.small.config --importances_csv_path test/test_data/gnn_importances/3.csv

cat work/*/*/.command.log
spt db status --database-config-file build/db/.spt_db.config.local > table_counts.txt
1 change: 0 additions & 1 deletion build/db/compose.yaml
@@ -10,7 +10,6 @@ services:
- isolated_temporary_test
environment:
POSTGRES_PASSWORD: postgres
pull_policy: never

networks:
isolated_temporary_test:
1 change: 0 additions & 1 deletion build/graphs/compose.yaml
@@ -10,7 +10,6 @@ services:
- isolated_temporary_test
environment:
POSTGRES_PASSWORD: postgres
pull_policy: never

networks:
isolated_temporary_test:
1 change: 0 additions & 1 deletion build/ondemand/compose.yaml
@@ -25,7 +25,6 @@ services:
- isolated_temporary_test
environment:
POSTGRES_PASSWORD: postgres
pull_policy: never

networks:
isolated_temporary_test:
1 change: 0 additions & 1 deletion build/workflow/compose.yaml
@@ -10,7 +10,6 @@ services:
- isolated_temporary_test
environment:
POSTGRES_PASSWORD: postgres
pull_policy: never

networks:
isolated_temporary_test:
1 change: 1 addition & 0 deletions pyproject.toml.unversioned
@@ -191,6 +191,7 @@ packages = [
"drop_ondemand_computations.py",
"delete_feature.py",
"upload_sync_findings.py",
"collection.py",
]
"spatialprofilingtoolbox.db.data_model" = [
"metaschema.sql",
2 changes: 1 addition & 1 deletion spatialprofilingtoolbox/apiserver/__init__.py
@@ -1,2 +1,2 @@
"""API service"""
__version__ = '0.14.0'
__version__ = '0.23.0'
34 changes: 28 additions & 6 deletions spatialprofilingtoolbox/apiserver/app/main.py
@@ -1,6 +1,7 @@
"""The API service's endpoint handlers."""

from typing import cast
from typing import Annotated
import json
from io import BytesIO
from base64 import b64decode
@@ -14,6 +15,7 @@

import secure

from spatialprofilingtoolbox.db.study_tokens import StudyCollectionNaming
from spatialprofilingtoolbox.ondemand.service_client import OnDemandRequester
from spatialprofilingtoolbox.db.exchange_data_formats.study import StudyHandle
from spatialprofilingtoolbox.db.exchange_data_formats.study import StudySummary
@@ -40,7 +42,7 @@
ValidChannelListNegatives2,
ValidFeatureClass,
)
VERSION = '0.11.0'
VERSION = '0.23.0'

TITLE = 'Single cell studies data API'

@@ -105,10 +107,30 @@ async def get_root():


@app.get("/study-names/")
async def get_study_names() -> list[StudyHandle]:
async def get_study_names(
collection: Annotated[str | None, Query(max_length=512)] = None
) -> list[StudyHandle]:
"""The names of studies/datasets, with display names."""
specifiers = query().retrieve_study_specifiers()
handles = [query().retrieve_study_handle(study) for study in specifiers]
def is_public(study_handle: StudyHandle) -> bool:
if StudyCollectionNaming.is_untagged(study_handle):
return True
_, tag = StudyCollectionNaming.strip_extract_token(study_handle)
if query().is_public_collection(tag):
return True
return False
if collection is None:
handles = list(filter(is_public, map(query().retrieve_study_handle, specifiers)))
else:
if not StudyCollectionNaming.matches_tag_pattern(collection):
raise HTTPException(
status_code=404,
detail=f'Collection "{collection}" is not a valid collection string.',
)
def tagged(study_handle: StudyHandle) -> bool:
return StudyCollectionNaming.tagged_with(study_handle, collection)
handles = list(filter(tagged, map(query().retrieve_study_handle, specifiers)))
return handles


@@ -313,17 +335,17 @@ def get_squidpy_metrics(
@app.get("/cell-data/")
async def get_cell_data(
study: ValidStudy,
sample: str = Query(max_length=512),
sample: Annotated[str, Query(max_length=512)],
) -> CellData:
"""Get cell-level location and phenotype data."""
if not sample in query().get_sample_names(study):
raise HTTPException(status_code=404, detail=f'Sample "{sample}" does not exist.')
number_cells = cast(int, query().get_number_cells(study))
def match(c: PhenotypeCount) -> bool:
return c.specimen == sample
count = tuple(filter(match, get_phenotype_counts([], [], study, number_cells).counts))[0]
if count.count > CELL_DATA_CELL_LIMIT:
message = f'Sample "{sample}" has too many cells: {count.count}.'
count = tuple(filter(match, get_phenotype_counts([], [], study, number_cells).counts))[0].count
if count is None or count > CELL_DATA_CELL_LIMIT:
message = f'Sample "{sample}" has too many cells: {count}.'
raise HTTPException(status_code=404, detail=message)
with OnDemandRequester(service='cells') as requester:
payload = requester.get_cells_data(study, sample)
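For orientation, a client-side sketch of the new `collection` query parameter on `study-names` (endpoint behavior as defined above; the base URL is a placeholder assumption):

```python
import requests

BASE_URL = 'http://localhost:8080'  # placeholder; point at a real deployment

# Default listing: untagged studies, plus tagged studies whose collection
# is on the public whitelist.
public_handles = requests.get(f'{BASE_URL}/study-names/').json()

# Restrict to one collection tag; an invalid tag string yields HTTP 404.
tagged_handles = requests.get(
    f'{BASE_URL}/study-names/',
    params={'collection': 'abc-123'},
).json()
```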
2 changes: 1 addition & 1 deletion spatialprofilingtoolbox/db/__init__.py
@@ -1,2 +1,2 @@
"""Database-related SPT functionality."""
__version__ = '0.13.0'
__version__ = '0.23.0'
11 changes: 10 additions & 1 deletion spatialprofilingtoolbox/db/accessors/study.py
@@ -2,8 +2,10 @@

from typing import cast
import re
from spatialprofilingtoolbox.db.simple_method_cache import simple_instance_method_cache

from psycopg2.errors import UndefinedTable

from spatialprofilingtoolbox.db.simple_method_cache import simple_instance_method_cache
from spatialprofilingtoolbox.workflow.common.export_features import ADIFeatureSpecificationUploader
from spatialprofilingtoolbox.db.exchange_data_formats.study import (
StudyContact,
@@ -107,6 +109,13 @@ def get_study_handle(self, study: str) -> StudyHandle:
handles.append(StudyHandle(handle=handle, display_name_detail=display_name_detail))
return handles[0]

def get_collection_whitelist(self) -> tuple[str, ...]:
try:
self.cursor.execute('SELECT collection FROM collection_whitelist ;')
except UndefinedTable:
return ()
return tuple(map(lambda row: row[0], self.cursor.fetchall()))

def _get_publication_summary_text(self, study: str) -> str:
query = '''
SELECT publisher, date_of_publication
14 changes: 13 additions & 1 deletion spatialprofilingtoolbox/db/database_connection.py
@@ -26,6 +26,17 @@
logger = colorized_logger(__name__)


class DatabaseNotFoundError(ValueError):
study: str

def __init__(self, study: str):
self.study = study
super().__init__(self.verbalize())

def verbalize(self) -> str:
return f'Did not find database for study named: "{self.study}"'


class ConnectionProvider:
"""Simple wrapper of a database connection."""
connection: Connection
@@ -80,7 +91,7 @@ def _retrieve_study_database(self, credentials: DBCredentials, study: str) -> st
cursor.execute('SELECT database_name FROM study_lookup WHERE study=%s', (study,))
rows = cursor.fetchall()
if len(rows) == 0:
raise ValueError('Did not find database for study "%s"', study)
raise DatabaseNotFoundError(study)
return str(rows[0][0])

@staticmethod
@@ -255,6 +266,7 @@ class (QueryCursor) newly provides on each invocation.
get_sample_names: Callable
get_available_gnn: Callable
get_study_findings: Callable
is_public_collection: Callable

def __init__(self, query_handler: Type):
self.query_handler = query_handler
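A brief sketch of what the new `DatabaseNotFoundError` enables for callers; the wrapper function and its fallback behavior are illustrative assumptions, not part of this commit:

```python
from spatialprofilingtoolbox.db.database_connection import DatabaseNotFoundError

def database_name_or_none(retrieve, study: str) -> str | None:
    """Illustrative wrapper: return None instead of raising for unknown studies."""
    try:
        return retrieve(study)
    except DatabaseNotFoundError as error:
        # The exception now carries the study name as structured data.
        print(error.verbalize())
        return None
```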
63 changes: 63 additions & 0 deletions spatialprofilingtoolbox/db/publish_promote.py
@@ -0,0 +1,63 @@
"""Publish/promote a dataset collection from private to public."""

from typing import cast

from attr import define

from spatialprofilingtoolbox.db.database_connection import DBCursor
from spatialprofilingtoolbox.db.study_tokens import StudyCollectionNaming
from spatialprofilingtoolbox.standalone_utilities.log_formats import colorized_logger

logger = colorized_logger(__name__)

@define
class PublisherPromoter:
database_config_file: str
collection: str | None = None

def promote(self, collection: str) -> None:
self.collection = collection
self._check_is_collection_nonempty()
self._whitelist_collection()

def demote(self, collection: str) -> None:
self.collection = collection
self._check_is_collection_nonempty()
self._unwhitelist_collection()

def _check_is_collection_nonempty(self) -> None:
def is_in_collection(study: str) -> bool:
_, tag = StudyCollectionNaming.strip_token(study)
return tag == self._get_collection()
file = self.database_config_file
with DBCursor(database_config_file=file, study=None) as cursor:
select = 'SELECT study FROM study_lookup ;'
cursor.execute(select)
members = tuple(filter(is_in_collection, map(lambda row: row[0], cursor.fetchall())))
if len(members) == 0:
message = f'No studies are tagged with collection label "{self._get_collection()}".'
logger.warning(message)

def _whitelist_collection(self) -> None:
file = self.database_config_file
with DBCursor(database_config_file=file, study=None) as cursor:
collection = self._get_collection()
create = 'CREATE TABLE IF NOT EXISTS collection_whitelist ( collection VARCHAR(512) );'
insert = 'INSERT INTO collection_whitelist (collection) VALUES ( %s ) ;'
logger.debug(create)
cursor.execute(create)
logger.debug(insert % f"'{collection}'")
cursor.execute(insert, (collection,))
logger.info(f'Added "{collection}" to public-indicating whitelist.')

def _unwhitelist_collection(self) -> None:
file = self.database_config_file
with DBCursor(database_config_file=file, study=None) as cursor:
collection = self._get_collection()
remove = 'DELETE FROM collection_whitelist WHERE collection=%s ;'
logger.debug(remove % f"'{collection}'")
cursor.execute(remove, (collection,))
logger.info(f'Removed "{collection}" from public-indicating whitelist.')

def _get_collection(self) -> str:
return cast(str, self.collection)
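A usage sketch for `PublisherPromoter`; the config file path is taken from this commit's test fixtures and stands in for a real deployment's configuration:

```python
from spatialprofilingtoolbox.db.publish_promote import PublisherPromoter

# Path from the test fixtures; substitute a real database config file.
promoter = PublisherPromoter('build/db/.spt_db.config.local')

promoter.promote('abc-123')  # add the collection to the public whitelist
promoter.demote('abc-123')   # remove it from the whitelist (unpublish)
```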
5 changes: 5 additions & 0 deletions spatialprofilingtoolbox/db/querying.py
@@ -49,6 +49,11 @@ def get_number_cells(cls, cursor, study: str) -> int:
def get_study_summary(cls, cursor, study: str) -> StudySummary:
return StudyAccess(cursor).get_study_summary(study)

@classmethod
def is_public_collection(cls, cursor, collection: str) -> bool:
whitelist = StudyAccess(cursor).get_collection_whitelist()
return collection in whitelist

@classmethod
def get_available_gnn(cls, cursor, study: str) -> AvailableGNN:
return StudyAccess(cursor).get_available_gnn(study)