Study sets (#316)
* Start adding study collections, including some test artifacts

* Deal with study handle object versus string

* Version bump

* Add script to deal with collection tags

* Correct identification of database record

* Version bumps

* Allow uploading of preloaded images to save dev time

* Update pull policy to allow prebuilt remote data images

* Modify promotion mechanism to a whitelist

* Modify promotion mechanism to a whitelist, including api study name

* Add check for table existence

* Clean up cli for new publish/unpublish feature

* update changelog
jimmymathews authored May 3, 2024
1 parent 8318ce3 commit d1d4ef8
Showing 40 changed files with 855 additions and 56 deletions.
7 changes: 7 additions & 0 deletions CHANGELOG.md
@@ -1,3 +1,10 @@
# v0.23.0
Implements a dataset collection concept using study name suffixes (tags/tokens/labels); a sketch of the naming convention follows this list:
- The tabular import workflow uses the value of the key `Study collection` in `study.json`.
- The API endpoint `study-names` hides collection-tagged datasets by default.
- Other API handlers are unchanged; they work as-is with fully-qualified study names.
- `spt db collection ... --publish / --unpublish` is provided to manage collection visibility.
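A minimal sketch of the suffix convention, assuming the ` collection: ` separator and tag format seen in this commit's test fixtures (the regular expression and function here are illustrative, not the actual `StudyCollectionNaming` implementation):

```python
import re

# Assumed convention, inferred from test fixtures such as
# "Melanoma intralesional IL2 collection: abc-123":
# a fully-qualified study name is "<base name> collection: <tag>".
TAG_PATTERN = re.compile(r'(?P<base>.+) collection: (?P<tag>[A-Za-z0-9-]+)')

def strip_extract_token(study_name: str) -> tuple[str, str | None]:
    """Split a study name into (base name, tag); tag is None when untagged."""
    match = TAG_PATTERN.fullmatch(study_name)
    if match is None:
        return study_name, None
    return match.group('base'), match.group('tag')

assert strip_extract_token('Melanoma intralesional IL2 collection: abc-123') == \
    ('Melanoma intralesional IL2', 'abc-123')
assert strip_extract_token('Melanoma intralesional IL2') == \
    ('Melanoma intralesional IL2', None)
```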

# v0.17.5
Organize workflow configuration options into a workflow configuration file.
This breaks the API for tabular import and similar.
23 changes: 21 additions & 2 deletions Makefile
@@ -78,7 +78,7 @@ SINGLETON_TEST_TARGETS := $(foreach submodule,$(SUBMODULES),singleton-test-$(submodule)
DLI := force-rebuild-data-loaded-image

# Define PHONY targets
.PHONY: help release-package check-for-pypi-credentials print-source-files build-and-push-docker-images ${DOCKER_PUSH_TARGETS} build-docker-images test module-tests ${MODULE_TEST_TARGETS} ${UNIT_TEST_TARGETS} clean clean-files docker-compositions-rm clean-network-environment
.PHONY: help release-package check-for-pypi-credentials print-source-files build-and-push-docker-images ${DOCKER_PUSH_TARGETS} build-docker-images test module-tests ${MODULE_TEST_TARGETS} ${UNIT_TEST_TARGETS} clean clean-files docker-compositions-rm clean-network-environment generic-spt-push-target data-loaded-images-push-target

# Submodule-specific variables
export DB_SOURCE_LOCATION_ABSOLUTE := ${PWD}/${SOURCE_LOCATION}/db
@@ -167,7 +167,7 @@ pyproject.toml: pyproject.toml.unversioned ${BUILD_SCRIPTS_LOCATION_ABSOLUTE}/cr
print-source-files:
>@echo "${PACKAGE_SOURCE_FILES}" | tr ' ' '\n'

build-and-push-docker-images: ${DOCKER_PUSH_TARGETS} generic-spt-push-target
build-and-push-docker-images: ${DOCKER_PUSH_TARGETS} generic-spt-push-target data-loaded-images-push-target

build-and-push-docker-images-dev: ${DOCKER_PUSH_DEV_TARGETS}

@@ -222,6 +222,25 @@ generic-spt-push-target: build-docker-images check-for-docker-credentials
exit_code=$$(( exit_code1 + exit_code2 )); echo "$$exit_code" > status_code
>@${MESSAGE} end "Pushed." "Not pushed."

data-loaded-images-push-target:
>@${MESSAGE} start "Pushing preloaded data Docker containers"
>@repository_name_prefix=${DOCKER_ORG_NAME}/${DOCKER_REPO_PREFIX}-db-preloaded ; \
codes=0 ; \
for suffix in 1 2 1and2 1small 1smallnointensity; \
do \
existing=$$repository_name_prefix-$$suffix:latest ; \
tag=$$repository_name_prefix-$$suffix:${VERSION} ; \
docker tag $$existing $$tag ; \
docker push $$existing ; \
exitcode="$$?" ; \
codes=$$(( codes + exitcode )) ; \
docker push $$tag ; \
exitcode="$$?" ; \
codes=$$(( codes + exitcode )) ; \
done; \
echo "$$codes" > status_code
>@${MESSAGE} end "Pushed." "Not pushed."

check-for-docker-credentials:
>@${MESSAGE} start "Checking for Docker credentials in ~/.docker/config.json"
>@${PYTHON} ${BUILD_SCRIPTS_LOCATION_ABSOLUTE}/check_for_credentials.py docker ; status="$$?"; echo "$$status" > status_code; if [[ "$$status" == "0" ]]; then touch check-for-docker-credentials; fi;
1 change: 0 additions & 1 deletion build/apiserver/compose.yaml
@@ -10,7 +10,6 @@ services:
- isolated_temporary_test
environment:
POSTGRES_PASSWORD: postgres
pull_policy: never

testing-fast-counts-server:
image: nadeemlab/spt-ondemand
7 changes: 7 additions & 0 deletions build/build_scripts/.graph.small.config
@@ -0,0 +1,7 @@
[general]
db_config_file_path = build/db/.spt_db.config.local
study_name = Melanoma intralesional IL2 collection: abc-123

[upload-importances]
plugin_used = cg-gnn
datetime_of_run = 2023-10-02 10:46 AM
9 changes: 9 additions & 0 deletions build/build_scripts/.workflow.small.config
@@ -0,0 +1,9 @@
[general]
db_config_file = build/db/.spt_db.config.local
executor = local

[tabular import]
input_path = test/test_data/adi_preprocessed_tables/datasetYYY

[database visitor]
study_name = Melanoma intralesional IL2 collection: abc-123
12 changes: 9 additions & 3 deletions build/build_scripts/import_test_dataset1small.sh
@@ -1,16 +1,22 @@

FM=test/test_data/adi_preprocessed_tables/dataset1/file_manifest.tsv
cp $FM file_manifest.tsv.bak
(cat file_manifest.tsv.bak | grep -vE '(0_2|0_3|6_2|6_3|6_4)') > $FM
(cat file_manifest.tsv.bak | grep -vE '(0_2|0_3|6_2|6_3|6_4)' | sed 's/Melanoma intralesional IL2/Melanoma intralesional IL2 collection: abc-123/g') > $FM
STUDY_JSON=test/test_data/adi_preprocessed_tables/dataset1/study.json
STUDY_SMALL_JSON=test/test_data/adi_preprocessed_tables/dataset1/study.small.json
cp $STUDY_JSON study.json.bak
cp $STUDY_SMALL_JSON $STUDY_JSON

cat build/build_scripts/.workflow.config | sed 's/YYY/1/g' > .workflow.config
cat build/build_scripts/.workflow.small.config | sed 's/YYY/1/g' > .workflow.config
spt workflow configure --workflow='tabular import' --config-file=.workflow.config
nextflow run .

cp file_manifest.tsv.bak $FM
rm file_manifest.tsv.bak
cp study.json.bak $STUDY_JSON
rm study.json.bak

spt graphs upload-importances --config_path=build/build_scripts/.graph.config --importances_csv_path test/test_data/gnn_importances/3.csv
spt graphs upload-importances --config_path=build/build_scripts/.graph.small.config --importances_csv_path test/test_data/gnn_importances/3.csv

cat work/*/*/.command.log
spt db status --database-config-file build/db/.spt_db.config.local > table_counts.txt
1 change: 0 additions & 1 deletion build/db/compose.yaml
@@ -10,7 +10,6 @@ services:
- isolated_temporary_test
environment:
POSTGRES_PASSWORD: postgres
pull_policy: never

networks:
isolated_temporary_test:
1 change: 0 additions & 1 deletion build/graphs/compose.yaml
@@ -10,7 +10,6 @@ services:
- isolated_temporary_test
environment:
POSTGRES_PASSWORD: postgres
pull_policy: never

networks:
isolated_temporary_test:
1 change: 0 additions & 1 deletion build/ondemand/compose.yaml
@@ -25,7 +25,6 @@ services:
- isolated_temporary_test
environment:
POSTGRES_PASSWORD: postgres
pull_policy: never

networks:
isolated_temporary_test:
1 change: 0 additions & 1 deletion build/workflow/compose.yaml
@@ -10,7 +10,6 @@ services:
- isolated_temporary_test
environment:
POSTGRES_PASSWORD: postgres
pull_policy: never

networks:
isolated_temporary_test:
1 change: 1 addition & 0 deletions pyproject.toml.unversioned
@@ -191,6 +191,7 @@ packages = [
"drop_ondemand_computations.py",
"delete_feature.py",
"upload_sync_findings.py",
"collection.py",
]
"spatialprofilingtoolbox.db.data_model" = [
"metaschema.sql",
2 changes: 1 addition & 1 deletion spatialprofilingtoolbox/apiserver/__init__.py
@@ -1,2 +1,2 @@
"""API service"""
__version__ = '0.14.0'
__version__ = '0.23.0'
34 changes: 28 additions & 6 deletions spatialprofilingtoolbox/apiserver/app/main.py
@@ -1,6 +1,7 @@
"""The API service's endpoint handlers."""

from typing import cast
from typing import Annotated
import json
from io import BytesIO
from base64 import b64decode
@@ -14,6 +15,7 @@

import secure

from spatialprofilingtoolbox.db.study_tokens import StudyCollectionNaming
from spatialprofilingtoolbox.ondemand.service_client import OnDemandRequester
from spatialprofilingtoolbox.db.exchange_data_formats.study import StudyHandle
from spatialprofilingtoolbox.db.exchange_data_formats.study import StudySummary
@@ -40,7 +42,7 @@
ValidChannelListNegatives2,
ValidFeatureClass,
)
VERSION = '0.11.0'
VERSION = '0.23.0'

TITLE = 'Single cell studies data API'

@@ -105,10 +107,30 @@ async def get_root():


@app.get("/study-names/")
async def get_study_names() -> list[StudyHandle]:
async def get_study_names(
collection: Annotated[str | None, Query(max_length=512)] = None
) -> list[StudyHandle]:
"""The names of studies/datasets, with display names."""
specifiers = query().retrieve_study_specifiers()
handles = [query().retrieve_study_handle(study) for study in specifiers]
def is_public(study_handle: StudyHandle) -> bool:
if StudyCollectionNaming.is_untagged(study_handle):
return True
_, tag = StudyCollectionNaming.strip_extract_token(study_handle)
if query().is_public_collection(tag):
return True
return False
if collection is None:
handles = list(filter(is_public, map(query().retrieve_study_handle, specifiers)))
else:
if not StudyCollectionNaming.matches_tag_pattern(collection):
raise HTTPException(
status_code=404,
detail=f'Collection "{collection}" is not a valid collection string.',
)
def tagged(study_handle: StudyHandle) -> bool:
return StudyCollectionNaming.tagged_with(study_handle, collection)
handles = list(filter(tagged, map(query().retrieve_study_handle, specifiers)))
return handles


@@ -313,17 +335,17 @@ def get_squidpy_metrics(
@app.get("/cell-data/")
async def get_cell_data(
study: ValidStudy,
sample: str = Query(max_length=512),
sample: Annotated[str, Query(max_length=512)],
) -> CellData:
"""Get cell-level location and phenotype data."""
if not sample in query().get_sample_names(study):
raise HTTPException(status_code=404, detail=f'Sample "{sample}" does not exist.')
number_cells = cast(int, query().get_number_cells(study))
def match(c: PhenotypeCount) -> bool:
return c.specimen == sample
count = tuple(filter(match, get_phenotype_counts([], [], study, number_cells).counts))[0]
if count.count > CELL_DATA_CELL_LIMIT:
message = f'Sample "{sample}" has too many cells: {count.count}.'
count = tuple(filter(match, get_phenotype_counts([], [], study, number_cells).counts))[0].count
if count is None or count > CELL_DATA_CELL_LIMIT:
message = f'Sample "{sample}" has too many cells: {count}.'
raise HTTPException(status_code=404, detail=message)
with OnDemandRequester(service='cells') as requester:
payload = requester.get_cells_data(study, sample)
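For orientation, a client-side sketch of the new `collection` query parameter on `study-names` (endpoint behavior as defined above; the base URL is a placeholder assumption):

```python
import requests

BASE_URL = 'http://localhost:8080'  # placeholder; point at a real deployment

# Default listing: untagged studies, plus tagged studies whose collection
# is on the public whitelist.
public_handles = requests.get(f'{BASE_URL}/study-names/').json()

# Restrict to one collection tag; an invalid tag string yields HTTP 404.
tagged_handles = requests.get(
    f'{BASE_URL}/study-names/',
    params={'collection': 'abc-123'},
).json()
```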
2 changes: 1 addition & 1 deletion spatialprofilingtoolbox/db/__init__.py
@@ -1,2 +1,2 @@
"""Database-related SPT functionality."""
__version__ = '0.13.0'
__version__ = '0.23.0'
11 changes: 10 additions & 1 deletion spatialprofilingtoolbox/db/accessors/study.py
@@ -2,8 +2,10 @@

from typing import cast
import re
from spatialprofilingtoolbox.db.simple_method_cache import simple_instance_method_cache

from psycopg2.errors import UndefinedTable

from spatialprofilingtoolbox.db.simple_method_cache import simple_instance_method_cache
from spatialprofilingtoolbox.workflow.common.export_features import ADIFeatureSpecificationUploader
from spatialprofilingtoolbox.db.exchange_data_formats.study import (
StudyContact,
@@ -107,6 +109,13 @@ def get_study_handle(self, study: str) -> StudyHandle:
handles.append(StudyHandle(handle=handle, display_name_detail=display_name_detail))
return handles[0]

def get_collection_whitelist(self) -> tuple[str, ...]:
try:
self.cursor.execute('SELECT collection FROM collection_whitelist ;')
except UndefinedTable:
return ()
return tuple(map(lambda row: row[0], self.cursor.fetchall()))

def _get_publication_summary_text(self, study: str) -> str:
query = '''
SELECT publisher, date_of_publication
14 changes: 13 additions & 1 deletion spatialprofilingtoolbox/db/database_connection.py
@@ -26,6 +26,17 @@
logger = colorized_logger(__name__)


class DatabaseNotFoundError(ValueError):
study: str

def __init__(self, study: str):
self.study = study
super().__init__(self.verbalize())

def verbalize(self) -> str:
return f'Did not find database for study named: "{self.study}"'


class ConnectionProvider:
"""Simple wrapper of a database connection."""
connection: Connection
@@ -80,7 +91,7 @@ def _retrieve_study_database(self, credentials: DBCredentials, study: str) -> st
cursor.execute('SELECT database_name FROM study_lookup WHERE study=%s', (study,))
rows = cursor.fetchall()
if len(rows) == 0:
raise ValueError('Did not find database for study "%s"', study)
raise DatabaseNotFoundError(study)
return str(rows[0][0])

@staticmethod
@@ -255,6 +266,7 @@ class (QueryCursor) newly provides on each invocation.
get_sample_names: Callable
get_available_gnn: Callable
get_study_findings: Callable
is_public_collection: Callable

def __init__(self, query_handler: Type):
self.query_handler = query_handler
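A brief sketch of what the new `DatabaseNotFoundError` enables for callers; the wrapper function and its fallback behavior are illustrative assumptions, not part of this commit:

```python
from spatialprofilingtoolbox.db.database_connection import DatabaseNotFoundError

def database_name_or_none(retrieve, study: str) -> str | None:
    """Illustrative wrapper: return None instead of raising for unknown studies."""
    try:
        return retrieve(study)
    except DatabaseNotFoundError as error:
        # The exception now carries the study name as structured data.
        print(error.verbalize())
        return None
```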
63 changes: 63 additions & 0 deletions spatialprofilingtoolbox/db/publish_promote.py
@@ -0,0 +1,63 @@
"""Publish/promote a dataset collection from private to public."""

from typing import cast

from attr import define

from spatialprofilingtoolbox.db.database_connection import DBCursor
from spatialprofilingtoolbox.db.study_tokens import StudyCollectionNaming
from spatialprofilingtoolbox.standalone_utilities.log_formats import colorized_logger

logger = colorized_logger(__name__)

@define
class PublisherPromoter:
database_config_file: str
collection: str | None = None

def promote(self, collection: str) -> None:
self.collection = collection
self._check_is_collection_nonempty()
self._whitelist_collection()

def demote(self, collection: str) -> None:
self.collection = collection
self._check_is_collection_nonempty()
self._unwhitelist_collection()

def _check_is_collection_nonempty(self) -> None:
def is_in_collection(study: str) -> bool:
_, tag = StudyCollectionNaming.strip_token(study)
return tag == self._get_collection()
file = self.database_config_file
with DBCursor(database_config_file=file, study=None) as cursor:
select = 'SELECT study FROM study_lookup ;'
cursor.execute(select)
members = tuple(filter(is_in_collection, map(lambda row: row[0], cursor.fetchall())))
if len(members) == 0:
message = f'No studies are tagged with collection label "{self._get_collection()}".'
logger.warning(message)

def _whitelist_collection(self) -> None:
file = self.database_config_file
with DBCursor(database_config_file=file, study=None) as cursor:
collection = self._get_collection()
create = 'CREATE TABLE IF NOT EXISTS collection_whitelist ( collection VARCHAR(512) );'
insert = 'INSERT INTO collection_whitelist (collection) VALUES ( %s ) ;'
logger.debug(create)
cursor.execute(create)
logger.debug(insert % f"'{collection}'")
cursor.execute(insert, (collection,))
logger.info(f'Added "{collection}" to public-indicating whitelist.')

def _unwhitelist_collection(self) -> None:
file = self.database_config_file
with DBCursor(database_config_file=file, study=None) as cursor:
collection = self._get_collection()
remove = 'DELETE FROM collection_whitelist WHERE collection=%s ;'
logger.debug(remove % f"'{collection}'")
cursor.execute(remove, (collection,))
logger.info(f'Removed "{collection}" from public-indicating whitelist.')

def _get_collection(self) -> str:
return cast(str, self.collection)
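A usage sketch for `PublisherPromoter`; the config file path is taken from this commit's test fixtures and stands in for a real deployment's configuration:

```python
from spatialprofilingtoolbox.db.publish_promote import PublisherPromoter

# Path from the test fixtures; substitute a real database config file.
promoter = PublisherPromoter('build/db/.spt_db.config.local')

promoter.promote('abc-123')  # add the collection to the public whitelist
promoter.demote('abc-123')   # remove it from the whitelist (unpublish)
```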
5 changes: 5 additions & 0 deletions spatialprofilingtoolbox/db/querying.py
@@ -49,6 +49,11 @@ def get_number_cells(cls, cursor, study: str) -> int:
def get_study_summary(cls, cursor, study: str) -> StudySummary:
return StudyAccess(cursor).get_study_summary(study)

@classmethod
def is_public_collection(cls, cursor, collection: str) -> bool:
whitelist = StudyAccess(cursor).get_collection_whitelist()
return collection in whitelist

@classmethod
def get_available_gnn(cls, cursor, study: str) -> AvailableGNN:
return StudyAccess(cursor).get_available_gnn(study)