From 7ec75c2485cc05449a539133cf4e60f3a867594b Mon Sep 17 00:00:00 2001
From: cortadocodes
Date: Tue, 25 May 2021 15:34:18 +0100
Subject: [PATCH 1/8] IMP: Allow gs:// paths to be used directly with resources

---
 octue/cloud/storage/client.py    | 91 ++++++++++++++++++++------------
 octue/resources/datafile.py      | 65 +++++++++++++++--------
 octue/resources/dataset.py       | 28 ++++++----
 octue/resources/manifest.py      | 32 +++++++----
 tests/resources/test_datafile.py | 53 ++++++++++++-------
 tests/resources/test_dataset.py  |  2 +-
 tests/resources/test_manifest.py |  6 +--
 7 files changed, 179 insertions(+), 98 deletions(-)

diff --git a/octue/cloud/storage/client.py b/octue/cloud/storage/client.py
index d4ba9f087..32df9500b 100644
--- a/octue/cloud/storage/client.py
+++ b/octue/cloud/storage/client.py
@@ -5,6 +5,7 @@
 from google_crc32c import Checksum
 
 from octue.cloud.credentials import GCPCredentialsManager
+from octue.cloud.storage.path import split_bucket_name_from_gs_path
 
 
 logger = logging.getLogger(__name__)
@@ -46,17 +47,20 @@ def create_bucket(self, name, location=None, allow_existing=False, timeout=_DEFA
 
         self.client.create_bucket(bucket_or_name=name, location=location, timeout=timeout)
 
-    def upload_file(self, local_path, bucket_name, path_in_bucket, metadata=None, timeout=_DEFAULT_TIMEOUT):
+    def upload_file(
+        self, local_path, gs_path=None, bucket_name=None, path_in_bucket=None, metadata=None, timeout=_DEFAULT_TIMEOUT
+    ):
         """Upload a local file to a Google Cloud bucket at gs://<bucket_name>/<path_in_bucket>.
 
         :param str local_path:
-        :param str bucket_name:
-        :param str path_in_bucket:
+        :param str|None gs_path:
+        :param str|None bucket_name:
+        :param str|None path_in_bucket:
         :param dict metadata:
         :param float timeout:
         :return None:
         """
-        blob = self._blob(bucket_name, path_in_bucket)
+        blob = self._blob(gs_path, bucket_name, path_in_bucket)
 
         with open(local_path) as f:
             blob.crc32c = self._compute_crc32c_checksum(f.read())
@@ -65,69 +69,81 @@ def upload_file(self, local_path, bucket_name, path_in_bucket, metadata=None, ti
         self._update_metadata(blob, metadata)
         logger.info("Uploaded %r to Google Cloud at %r.", local_path, blob.public_url)
 
-    def upload_from_string(self, string, bucket_name, path_in_bucket, metadata=None, timeout=_DEFAULT_TIMEOUT):
+    def upload_from_string(
+        self, string, gs_path=None, bucket_name=None, path_in_bucket=None, metadata=None, timeout=_DEFAULT_TIMEOUT
+    ):
         """Upload serialised data in string form to a file in a Google Cloud bucket at
         gs://<bucket_name>/<path_in_bucket>.
 
         :param str string:
-        :param str bucket_name:
-        :param str path_in_bucket:
+        :param str|None gs_path:
+        :param str|None bucket_name:
+        :param str|None path_in_bucket:
         :param dict metadata:
         :param float timeout:
         :return None:
         """
-        blob = self._blob(bucket_name, path_in_bucket)
+        blob = self._blob(gs_path, bucket_name, path_in_bucket)
         blob.crc32c = self._compute_crc32c_checksum(string)
         blob.upload_from_string(data=string, timeout=timeout)
         self._update_metadata(blob, metadata)
         logger.info("Uploaded data to Google Cloud at %r.", blob.public_url)
 
-    def update_metadata(self, bucket_name, path_in_bucket, metadata):
+    def update_metadata(self, metadata, gs_path=None, bucket_name=None, path_in_bucket=None):
         """Update the metadata for the given cloud file.
 
-        :param str bucket_name:
-        :param str path_in_bucket:
         :param dict metadata:
+        :param str|None gs_path:
+        :param str|None bucket_name:
+        :param str|None path_in_bucket:
         :return None:
         """
-        blob = self._blob(bucket_name, path_in_bucket)
+        blob = self._blob(gs_path, bucket_name, path_in_bucket)
         self._update_metadata(blob, metadata)
 
-    def download_to_file(self, bucket_name, path_in_bucket, local_path, timeout=_DEFAULT_TIMEOUT):
+    def download_to_file(
+        self, local_path, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT
+    ):
         """Download a file to a file from a Google Cloud bucket at gs://<bucket_name>/<path_in_bucket>.
 
-        :param str bucket_name:
-        :param str path_in_bucket:
         :param str local_path:
+        :param str|None gs_path:
+        :param str|None bucket_name:
+        :param str|None path_in_bucket:
         :param float timeout:
         :return None:
         """
-        blob = self._blob(bucket_name, path_in_bucket)
+        blob = self._blob(gs_path, bucket_name, path_in_bucket)
         blob.download_to_filename(local_path, timeout=timeout)
         logger.info("Downloaded %r from Google Cloud to %r.", blob.public_url, local_path)
 
-    def download_as_string(self, bucket_name, path_in_bucket, timeout=_DEFAULT_TIMEOUT):
+    def download_as_string(self, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT):
         """Download a file to a string from a Google Cloud bucket at gs://<bucket_name>/<path_in_bucket>.
 
-        :param str bucket_name:
-        :param str path_in_bucket:
+        :param str|None gs_path:
+        :param str|None bucket_name:
+        :param str|None path_in_bucket:
         :param float timeout:
         :return str:
         """
-        blob = self._blob(bucket_name, path_in_bucket)
+        blob = self._blob(gs_path, bucket_name, path_in_bucket)
         data = blob.download_as_bytes(timeout=timeout)
         logger.info("Downloaded %r from Google Cloud to as string.", blob.public_url)
         return data.decode()
 
-    def get_metadata(self, bucket_name, path_in_bucket, timeout=_DEFAULT_TIMEOUT):
+    def get_metadata(self, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT):
         """Get the metadata of the given file in the given bucket.
 
-        :param str bucket_name:
-        :param str path_in_bucket:
+        :param str|None gs_path:
+        :param str|None bucket_name:
+        :param str|None path_in_bucket:
         :param float timeout:
         :return dict:
         """
+        if gs_path:
+            bucket_name, path_in_bucket = split_bucket_name_from_gs_path(gs_path)
+
         bucket = self.client.get_bucket(bucket_or_name=bucket_name)
         blob = bucket.get_blob(blob_name=self._strip_leading_slash(path_in_bucket), timeout=timeout)
 
@@ -147,27 +163,32 @@ def get_metadata(self, bucket_name, path_in_bucket, timeout=_DEFAULT_TIMEOUT):
             "path_in_bucket": path_in_bucket,
         }
 
-    def delete(self, bucket_name, path_in_bucket, timeout=_DEFAULT_TIMEOUT):
+    def delete(self, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT):
         """Delete the given file from the given bucket.
 
-        :param str bucket_name:
-        :param str path_in_bucket:
+        :param str|None gs_path:
+        :param str|None bucket_name:
+        :param str|None path_in_bucket:
         :param float timeout:
         :return None:
         """
-        blob = self._blob(bucket_name, path_in_bucket)
+        blob = self._blob(gs_path, bucket_name, path_in_bucket)
         blob.delete(timeout=timeout)
         logger.info("Deleted %r from Google Cloud.", blob.public_url)
 
-    def scandir(self, bucket_name, directory_path, filter=None, timeout=_DEFAULT_TIMEOUT):
+    def scandir(self, gs_path=None, bucket_name=None, directory_path=None, filter=None, timeout=_DEFAULT_TIMEOUT):
         """Yield the blobs belonging to the given "directory" in the given bucket.
 
- :param str bucket_name: - :param str directory_path: + :param str|None gs_path: + :param str|None bucket_name: + :param str|None directory_path: :param callable filter: :param float timeout: :yield google.cloud.storage.blob.Blob: """ + if gs_path: + bucket_name, path_in_bucket = split_bucket_name_from_gs_path(gs_path) + bucket = self.client.get_bucket(bucket_or_name=bucket_name) blobs = bucket.list_blobs(timeout=timeout) directory_path = self._strip_leading_slash(directory_path) @@ -185,13 +206,17 @@ def _strip_leading_slash(self, path): """ return path.lstrip("/") - def _blob(self, bucket_name, path_in_bucket): + def _blob(self, gs_path=None, bucket_name=None, path_in_bucket=None): """Instantiate a blob for the given bucket at the given path. Note that this is not synced up with Google Cloud. - :param str bucket_name: - :param str path_in_bucket: + :param str|None gs_path: + :param str|None bucket_name: + :param str|None path_in_bucket: :return google.cloud.storage.blob.Blob: """ + if gs_path: + bucket_name, path_in_bucket = split_bucket_name_from_gs_path(gs_path) + bucket = self.client.get_bucket(bucket_or_name=bucket_name) return bucket.blob(blob_name=self._strip_leading_slash(path_in_bucket)) diff --git a/octue/resources/datafile.py b/octue/resources/datafile.py index 7d438a60f..2c636280f 100644 --- a/octue/resources/datafile.py +++ b/octue/resources/datafile.py @@ -160,8 +160,9 @@ def deserialise(cls, serialised_datafile, path_from=None): def from_cloud( cls, project_name, - bucket_name, - datafile_path, + gs_path=None, + bucket_name=None, + datafile_path=None, allow_overwrite=False, mode="r", update_cloud_metadata=True, @@ -175,8 +176,9 @@ def from_cloud( Note that a value provided for an attribute in kwargs will override any existing value for the attribute. 
:param str project_name: - :param str bucket_name: - :param str datafile_path: path to file represented by datafile + :param str|None gs_path: + :param str|None bucket_name: + :param str|None datafile_path: path to file represented by datafile :param bool allow_overwrite: if `True`, allow attributes of the datafile to be overwritten by values given in kwargs :param str mode: if using as a context manager, open the datafile for reading/editing in this mode (the mode @@ -185,8 +187,11 @@ def from_cloud( the datafile when the context is exited :return Datafile: """ - datafile = cls(path=storage.path.generate_gs_path(bucket_name, datafile_path)) - datafile.get_cloud_metadata(project_name, bucket_name, datafile_path) + if not gs_path: + gs_path = storage.path.generate_gs_path(bucket_name, datafile_path) + + datafile = cls(path=gs_path) + datafile.get_cloud_metadata(project_name, gs_path=gs_path) custom_metadata = datafile._cloud_metadata.get("custom_metadata", {}) if not allow_overwrite: @@ -198,7 +203,6 @@ def from_cloud( timestamp = datetime.datetime.strptime(timestamp, "%Y-%m-%d %H:%M:%S.%f%z") datafile._set_id(kwargs.pop("id", custom_metadata.get(f"{OCTUE_METADATA_NAMESPACE}__id", ID_DEFAULT))) - datafile.path = storage.path.generate_gs_path(bucket_name, datafile_path) datafile.timestamp = timestamp datafile.immutable_hash_value = datafile._cloud_metadata.get("crc32c", EMPTY_STRING_HASH_VALUE) datafile.cluster = kwargs.pop( @@ -211,23 +215,27 @@ def from_cloud( datafile._open_attributes = {"mode": mode, "update_cloud_metadata": update_cloud_metadata, **kwargs} return datafile - def to_cloud(self, project_name=None, bucket_name=None, path_in_bucket=None, update_cloud_metadata=True): + def to_cloud( + self, project_name=None, gs_path=None, bucket_name=None, path_in_bucket=None, update_cloud_metadata=True + ): """Upload a datafile to Google Cloud Storage. :param str|None project_name: + :param str|None gs_path: :param str|None bucket_name: :param str|None path_in_bucket: :param bool update_cloud_metadata: :return str: gs:// path for datafile """ - project_name, bucket_name, path_in_bucket = self._get_cloud_location(project_name, bucket_name, path_in_bucket) - self.get_cloud_metadata(project_name, bucket_name, path_in_bucket) + project_name, bucket_name, path_in_bucket = self._get_cloud_location( + project_name, gs_path, bucket_name, path_in_bucket + ) - storage_client = GoogleCloudStorageClient(project_name=project_name) + self.get_cloud_metadata(project_name, bucket_name=bucket_name, path_in_bucket=path_in_bucket) # If the datafile's file has been changed locally, overwrite its cloud copy. 
if self._cloud_metadata.get("crc32c") != self.hash_value: - storage_client.upload_file( + GoogleCloudStorageClient(project_name=project_name).upload_file( local_path=self.get_local_path(), bucket_name=bucket_name, path_in_bucket=path_in_bucket, @@ -239,20 +247,26 @@ def to_cloud(self, project_name=None, bucket_name=None, path_in_bucket=None, upd local_metadata = self.metadata() if self._cloud_metadata.get("custom_metadata") != local_metadata: - self.update_cloud_metadata(project_name, bucket_name, path_in_bucket) + self.update_cloud_metadata(project_name, bucket_name=bucket_name, path_in_bucket=path_in_bucket) - return storage.path.generate_gs_path(bucket_name, path_in_bucket) + return gs_path or storage.path.generate_gs_path(bucket_name, path_in_bucket) - def get_cloud_metadata(self, project_name=None, bucket_name=None, path_in_bucket=None): + def get_cloud_metadata(self, project_name=None, gs_path=None, bucket_name=None, path_in_bucket=None): """Get the cloud metadata for the datafile, casting the types of the cluster and sequence fields to integer. :param str|None project_name: + :param str|None gs_path: :param str|None bucket_name: :param str|None path_in_bucket: :return dict: """ - project_name, bucket_name, path_in_bucket = self._get_cloud_location(project_name, bucket_name, path_in_bucket) - cloud_metadata = GoogleCloudStorageClient(project_name).get_metadata(bucket_name, path_in_bucket) + project_name, bucket_name, path_in_bucket = self._get_cloud_location( + project_name, gs_path, bucket_name, path_in_bucket + ) + + cloud_metadata = GoogleCloudStorageClient(project_name).get_metadata( + bucket_name=bucket_name, path_in_bucket=path_in_bucket + ) if cloud_metadata is None: return None @@ -271,20 +285,23 @@ def get_cloud_metadata(self, project_name=None, bucket_name=None, path_in_bucket self._cloud_metadata = cloud_metadata - def update_cloud_metadata(self, project_name=None, bucket_name=None, path_in_bucket=None): + def update_cloud_metadata(self, project_name=None, gs_path=None, bucket_name=None, path_in_bucket=None): """Update the cloud metadata for the datafile. 
:param str|None project_name: + :param str|None gs_path: :param str|None bucket_name: :param str|None path_in_bucket: :return None: """ - project_name, bucket_name, path_in_bucket = self._get_cloud_location(project_name, bucket_name, path_in_bucket) + project_name, bucket_name, path_in_bucket = self._get_cloud_location( + project_name, gs_path, bucket_name, path_in_bucket + ) GoogleCloudStorageClient(project_name=project_name).update_metadata( + metadata=self.metadata(), bucket_name=bucket_name, path_in_bucket=path_in_bucket, - metadata=self.metadata(), ) self._store_cloud_location(project_name, bucket_name, path_in_bucket) @@ -372,7 +389,7 @@ def get_local_path(self): temporary_local_path = tempfile.NamedTemporaryFile(delete=False).name GoogleCloudStorageClient(project_name=self._cloud_metadata["project_name"]).download_to_file( - *storage.path.split_bucket_name_from_gs_path(self.absolute_path), local_path=temporary_local_path + local_path=temporary_local_path, gs_path=self.absolute_path ) TEMPORARY_LOCAL_FILE_CACHE[self.absolute_path] = temporary_local_path @@ -407,16 +424,20 @@ def _calculate_hash(self): return super()._calculate_hash(hash) - def _get_cloud_location(self, project_name=None, bucket_name=None, path_in_bucket=None): + def _get_cloud_location(self, project_name=None, gs_path=None, bucket_name=None, path_in_bucket=None): """Get the cloud location details for the bucket, allowing the keyword arguments to override any stored values. :param str|None project_name: + :param str|None gs_path: :param str|None bucket_name: :param str|None path_in_bucket: :raise octue.exceptions.CloudLocationNotSpecified: if an exact cloud location isn't provided and isn't available implicitly (i.e. the Datafile wasn't loaded from the cloud previously) :return (str, str, str): """ + if gs_path: + bucket_name, path_in_bucket = storage.path.split_bucket_name_from_gs_path(gs_path) + try: project_name = project_name or self._cloud_metadata["project_name"] bucket_name = bucket_name or self._cloud_metadata["bucket_name"] diff --git a/octue/resources/dataset.py b/octue/resources/dataset.py index 406a4b87d..fc5d5f358 100644 --- a/octue/resources/dataset.py +++ b/octue/resources/dataset.py @@ -55,18 +55,20 @@ def __len__(self): return len(self.files) @classmethod - def from_cloud(cls, project_name, bucket_name, path_to_dataset_directory): + def from_cloud(cls, project_name, gs_path=None, bucket_name=None, path_to_dataset_directory=None): """Instantiate a Dataset from Google Cloud storage. 
:param str project_name: - :param str bucket_name: - :param str path_to_dataset_directory: path to dataset directory (directory containing dataset's files) + :param str|None gs_path: + :param str|None bucket_name: + :param str|None path_to_dataset_directory: path to dataset directory (directory containing dataset's files) :return Dataset: """ - storage_client = GoogleCloudStorageClient(project_name=project_name) + if gs_path: + bucket_name, path_to_dataset_directory = storage.path.split_bucket_name_from_gs_path(gs_path) serialised_dataset = json.loads( - storage_client.download_as_string( + GoogleCloudStorageClient(project_name=project_name).download_as_string( bucket_name=bucket_name, path_in_bucket=storage.path.join(path_to_dataset_directory, definitions.DATASET_FILENAME), ) @@ -89,19 +91,25 @@ def from_cloud(cls, project_name, bucket_name, path_to_dataset_directory): files=datafiles, ) - def to_cloud(self, project_name, bucket_name, output_directory): + def to_cloud(self, project_name, gs_path=None, bucket_name=None, output_directory=None): """Upload a dataset to a cloud location. :param str project_name: - :param str bucket_name: - :param str output_directory: + :param str|None gs_path: + :param str|None bucket_name: + :param str|None output_directory: :return str: gs:// path for dataset """ + if gs_path: + bucket_name, output_directory = storage.path.split_bucket_name_from_gs_path(gs_path) + files = [] for datafile in self.files: datafile_path = datafile.to_cloud( - project_name, bucket_name, path_in_bucket=storage.path.join(output_directory, self.name, datafile.name) + project_name, + bucket_name=bucket_name, + path_in_bucket=storage.path.join(output_directory, self.name, datafile.name), ) files.append(datafile_path) @@ -116,7 +124,7 @@ def to_cloud(self, project_name, bucket_name, output_directory): path_in_bucket=storage.path.join(output_directory, self.name, definitions.DATASET_FILENAME), ) - return storage.path.generate_gs_path(bucket_name, output_directory, self.name) + return gs_path or storage.path.generate_gs_path(bucket_name, output_directory, self.name) @property def name(self): diff --git a/octue/resources/manifest.py b/octue/resources/manifest.py index d5224fa8e..d389cb1fd 100644 --- a/octue/resources/manifest.py +++ b/octue/resources/manifest.py @@ -45,18 +45,22 @@ def __init__(self, id=None, logger=None, path=None, datasets=None, keys=None, ** vars(self).update(**kwargs) @classmethod - def from_cloud(cls, project_name, bucket_name, path_to_manifest_file): + def from_cloud(cls, project_name, gs_path=None, bucket_name=None, path_to_manifest_file=None): """Instantiate a Manifest from Google Cloud storage. 
:param str project_name: - :param str bucket_name: - :param str path_to_manifest_file: + :param str|None gs_path: + :param str|None bucket_name: + :param str|None path_to_manifest_file: :return Dataset: """ - storage_client = GoogleCloudStorageClient(project_name=project_name) + if gs_path: + bucket_name, path_to_manifest_file = storage.path.split_bucket_name_from_gs_path(gs_path) serialised_manifest = json.loads( - storage_client.download_as_string(bucket_name=bucket_name, path_in_bucket=path_to_manifest_file) + GoogleCloudStorageClient(project_name=project_name).download_as_string( + bucket_name=bucket_name, path_in_bucket=path_to_manifest_file + ) ) datasets = [] @@ -77,23 +81,31 @@ def from_cloud(cls, project_name, bucket_name, path_to_manifest_file): keys=serialised_manifest["keys"], ) - def to_cloud(self, project_name, bucket_name, path_to_manifest_file, store_datasets=True): + def to_cloud(self, project_name, gs_path=None, bucket_name=None, path_to_manifest_file=None, store_datasets=True): """Upload a manifest to a cloud location, optionally uploading its datasets into the same directory. :param str project_name: - :param str bucket_name: - :param str path_to_manifest_file: + :param str|None gs_path: + :param str|None bucket_name: + :param str|None path_to_manifest_file: :param bool store_datasets: if True, upload datasets to same directory as manifest file :return str: gs:// path for manifest file """ + if gs_path: + bucket_name, path_to_manifest_file = storage.path.split_bucket_name_from_gs_path(gs_path) + datasets = [] output_directory = storage.path.dirname(path_to_manifest_file) for dataset in self.datasets: if store_datasets: - dataset_path = dataset.to_cloud(project_name, bucket_name, output_directory=output_directory) + dataset_path = dataset.to_cloud( + project_name, bucket_name=bucket_name, output_directory=output_directory + ) + datasets.append(dataset_path) + else: datasets.append(dataset.absolute_path) @@ -107,7 +119,7 @@ def to_cloud(self, project_name, bucket_name, path_to_manifest_file, store_datas path_in_bucket=path_to_manifest_file, ) - return storage.path.generate_gs_path(bucket_name, path_to_manifest_file) + return gs_path or storage.path.generate_gs_path(bucket_name, path_to_manifest_file) @property def all_datasets_are_in_cloud(self): diff --git a/tests/resources/test_datafile.py b/tests/resources/test_datafile.py index 7a62a88c2..fa3e4b0f4 100644 --- a/tests/resources/test_datafile.py +++ b/tests/resources/test_datafile.py @@ -203,10 +203,10 @@ def test_from_cloud_with_datafile(self): sequence=1, tags={"blah:shah:nah", "blib", "glib"}, ) + gs_path = f"gs://{TEST_BUCKET_NAME}/{path_in_bucket}" + downloaded_datafile = Datafile.from_cloud(project_name, gs_path=gs_path) - downloaded_datafile = Datafile.from_cloud(project_name, bucket_name, path_in_bucket) - - self.assertEqual(downloaded_datafile.path, f"gs://{TEST_BUCKET_NAME}/{path_in_bucket}") + self.assertEqual(downloaded_datafile.path, gs_path) self.assertEqual(downloaded_datafile.id, datafile.id) self.assertEqual(downloaded_datafile.timestamp, datafile.timestamp) self.assertEqual(downloaded_datafile.hash_value, datafile.hash_value) @@ -257,9 +257,11 @@ def test_to_cloud_updates_cloud_metadata(self): datafile, project_name, bucket_name, path_in_bucket, _ = self.create_datafile_in_cloud(cluster=0) datafile.cluster = 3 - datafile.to_cloud(project_name, bucket_name, path_in_bucket) + datafile.to_cloud(project_name, bucket_name=bucket_name, path_in_bucket=path_in_bucket) - 
self.assertEqual(Datafile.from_cloud(project_name, bucket_name, path_in_bucket).cluster, 3) + self.assertEqual( + Datafile.from_cloud(project_name, bucket_name=bucket_name, datafile_path=path_in_bucket).cluster, 3 + ) def test_to_cloud_does_not_update_cloud_metadata_if_update_cloud_metadata_is_false(self): """Test that calling Datafile.to_cloud with `update_cloud_metadata=False` doesn't update the cloud metadata.""" @@ -267,16 +269,20 @@ def test_to_cloud_does_not_update_cloud_metadata_if_update_cloud_metadata_is_fal datafile.cluster = 3 with patch("octue.resources.datafile.Datafile.update_cloud_metadata") as mock: - datafile.to_cloud(project_name, bucket_name, path_in_bucket, update_cloud_metadata=False) + datafile.to_cloud( + project_name, bucket_name=bucket_name, path_in_bucket=path_in_bucket, update_cloud_metadata=False + ) self.assertFalse(mock.called) - self.assertEqual(Datafile.from_cloud(project_name, bucket_name, path_in_bucket).cluster, 0) + self.assertEqual( + Datafile.from_cloud(project_name, bucket_name=bucket_name, datafile_path=path_in_bucket).cluster, 0 + ) def test_to_cloud_does_not_update_metadata_if_no_metadata_change_has_been_made(self): """Test that Datafile.to_cloud does not try to update cloud metadata if no metadata change has been made.""" _, project_name, bucket_name, path_in_bucket, _ = self.create_datafile_in_cloud(cluster=0) - datafile = Datafile.from_cloud(project_name, bucket_name, path_in_bucket) + datafile = Datafile.from_cloud(project_name, bucket_name=bucket_name, datafile_path=path_in_bucket) with patch("octue.resources.datafile.Datafile.update_cloud_metadata") as mock: datafile.to_cloud() @@ -296,7 +302,7 @@ def test_to_cloud_works_with_implicit_cloud_location_if_cloud_location_previousl provided. """ _, project_name, bucket_name, path_in_bucket, _ = self.create_datafile_in_cloud() - datafile = Datafile.from_cloud(project_name, bucket_name, path_in_bucket) + datafile = Datafile.from_cloud(project_name, bucket_name=bucket_name, datafile_path=path_in_bucket) datafile.to_cloud() def test_to_cloud_does_not_try_to_update_file_if_no_change_has_been_made_locally(self): @@ -312,9 +318,11 @@ def test_update_cloud_metadata(self): _, project_name, bucket_name, path_in_bucket, _ = self.create_datafile_in_cloud() new_datafile = Datafile(path="glib.txt", cluster=32) - new_datafile.update_cloud_metadata(project_name, bucket_name, path_in_bucket) + new_datafile.update_cloud_metadata(project_name, bucket_name=bucket_name, path_in_bucket=path_in_bucket) - self.assertEqual(Datafile.from_cloud(project_name, bucket_name, path_in_bucket).cluster, 32) + self.assertEqual( + Datafile.from_cloud(project_name, bucket_name=bucket_name, datafile_path=path_in_bucket).cluster, 32 + ) def test_update_cloud_metadata_works_with_implicit_cloud_location_if_cloud_location_previously_provided(self): """Test that datafile.update_metadata works with an implicit cloud location if the cloud location has been @@ -322,11 +330,13 @@ def test_update_cloud_metadata_works_with_implicit_cloud_location_if_cloud_locat """ _, project_name, bucket_name, path_in_bucket, _ = self.create_datafile_in_cloud() - datafile = Datafile.from_cloud(project_name, bucket_name, path_in_bucket) + datafile = Datafile.from_cloud(project_name, bucket_name=bucket_name, datafile_path=path_in_bucket) datafile.cluster = 32 datafile.update_cloud_metadata() - self.assertEqual(Datafile.from_cloud(project_name, bucket_name, path_in_bucket).cluster, 32) + self.assertEqual( + Datafile.from_cloud(project_name, bucket_name=bucket_name, 
datafile_path=path_in_bucket).cluster, 32 + ) def test_update_cloud_metadata_raises_error_if_no_cloud_location_provided_and_datafile_not_from_cloud(self): """Test that trying to update a cloud datafile's metadata with no cloud location provided when the datafile was @@ -340,7 +350,7 @@ def test_update_cloud_metadata_raises_error_if_no_cloud_location_provided_and_da def test_get_local_path(self): """Test that a file in the cloud can be temporarily downloaded and its local path returned.""" _, project_name, bucket_name, path_in_bucket, contents = self.create_datafile_in_cloud() - datafile = Datafile.from_cloud(project_name, bucket_name, path_in_bucket) + datafile = Datafile.from_cloud(project_name, bucket_name=bucket_name, datafile_path=path_in_bucket) with open(datafile.get_local_path()) as f: self.assertEqual(f.read(), contents) @@ -348,7 +358,7 @@ def test_get_local_path(self): def test_get_local_path_with_cached_file_avoids_downloading_again(self): """Test that attempting to download a cached file avoids downloading it again.""" _, project_name, bucket_name, path_in_bucket, _ = self.create_datafile_in_cloud() - datafile = Datafile.from_cloud(project_name, bucket_name, path_in_bucket) + datafile = Datafile.from_cloud(project_name, bucket_name=bucket_name, datafile_path=path_in_bucket) # Download for first time. datafile.get_local_path() @@ -388,7 +398,7 @@ def test_open_with_writing_local_file(self): def test_open_with_reading_cloud_file(self): """Test that a cloud datafile can be opened for reading.""" _, project_name, bucket_name, path_in_bucket, contents = self.create_datafile_in_cloud() - datafile = Datafile.from_cloud(project_name, bucket_name, path_in_bucket) + datafile = Datafile.from_cloud(project_name, bucket_name=bucket_name, datafile_path=path_in_bucket) with datafile.open() as f: self.assertEqual(f.read(), contents) @@ -396,7 +406,7 @@ def test_open_with_reading_cloud_file(self): def test_open_with_writing_to_cloud_file(self): """Test that a cloud datafile can be opened for writing and that both the remote and local copies are updated.""" _, project_name, bucket_name, path_in_bucket, original_contents = self.create_datafile_in_cloud() - datafile = Datafile.from_cloud(project_name, bucket_name, path_in_bucket) + datafile = Datafile.from_cloud(project_name, bucket_name=bucket_name, datafile_path=path_in_bucket) new_file_contents = "nanana" @@ -499,12 +509,17 @@ def test_from_datafile_as_context_manager(self): new_contents = "Here is the new content." self.assertNotEqual(original_content, new_contents) - with Datafile.from_cloud(project_name, bucket_name, path_in_bucket, mode="w") as (datafile, f): + with Datafile.from_cloud(project_name, bucket_name=bucket_name, datafile_path=path_in_bucket, mode="w") as ( + datafile, + f, + ): datafile.add_tags("blue") f.write(new_contents) # Check that the cloud metadata has been updated. - re_downloaded_datafile = Datafile.from_cloud(project_name, bucket_name, path_in_bucket) + re_downloaded_datafile = Datafile.from_cloud( + project_name, bucket_name=bucket_name, datafile_path=path_in_bucket + ) self.assertTrue("blue" in re_downloaded_datafile.tags) # The file cache must be cleared so the modified cloud file is downloaded. 
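
For orientation, the two calling conventions exercised by the tests above look roughly like this in user code. This is a minimal sketch; the project name, bucket name and object path are placeholder assumptions, not values from the patch:

    from octue.resources import Datafile

    # Hypothetical identifiers; any GCP project and existing cloud object work the same way.
    datafile = Datafile.from_cloud("my-project", gs_path="gs://my-bucket/inputs/file_0.txt")

    # Equivalent call using a separate bucket name and path within the bucket.
    datafile = Datafile.from_cloud(
        "my-project", bucket_name="my-bucket", datafile_path="inputs/file_0.txt"
    )

    # Read the file's contents via a temporary local copy.
    with datafile.open() as f:
        print(f.read())
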
diff --git a/tests/resources/test_dataset.py b/tests/resources/test_dataset.py index 96660f1df..43de56ad9 100644 --- a/tests/resources/test_dataset.py +++ b/tests/resources/test_dataset.py @@ -363,7 +363,7 @@ def test_to_cloud(self): } ) - dataset.to_cloud(project_name, TEST_BUCKET_NAME, output_directory) + dataset.to_cloud(project_name, bucket_name=TEST_BUCKET_NAME, output_directory=output_directory) storage_client = GoogleCloudStorageClient(project_name) diff --git a/tests/resources/test_manifest.py b/tests/resources/test_manifest.py index 3ae3c6acd..0ead268fc 100644 --- a/tests/resources/test_manifest.py +++ b/tests/resources/test_manifest.py @@ -79,7 +79,7 @@ def test_to_cloud(self): manifest.to_cloud( self.TEST_PROJECT_NAME, - TEST_BUCKET_NAME, + bucket_name=TEST_BUCKET_NAME, path_to_manifest_file=storage.path.join("blah", "manifest.json"), ) @@ -118,7 +118,7 @@ def test_to_cloud_without_storing_datasets(self): manifest.to_cloud( self.TEST_PROJECT_NAME, - TEST_BUCKET_NAME, + bucket_name=TEST_BUCKET_NAME, path_to_manifest_file=storage.path.join("my-manifests", "manifest.json"), store_datasets=False, ) @@ -156,7 +156,7 @@ def test_from_cloud(self): manifest = Manifest(datasets=[dataset], keys={"my-dataset": 0}) manifest.to_cloud( self.TEST_PROJECT_NAME, - TEST_BUCKET_NAME, + bucket_name=TEST_BUCKET_NAME, path_to_manifest_file=storage.path.join("my-directory", "manifest.json"), ) From a12f69fbee6aecccccfbe663f1397ba60d76a8f2 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Tue, 25 May 2021 15:41:25 +0100 Subject: [PATCH 2/8] DOC: Add info on gs_path to docstrings --- octue/cloud/storage/client.py | 24 ++++++++++++++++-------- octue/resources/datafile.py | 10 ++++++++-- octue/resources/dataset.py | 6 ++++-- octue/resources/manifest.py | 6 ++++-- 4 files changed, 32 insertions(+), 14 deletions(-) diff --git a/octue/cloud/storage/client.py b/octue/cloud/storage/client.py index 32df9500b..26f04ce9a 100644 --- a/octue/cloud/storage/client.py +++ b/octue/cloud/storage/client.py @@ -50,7 +50,8 @@ def create_bucket(self, name, location=None, allow_existing=False, timeout=_DEFA def upload_file( self, local_path, gs_path=None, bucket_name=None, path_in_bucket=None, metadata=None, timeout=_DEFAULT_TIMEOUT ): - """Upload a local file to a Google Cloud bucket at gs:///. + """Upload a local file to a Google Cloud bucket at gs:///. Either (`bucket_name` + and `path_in_bucket`) or `gs_path` must be provided. :param str local_path: :param str|None gs_path: @@ -73,7 +74,7 @@ def upload_from_string( self, string, gs_path=None, bucket_name=None, path_in_bucket=None, metadata=None, timeout=_DEFAULT_TIMEOUT ): """Upload serialised data in string form to a file in a Google Cloud bucket at - gs:///. + gs:///. Either (`bucket_name` and `path_in_bucket`) or `gs_path` must be provided. :param str string: :param str|None gs_path: @@ -91,7 +92,8 @@ def upload_from_string( logger.info("Uploaded data to Google Cloud at %r.", blob.public_url) def update_metadata(self, metadata, gs_path=None, bucket_name=None, path_in_bucket=None): - """Update the metadata for the given cloud file. + """Update the metadata for the given cloud file. Either (`bucket_name` and `path_in_bucket`) or `gs_path` must + be provided. 
:param dict metadata: :param str|None gs_path: @@ -105,7 +107,8 @@ def update_metadata(self, metadata, gs_path=None, bucket_name=None, path_in_buck def download_to_file( self, local_path, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT ): - """Download a file to a file from a Google Cloud bucket at gs:///. + """Download a file to a file from a Google Cloud bucket at gs:///. Either + (`bucket_name` and `path_in_bucket`) or `gs_path` must be provided. :param str local_path: :param str|None gs_path: @@ -119,7 +122,8 @@ def download_to_file( logger.info("Downloaded %r from Google Cloud to %r.", blob.public_url, local_path) def download_as_string(self, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT): - """Download a file to a string from a Google Cloud bucket at gs:///. + """Download a file to a string from a Google Cloud bucket at gs:///. Either + (`bucket_name` and `path_in_bucket`) or `gs_path` must be provided. :param str|None gs_path: :param str|None bucket_name: @@ -133,7 +137,8 @@ def download_as_string(self, gs_path=None, bucket_name=None, path_in_bucket=None return data.decode() def get_metadata(self, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT): - """Get the metadata of the given file in the given bucket. + """Get the metadata of the given file in the given bucket. Either (`bucket_name` and `path_in_bucket`) or + `gs_path` must be provided. :param str|None gs_path: :param str|None bucket_name: @@ -164,7 +169,8 @@ def get_metadata(self, gs_path=None, bucket_name=None, path_in_bucket=None, time } def delete(self, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT): - """Delete the given file from the given bucket. + """Delete the given file from the given bucket. Either (`bucket_name` and `path_in_bucket`) or `gs_path` must + be provided. :param str|None gs_path: :param str|None bucket_name: @@ -177,7 +183,8 @@ def delete(self, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_D logger.info("Deleted %r from Google Cloud.", blob.public_url) def scandir(self, gs_path=None, bucket_name=None, directory_path=None, filter=None, timeout=_DEFAULT_TIMEOUT): - """Yield the blobs belonging to the given "directory" in the given bucket. + """Yield the blobs belonging to the given "directory" in the given bucket. Either (`bucket_name` and + `path_in_bucket`) or `gs_path` must be provided. :param str|None gs_path: :param str|None bucket_name: @@ -208,6 +215,7 @@ def _strip_leading_slash(self, path): def _blob(self, gs_path=None, bucket_name=None, path_in_bucket=None): """Instantiate a blob for the given bucket at the given path. Note that this is not synced up with Google Cloud. + Either (`bucket_name` and `path_in_bucket`) or `gs_path` must be provided. :param str|None gs_path: :param str|None bucket_name: diff --git a/octue/resources/datafile.py b/octue/resources/datafile.py index 2c636280f..adee22a01 100644 --- a/octue/resources/datafile.py +++ b/octue/resources/datafile.py @@ -175,6 +175,8 @@ def from_cloud( Note that a value provided for an attribute in kwargs will override any existing value for the attribute. + Either (`bucket_name` and `datafile_path`) or `gs_path` must be provided. 
+ :param str project_name: :param str|None gs_path: :param str|None bucket_name: @@ -218,7 +220,8 @@ def from_cloud( def to_cloud( self, project_name=None, gs_path=None, bucket_name=None, path_in_bucket=None, update_cloud_metadata=True ): - """Upload a datafile to Google Cloud Storage. + """Upload a datafile to Google Cloud Storage. Either (`bucket_name` and `path_in_bucket`) or `gs_path` must be + provided. :param str|None project_name: :param str|None gs_path: @@ -253,6 +256,7 @@ def to_cloud( def get_cloud_metadata(self, project_name=None, gs_path=None, bucket_name=None, path_in_bucket=None): """Get the cloud metadata for the datafile, casting the types of the cluster and sequence fields to integer. + Either (`bucket_name` and `path_in_bucket`) or `gs_path` must be provided. :param str|None project_name: :param str|None gs_path: @@ -286,7 +290,8 @@ def get_cloud_metadata(self, project_name=None, gs_path=None, bucket_name=None, self._cloud_metadata = cloud_metadata def update_cloud_metadata(self, project_name=None, gs_path=None, bucket_name=None, path_in_bucket=None): - """Update the cloud metadata for the datafile. + """Update the cloud metadata for the datafile. Either (`bucket_name` and `path_in_bucket`) or `gs_path` must be + provided. :param str|None project_name: :param str|None gs_path: @@ -426,6 +431,7 @@ def _calculate_hash(self): def _get_cloud_location(self, project_name=None, gs_path=None, bucket_name=None, path_in_bucket=None): """Get the cloud location details for the bucket, allowing the keyword arguments to override any stored values. + Either (`bucket_name` and `path_in_bucket`) or `gs_path` must be provided. :param str|None project_name: :param str|None gs_path: diff --git a/octue/resources/dataset.py b/octue/resources/dataset.py index fc5d5f358..aa5e140bb 100644 --- a/octue/resources/dataset.py +++ b/octue/resources/dataset.py @@ -56,7 +56,8 @@ def __len__(self): @classmethod def from_cloud(cls, project_name, gs_path=None, bucket_name=None, path_to_dataset_directory=None): - """Instantiate a Dataset from Google Cloud storage. + """Instantiate a Dataset from Google Cloud storage. Either (`bucket_name` and `path_to_dataset_directory`) or + `gs_path` must be provided. :param str project_name: :param str|None gs_path: @@ -92,7 +93,8 @@ def from_cloud(cls, project_name, gs_path=None, bucket_name=None, path_to_datase ) def to_cloud(self, project_name, gs_path=None, bucket_name=None, output_directory=None): - """Upload a dataset to a cloud location. + """Upload a dataset to a cloud location. Either (`bucket_name` and `output_directory`) or `gs_path` must be + provided. :param str project_name: :param str|None gs_path: diff --git a/octue/resources/manifest.py b/octue/resources/manifest.py index d389cb1fd..051c7b4c9 100644 --- a/octue/resources/manifest.py +++ b/octue/resources/manifest.py @@ -46,7 +46,8 @@ def __init__(self, id=None, logger=None, path=None, datasets=None, keys=None, ** @classmethod def from_cloud(cls, project_name, gs_path=None, bucket_name=None, path_to_manifest_file=None): - """Instantiate a Manifest from Google Cloud storage. + """Instantiate a Manifest from Google Cloud storage. Either (`bucket_name` and `path_to_manifest_file`) or + `gs_path` must be provided. 
:param str project_name: :param str|None gs_path: @@ -82,7 +83,8 @@ def from_cloud(cls, project_name, gs_path=None, bucket_name=None, path_to_manife ) def to_cloud(self, project_name, gs_path=None, bucket_name=None, path_to_manifest_file=None, store_datasets=True): - """Upload a manifest to a cloud location, optionally uploading its datasets into the same directory. + """Upload a manifest to a cloud location, optionally uploading its datasets into the same directory. Either + (`bucket_name` and `path_to_manifest_file`) or `gs_path` must be provided. :param str project_name: :param str|None gs_path: From d07b80985d4a24834bc3193692de69a039fcb319 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Tue, 25 May 2021 15:55:51 +0100 Subject: [PATCH 3/8] FIX: Use correct variable name --- octue/cloud/storage/client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/octue/cloud/storage/client.py b/octue/cloud/storage/client.py index 26f04ce9a..7a6e93eb9 100644 --- a/octue/cloud/storage/client.py +++ b/octue/cloud/storage/client.py @@ -194,7 +194,7 @@ def scandir(self, gs_path=None, bucket_name=None, directory_path=None, filter=No :yield google.cloud.storage.blob.Blob: """ if gs_path: - bucket_name, path_in_bucket = split_bucket_name_from_gs_path(gs_path) + bucket_name, directory_path = split_bucket_name_from_gs_path(gs_path) bucket = self.client.get_bucket(bucket_or_name=bucket_name) blobs = bucket.list_blobs(timeout=timeout) From b7c55f9e1bac638f3fb424fad08a91710e4373b3 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Tue, 25 May 2021 16:27:28 +0100 Subject: [PATCH 4/8] TST: Test cloud integrations with gs_path --- tests/cloud/storage/test_client.py | 20 ++++++ tests/resources/test_dataset.py | 103 +++++++++++++++++------------ tests/resources/test_manifest.py | 64 ++++++++++-------- 3 files changed, 117 insertions(+), 70 deletions(-) diff --git a/tests/cloud/storage/test_client.py b/tests/cloud/storage/test_client.py index 3e5ef3308..aa3b97158 100644 --- a/tests/cloud/storage/test_client.py +++ b/tests/cloud/storage/test_client.py @@ -170,6 +170,18 @@ def test_scandir(self): self.assertEqual(len(contents), 1) self.assertEqual(contents[0].name, storage.path.join(directory_path, self.FILENAME)) + def test_scandir_with_gs_path(self): + """Test that Google Cloud storage "directories"' contents can be listed when a GS path is used.""" + directory_path = storage.path.join("my", "path") + path_in_bucket = storage.path.join(directory_path, self.FILENAME) + gs_path = f"gs://{TEST_BUCKET_NAME}/{path_in_bucket}" + + self.storage_client.upload_from_string(string=json.dumps({"height": 32}), gs_path=gs_path) + contents = list(self.storage_client.scandir(gs_path)) + + self.assertEqual(len(contents), 1) + self.assertEqual(contents[0].name, storage.path.join(directory_path, self.FILENAME)) + def test_scandir_with_empty_directory(self): """Test that an empty directory shows as such.""" directory_path = storage.path.join("another", "path") @@ -190,3 +202,11 @@ def test_get_metadata(self): ) self.assertTrue(len(metadata) > 0) + + def test_get_metadata_with_gs_path(self): + """Test that file metadata can be retrieved when a GS path is used.""" + gs_path = f"gs://{TEST_BUCKET_NAME}/{self.FILENAME}" + self.storage_client.upload_from_string(string=json.dumps({"height": 32}), gs_path=gs_path) + + metadata = self.storage_client.get_metadata(gs_path) + self.assertTrue(len(metadata) > 0) diff --git a/tests/resources/test_dataset.py b/tests/resources/test_dataset.py index 43de56ad9..89164f819 100644 --- 
a/tests/resources/test_dataset.py +++ b/tests/resources/test_dataset.py @@ -300,9 +300,9 @@ def test_is_in_cloud(self): self.assertTrue(Dataset(files=files).all_files_are_in_cloud) def test_from_cloud(self): - """Test that a Dataset in cloud storage can be accessed.""" - project_name = "test-project" - + """Test that a Dataset in cloud storage can be accessed via (`bucket_name`, `output_directory`) and via + `gs_path`. + """ with tempfile.TemporaryDirectory() as temporary_directory: file_0_path = os.path.join(temporary_directory, "file_0.txt") file_1_path = os.path.join(temporary_directory, "file_1.txt") @@ -321,31 +321,37 @@ def test_from_cloud(self): }, ) + project_name = "test-project" dataset.to_cloud(project_name=project_name, bucket_name=TEST_BUCKET_NAME, output_directory="a_directory") - persisted_dataset = Dataset.from_cloud( - project_name=project_name, - bucket_name=TEST_BUCKET_NAME, - path_to_dataset_directory=storage.path.join("a_directory", dataset.name), - ) + bucket_name = TEST_BUCKET_NAME + path_to_dataset_directory = storage.path.join("a_directory", dataset.name) + gs_path = f"gs://{bucket_name}/{path_to_dataset_directory}" + + for location_parameters in ( + {"bucket_name": bucket_name, "path_to_dataset_directory": path_to_dataset_directory, "gs_path": None}, + {"bucket_name": None, "path_to_dataset_directory": None, "gs_path": gs_path}, + ): - self.assertEqual(persisted_dataset.path, f"gs://{TEST_BUCKET_NAME}/a_directory/{dataset.name}") - self.assertEqual(persisted_dataset.id, dataset.id) - self.assertEqual(persisted_dataset.name, dataset.name) - self.assertEqual(persisted_dataset.hash_value, dataset.hash_value) - self.assertEqual(persisted_dataset.tags, dataset.tags) - self.assertEqual({file.name for file in persisted_dataset.files}, {file.name for file in dataset.files}) + persisted_dataset = Dataset.from_cloud( + project_name=project_name, + **location_parameters, + ) + + self.assertEqual(persisted_dataset.path, f"gs://{TEST_BUCKET_NAME}/a_directory/{dataset.name}") + self.assertEqual(persisted_dataset.id, dataset.id) + self.assertEqual(persisted_dataset.name, dataset.name) + self.assertEqual(persisted_dataset.hash_value, dataset.hash_value) + self.assertEqual(persisted_dataset.tags, dataset.tags) + self.assertEqual({file.name for file in persisted_dataset.files}, {file.name for file in dataset.files}) - for file in persisted_dataset: - self.assertEqual(file.path, f"gs://{TEST_BUCKET_NAME}/a_directory/{dataset.name}/{file.name}") + for file in persisted_dataset: + self.assertEqual(file.path, f"gs://{TEST_BUCKET_NAME}/a_directory/{dataset.name}/{file.name}") def test_to_cloud(self): - """Test that a dataset can be uploaded to the cloud, including all its files and a serialised JSON file of the - Datafile instance. + """Test that a dataset can be uploaded to the cloud via (`bucket_name`, `output_directory`) and via `gs_path`, + including all its files and a serialised JSON file of the Datafile instance. 
""" - project_name = "test-project" - output_directory = "my_datasets" - with tempfile.TemporaryDirectory() as temporary_directory: file_0_path = os.path.join(temporary_directory, "file_0.txt") file_1_path = os.path.join(temporary_directory, "file_1.txt") @@ -363,34 +369,43 @@ def test_to_cloud(self): } ) - dataset.to_cloud(project_name, bucket_name=TEST_BUCKET_NAME, output_directory=output_directory) + project_name = "test-project" + bucket_name = TEST_BUCKET_NAME + output_directory = "my_datasets" + gs_path = storage.path.generate_gs_path(bucket_name, output_directory) - storage_client = GoogleCloudStorageClient(project_name) + for location_parameters in ( + {"bucket_name": bucket_name, "output_directory": output_directory, "gs_path": None}, + {"bucket_name": None, "output_directory": None, "gs_path": gs_path}, + ): + dataset.to_cloud(project_name, **location_parameters) - persisted_file_0 = storage_client.download_as_string( - bucket_name=TEST_BUCKET_NAME, - path_in_bucket=storage.path.join(output_directory, dataset.name, "file_0.txt"), - ) + storage_client = GoogleCloudStorageClient(project_name) - self.assertEqual(persisted_file_0, "[1, 2, 3]") + persisted_file_0 = storage_client.download_as_string( + bucket_name=TEST_BUCKET_NAME, + path_in_bucket=storage.path.join(output_directory, dataset.name, "file_0.txt"), + ) - persisted_file_1 = storage_client.download_as_string( - bucket_name=TEST_BUCKET_NAME, - path_in_bucket=storage.path.join(output_directory, dataset.name, "file_1.txt"), - ) - self.assertEqual(persisted_file_1, "[4, 5, 6]") + self.assertEqual(persisted_file_0, "[1, 2, 3]") - persisted_dataset = json.loads( - storage_client.download_as_string( + persisted_file_1 = storage_client.download_as_string( bucket_name=TEST_BUCKET_NAME, - path_in_bucket=storage.path.join(output_directory, dataset.name, "dataset.json"), + path_in_bucket=storage.path.join(output_directory, dataset.name, "file_1.txt"), ) - ) + self.assertEqual(persisted_file_1, "[4, 5, 6]") - self.assertEqual( - persisted_dataset["files"], - [ - "gs://octue-test-bucket/my_datasets/octue-sdk-python/file_0.txt", - "gs://octue-test-bucket/my_datasets/octue-sdk-python/file_1.txt", - ], - ) + persisted_dataset = json.loads( + storage_client.download_as_string( + bucket_name=TEST_BUCKET_NAME, + path_in_bucket=storage.path.join(output_directory, dataset.name, "dataset.json"), + ) + ) + + self.assertEqual( + persisted_dataset["files"], + [ + "gs://octue-test-bucket/my_datasets/octue-sdk-python/file_0.txt", + "gs://octue-test-bucket/my_datasets/octue-sdk-python/file_1.txt", + ], + ) diff --git a/tests/resources/test_manifest.py b/tests/resources/test_manifest.py index 0ead268fc..79cd00e27 100644 --- a/tests/resources/test_manifest.py +++ b/tests/resources/test_manifest.py @@ -56,7 +56,9 @@ def test_deserialise(self): self.assertEqual(original_dataset.absolute_path, deserialised_dataset.absolute_path) def test_to_cloud(self): - """Test that a manifest can be uploaded to the cloud as a serialised JSON file of the Manifest instance. """ + """Test that a manifest can be uploaded to the cloud as a serialised JSON file of the Manifest instance via + (`bucket_name`, `output_directory`) and via `gs_path`. 
+ """ with tempfile.TemporaryDirectory() as temporary_directory: file_0_path = os.path.join(temporary_directory, "file_0.txt") file_1_path = os.path.join(temporary_directory, "file_1.txt") @@ -77,11 +79,15 @@ def test_to_cloud(self): manifest = Manifest(datasets=[dataset], keys={"my-dataset": 0}) - manifest.to_cloud( - self.TEST_PROJECT_NAME, - bucket_name=TEST_BUCKET_NAME, - path_to_manifest_file=storage.path.join("blah", "manifest.json"), - ) + bucket_name = TEST_BUCKET_NAME + path_to_manifest_file = storage.path.join("blah", "manifest.json") + gs_path = storage.path.generate_gs_path(bucket_name, path_to_manifest_file) + + for location_parameters in ( + {"bucket_name": bucket_name, "path_to_manifest_file": path_to_manifest_file, "gs_path": None}, + {"bucket_name": None, "path_to_manifest_file": None, "gs_path": gs_path}, + ): + manifest.to_cloud(self.TEST_PROJECT_NAME, **location_parameters) persisted_manifest = json.loads( GoogleCloudStorageClient(self.TEST_PROJECT_NAME).download_as_string( @@ -134,7 +140,9 @@ def test_to_cloud_without_storing_datasets(self): self.assertEqual(persisted_manifest["keys"], {"my-dataset": 0}) def test_from_cloud(self): - """Test that a Manifest can be instantiated from the cloud.""" + """Test that a Manifest can be instantiated from the cloud via (`bucket_name`, `output_directory`) and via + `gs_path`. + """ with tempfile.TemporaryDirectory() as temporary_directory: file_0_path = os.path.join(temporary_directory, "file_0.txt") file_1_path = os.path.join(temporary_directory, "file_1.txt") @@ -160,22 +168,26 @@ def test_from_cloud(self): path_to_manifest_file=storage.path.join("my-directory", "manifest.json"), ) - persisted_manifest = Manifest.from_cloud( - project_name=self.TEST_PROJECT_NAME, - bucket_name=TEST_BUCKET_NAME, - path_to_manifest_file=storage.path.join("my-directory", "manifest.json"), - ) - - self.assertEqual(persisted_manifest.path, f"gs://{TEST_BUCKET_NAME}/my-directory/manifest.json") - self.assertEqual(persisted_manifest.id, manifest.id) - self.assertEqual(persisted_manifest.hash_value, manifest.hash_value) - self.assertEqual(persisted_manifest.keys, manifest.keys) - self.assertEqual( - {dataset.name for dataset in persisted_manifest.datasets}, - {dataset.name for dataset in manifest.datasets}, - ) - - for dataset in persisted_manifest.datasets: - self.assertEqual(dataset.path, f"gs://{TEST_BUCKET_NAME}/my-directory/{dataset.name}") - self.assertTrue(len(dataset.files), 2) - self.assertTrue(all(isinstance(file, Datafile) for file in dataset.files)) + bucket_name = TEST_BUCKET_NAME + path_to_manifest_file = storage.path.join("my-directory", "manifest.json") + gs_path = storage.path.generate_gs_path(bucket_name, path_to_manifest_file) + + for location_parameters in ( + {"bucket_name": bucket_name, "path_to_manifest_file": path_to_manifest_file, "gs_path": None}, + {"bucket_name": None, "path_to_manifest_file": None, "gs_path": gs_path}, + ): + persisted_manifest = Manifest.from_cloud(project_name=self.TEST_PROJECT_NAME, **location_parameters) + + self.assertEqual(persisted_manifest.path, f"gs://{TEST_BUCKET_NAME}/my-directory/manifest.json") + self.assertEqual(persisted_manifest.id, manifest.id) + self.assertEqual(persisted_manifest.hash_value, manifest.hash_value) + self.assertEqual(persisted_manifest.keys, manifest.keys) + self.assertEqual( + {dataset.name for dataset in persisted_manifest.datasets}, + {dataset.name for dataset in manifest.datasets}, + ) + + for dataset in persisted_manifest.datasets: + self.assertEqual(dataset.path, 
f"gs://{TEST_BUCKET_NAME}/my-directory/{dataset.name}") + self.assertTrue(len(dataset.files), 2) + self.assertTrue(all(isinstance(file, Datafile) for file in dataset.files)) From a9b81ff2af12dbffa0f7da434855e92adda3a73c Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Tue, 25 May 2021 16:28:03 +0100 Subject: [PATCH 5/8] IMP: Add datetime encoding to OctueJSONEncoder --- octue/mixins/serialisable.py | 6 +++++- octue/utils/encoders.py | 5 +++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/octue/mixins/serialisable.py b/octue/mixins/serialisable.py index 304138ab0..cac0d4db2 100644 --- a/octue/mixins/serialisable.py +++ b/octue/mixins/serialisable.py @@ -78,8 +78,12 @@ def __init__(self): self_as_primitive = {} for name in names_of_attributes_to_serialise: attribute = getattr(self, name, None) + # Serialise sets as sorted list (JSON doesn't support sets). - self_as_primitive[name] = sorted(attribute) if isinstance(attribute, set) else attribute + if isinstance(attribute, set): + self_as_primitive[name] = sorted(attribute) + else: + self_as_primitive[name] = attribute # TODO this conversion backward-and-forward is very inefficient but allows us to use the same encoder for # converting the object to a dict as to strings, which ensures that nested attributes are also cast to diff --git a/octue/utils/encoders.py b/octue/utils/encoders.py index 416ed7701..94b39d19d 100644 --- a/octue/utils/encoders.py +++ b/octue/utils/encoders.py @@ -1,3 +1,5 @@ +import datetime + from twined.utils import TwinedEncoder @@ -10,5 +12,8 @@ def default(self, obj): if hasattr(obj, "serialise"): return obj.serialise() + if isinstance(obj, datetime.datetime): + return str(obj) + # Otherwise let the base class default method raise the TypeError return TwinedEncoder.default(self, obj) From 34c10cda5ad8492896e7ce00abd608246f3ea70b Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Wed, 2 Jun 2021 12:03:14 +0100 Subject: [PATCH 6/8] REF: Rename gs_path to cloud_path --- octue/cloud/storage/client.py | 84 ++++++++++++++++-------------- octue/resources/datafile.py | 52 +++++++++--------- octue/resources/dataset.py | 22 ++++---- octue/resources/manifest.py | 24 +++++---- tests/cloud/storage/test_client.py | 4 +- tests/resources/test_datafile.py | 2 +- tests/resources/test_dataset.py | 12 +++-- tests/resources/test_manifest.py | 8 +-- 8 files changed, 110 insertions(+), 98 deletions(-) diff --git a/octue/cloud/storage/client.py b/octue/cloud/storage/client.py index 7a6e93eb9..1d93b33fc 100644 --- a/octue/cloud/storage/client.py +++ b/octue/cloud/storage/client.py @@ -48,20 +48,26 @@ def create_bucket(self, name, location=None, allow_existing=False, timeout=_DEFA self.client.create_bucket(bucket_or_name=name, location=location, timeout=timeout) def upload_file( - self, local_path, gs_path=None, bucket_name=None, path_in_bucket=None, metadata=None, timeout=_DEFAULT_TIMEOUT + self, + local_path, + cloud_path=None, + bucket_name=None, + path_in_bucket=None, + metadata=None, + timeout=_DEFAULT_TIMEOUT, ): """Upload a local file to a Google Cloud bucket at gs:///. Either (`bucket_name` - and `path_in_bucket`) or `gs_path` must be provided. + and `path_in_bucket`) or `cloud_path` must be provided. 
:param str local_path: - :param str|None gs_path: + :param str|None cloud_path: :param str|None bucket_name: :param str|None path_in_bucket: :param dict metadata: :param float timeout: :return None: """ - blob = self._blob(gs_path, bucket_name, path_in_bucket) + blob = self._blob(cloud_path, bucket_name, path_in_bucket) with open(local_path) as f: blob.crc32c = self._compute_crc32c_checksum(f.read()) @@ -71,83 +77,83 @@ def upload_file( logger.info("Uploaded %r to Google Cloud at %r.", local_path, blob.public_url) def upload_from_string( - self, string, gs_path=None, bucket_name=None, path_in_bucket=None, metadata=None, timeout=_DEFAULT_TIMEOUT + self, string, cloud_path=None, bucket_name=None, path_in_bucket=None, metadata=None, timeout=_DEFAULT_TIMEOUT ): """Upload serialised data in string form to a file in a Google Cloud bucket at - gs:///. Either (`bucket_name` and `path_in_bucket`) or `gs_path` must be provided. + gs:///. Either (`bucket_name` and `path_in_bucket`) or `cloud_path` must be provided. :param str string: - :param str|None gs_path: + :param str|None cloud_path: :param str|None bucket_name: :param str|None path_in_bucket: :param dict metadata: :param float timeout: :return None: """ - blob = self._blob(gs_path, bucket_name, path_in_bucket) + blob = self._blob(cloud_path, bucket_name, path_in_bucket) blob.crc32c = self._compute_crc32c_checksum(string) blob.upload_from_string(data=string, timeout=timeout) self._update_metadata(blob, metadata) logger.info("Uploaded data to Google Cloud at %r.", blob.public_url) - def update_metadata(self, metadata, gs_path=None, bucket_name=None, path_in_bucket=None): - """Update the metadata for the given cloud file. Either (`bucket_name` and `path_in_bucket`) or `gs_path` must + def update_metadata(self, metadata, cloud_path=None, bucket_name=None, path_in_bucket=None): + """Update the metadata for the given cloud file. Either (`bucket_name` and `path_in_bucket`) or `cloud_path` must be provided. :param dict metadata: - :param str|None gs_path: + :param str|None cloud_path: :param str|None bucket_name: :param str|None path_in_bucket: :return None: """ - blob = self._blob(gs_path, bucket_name, path_in_bucket) + blob = self._blob(cloud_path, bucket_name, path_in_bucket) self._update_metadata(blob, metadata) def download_to_file( - self, local_path, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT + self, local_path, cloud_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT ): """Download a file to a file from a Google Cloud bucket at gs:///. Either - (`bucket_name` and `path_in_bucket`) or `gs_path` must be provided. + (`bucket_name` and `path_in_bucket`) or `cloud_path` must be provided. :param str local_path: - :param str|None gs_path: + :param str|None cloud_path: :param str|None bucket_name: :param str|None path_in_bucket: :param float timeout: :return None: """ - blob = self._blob(gs_path, bucket_name, path_in_bucket) + blob = self._blob(cloud_path, bucket_name, path_in_bucket) blob.download_to_filename(local_path, timeout=timeout) logger.info("Downloaded %r from Google Cloud to %r.", blob.public_url, local_path) - def download_as_string(self, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT): + def download_as_string(self, cloud_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT): """Download a file to a string from a Google Cloud bucket at gs:///. Either - (`bucket_name` and `path_in_bucket`) or `gs_path` must be provided. 
+        (`bucket_name` and `path_in_bucket`) or `cloud_path` must be provided.
 
-        :param str|None gs_path:
+        :param str|None cloud_path:
         :param str|None bucket_name:
         :param str|None path_in_bucket:
         :param float timeout:
         :return str:
         """
-        blob = self._blob(gs_path, bucket_name, path_in_bucket)
+        blob = self._blob(cloud_path, bucket_name, path_in_bucket)
         data = blob.download_as_bytes(timeout=timeout)
         logger.info("Downloaded %r from Google Cloud to as string.", blob.public_url)
         return data.decode()
 
-    def get_metadata(self, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT):
+    def get_metadata(self, cloud_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT):
         """Get the metadata of the given file in the given bucket. Either (`bucket_name` and `path_in_bucket`) or
-        `gs_path` must be provided.
+        `cloud_path` must be provided.
 
-        :param str|None gs_path:
+        :param str|None cloud_path:
         :param str|None bucket_name:
         :param str|None path_in_bucket:
         :param float timeout:
         :return dict:
         """
-        if gs_path:
-            bucket_name, path_in_bucket = split_bucket_name_from_gs_path(gs_path)
+        if cloud_path:
+            bucket_name, path_in_bucket = split_bucket_name_from_gs_path(cloud_path)
 
         bucket = self.client.get_bucket(bucket_or_name=bucket_name)
         blob = bucket.get_blob(blob_name=self._strip_leading_slash(path_in_bucket), timeout=timeout)
@@ -168,33 +174,33 @@ def get_metadata(self, gs_path=None, bucket_name=None, path_in_bucket=None, time
             "path_in_bucket": path_in_bucket,
         }
 
-    def delete(self, gs_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT):
-        """Delete the given file from the given bucket. Either (`bucket_name` and `path_in_bucket`) or `gs_path` must
+    def delete(self, cloud_path=None, bucket_name=None, path_in_bucket=None, timeout=_DEFAULT_TIMEOUT):
+        """Delete the given file from the given bucket. Either (`bucket_name` and `path_in_bucket`) or `cloud_path` must
         be provided.
 
-        :param str|None gs_path:
+        :param str|None cloud_path:
         :param str|None bucket_name:
         :param str|None path_in_bucket:
         :param float timeout:
         :return None:
         """
-        blob = self._blob(gs_path, bucket_name, path_in_bucket)
+        blob = self._blob(cloud_path, bucket_name, path_in_bucket)
         blob.delete(timeout=timeout)
         logger.info("Deleted %r from Google Cloud.", blob.public_url)
 
-    def scandir(self, gs_path=None, bucket_name=None, directory_path=None, filter=None, timeout=_DEFAULT_TIMEOUT):
+    def scandir(self, cloud_path=None, bucket_name=None, directory_path=None, filter=None, timeout=_DEFAULT_TIMEOUT):
         """Yield the blobs belonging to the given "directory" in the given bucket. Either (`bucket_name` and
-        `path_in_bucket`) or `gs_path` must be provided.
+        `directory_path`) or `cloud_path` must be provided.
 
-        :param str|None gs_path:
+        :param str|None cloud_path:
         :param str|None bucket_name:
         :param str|None directory_path:
         :param callable filter:
         :param float timeout:
         :yield google.cloud.storage.blob.Blob:
         """
-        if gs_path:
-            bucket_name, directory_path = split_bucket_name_from_gs_path(gs_path)
+        if cloud_path:
+            bucket_name, directory_path = split_bucket_name_from_gs_path(cloud_path)
 
         bucket = self.client.get_bucket(bucket_or_name=bucket_name)
         blobs = bucket.list_blobs(timeout=timeout)
@@ -213,17 +219,17 @@ def _strip_leading_slash(self, path):
         """
         return path.lstrip("/")
 
-    def _blob(self, gs_path=None, bucket_name=None, path_in_bucket=None):
+    def _blob(self, cloud_path=None, bucket_name=None, path_in_bucket=None):
         """Instantiate a blob for the given bucket at the given path.
Note that this is not synced up with Google Cloud. - Either (`bucket_name` and `path_in_bucket`) or `gs_path` must be provided. + Either (`bucket_name` and `path_in_bucket`) or `cloud_path` must be provided. - :param str|None gs_path: + :param str|None cloud_path: :param str|None bucket_name: :param str|None path_in_bucket: :return google.cloud.storage.blob.Blob: """ - if gs_path: - bucket_name, path_in_bucket = split_bucket_name_from_gs_path(gs_path) + if cloud_path: + bucket_name, path_in_bucket = split_bucket_name_from_gs_path(cloud_path) bucket = self.client.get_bucket(bucket_or_name=bucket_name) return bucket.blob(blob_name=self._strip_leading_slash(path_in_bucket)) diff --git a/octue/resources/datafile.py b/octue/resources/datafile.py index adee22a01..35f529731 100644 --- a/octue/resources/datafile.py +++ b/octue/resources/datafile.py @@ -160,7 +160,7 @@ def deserialise(cls, serialised_datafile, path_from=None): def from_cloud( cls, project_name, - gs_path=None, + cloud_path=None, bucket_name=None, datafile_path=None, allow_overwrite=False, @@ -175,10 +175,10 @@ def from_cloud( Note that a value provided for an attribute in kwargs will override any existing value for the attribute. - Either (`bucket_name` and `datafile_path`) or `gs_path` must be provided. + Either (`bucket_name` and `datafile_path`) or `cloud_path` must be provided. :param str project_name: - :param str|None gs_path: + :param str|None cloud_path: :param str|None bucket_name: :param str|None datafile_path: path to file represented by datafile :param bool allow_overwrite: if `True`, allow attributes of the datafile to be overwritten by values given in @@ -189,11 +189,11 @@ def from_cloud( the datafile when the context is exited :return Datafile: """ - if not gs_path: - gs_path = storage.path.generate_gs_path(bucket_name, datafile_path) + if not cloud_path: + cloud_path = storage.path.generate_gs_path(bucket_name, datafile_path) - datafile = cls(path=gs_path) - datafile.get_cloud_metadata(project_name, gs_path=gs_path) + datafile = cls(path=cloud_path) + datafile.get_cloud_metadata(project_name, cloud_path=cloud_path) custom_metadata = datafile._cloud_metadata.get("custom_metadata", {}) if not allow_overwrite: @@ -218,20 +218,20 @@ def from_cloud( return datafile def to_cloud( - self, project_name=None, gs_path=None, bucket_name=None, path_in_bucket=None, update_cloud_metadata=True + self, project_name=None, cloud_path=None, bucket_name=None, path_in_bucket=None, update_cloud_metadata=True ): - """Upload a datafile to Google Cloud Storage. Either (`bucket_name` and `path_in_bucket`) or `gs_path` must be + """Upload a datafile to Google Cloud Storage. Either (`bucket_name` and `path_in_bucket`) or `cloud_path` must be provided. 
:param str|None project_name: - :param str|None gs_path: + :param str|None cloud_path: :param str|None bucket_name: :param str|None path_in_bucket: :param bool update_cloud_metadata: :return str: gs:// path for datafile """ project_name, bucket_name, path_in_bucket = self._get_cloud_location( - project_name, gs_path, bucket_name, path_in_bucket + project_name, cloud_path, bucket_name, path_in_bucket ) self.get_cloud_metadata(project_name, bucket_name=bucket_name, path_in_bucket=path_in_bucket) @@ -252,20 +252,20 @@ def to_cloud( if self._cloud_metadata.get("custom_metadata") != local_metadata: self.update_cloud_metadata(project_name, bucket_name=bucket_name, path_in_bucket=path_in_bucket) - return gs_path or storage.path.generate_gs_path(bucket_name, path_in_bucket) + return cloud_path or storage.path.generate_gs_path(bucket_name, path_in_bucket) - def get_cloud_metadata(self, project_name=None, gs_path=None, bucket_name=None, path_in_bucket=None): + def get_cloud_metadata(self, project_name=None, cloud_path=None, bucket_name=None, path_in_bucket=None): """Get the cloud metadata for the datafile, casting the types of the cluster and sequence fields to integer. - Either (`bucket_name` and `path_in_bucket`) or `gs_path` must be provided. + Either (`bucket_name` and `path_in_bucket`) or `cloud_path` must be provided. :param str|None project_name: - :param str|None gs_path: + :param str|None cloud_path: :param str|None bucket_name: :param str|None path_in_bucket: :return dict: """ project_name, bucket_name, path_in_bucket = self._get_cloud_location( - project_name, gs_path, bucket_name, path_in_bucket + project_name, cloud_path, bucket_name, path_in_bucket ) cloud_metadata = GoogleCloudStorageClient(project_name).get_metadata( @@ -289,18 +289,18 @@ def get_cloud_metadata(self, project_name=None, gs_path=None, bucket_name=None, self._cloud_metadata = cloud_metadata - def update_cloud_metadata(self, project_name=None, gs_path=None, bucket_name=None, path_in_bucket=None): - """Update the cloud metadata for the datafile. Either (`bucket_name` and `path_in_bucket`) or `gs_path` must be + def update_cloud_metadata(self, project_name=None, cloud_path=None, bucket_name=None, path_in_bucket=None): + """Update the cloud metadata for the datafile. Either (`bucket_name` and `path_in_bucket`) or `cloud_path` must be provided. 
:param str|None project_name: - :param str|None gs_path: + :param str|None cloud_path: :param str|None bucket_name: :param str|None path_in_bucket: :return None: """ project_name, bucket_name, path_in_bucket = self._get_cloud_location( - project_name, gs_path, bucket_name, path_in_bucket + project_name, cloud_path, bucket_name, path_in_bucket ) GoogleCloudStorageClient(project_name=project_name).update_metadata( @@ -394,7 +394,7 @@ def get_local_path(self): temporary_local_path = tempfile.NamedTemporaryFile(delete=False).name GoogleCloudStorageClient(project_name=self._cloud_metadata["project_name"]).download_to_file( - local_path=temporary_local_path, gs_path=self.absolute_path + local_path=temporary_local_path, cloud_path=self.absolute_path ) TEMPORARY_LOCAL_FILE_CACHE[self.absolute_path] = temporary_local_path @@ -429,20 +429,20 @@ def _calculate_hash(self): return super()._calculate_hash(hash) - def _get_cloud_location(self, project_name=None, gs_path=None, bucket_name=None, path_in_bucket=None): + def _get_cloud_location(self, project_name=None, cloud_path=None, bucket_name=None, path_in_bucket=None): """Get the cloud location details for the bucket, allowing the keyword arguments to override any stored values. - Either (`bucket_name` and `path_in_bucket`) or `gs_path` must be provided. + Either (`bucket_name` and `path_in_bucket`) or `cloud_path` must be provided. :param str|None project_name: - :param str|None gs_path: + :param str|None cloud_path: :param str|None bucket_name: :param str|None path_in_bucket: :raise octue.exceptions.CloudLocationNotSpecified: if an exact cloud location isn't provided and isn't available implicitly (i.e. the Datafile wasn't loaded from the cloud previously) :return (str, str, str): """ - if gs_path: - bucket_name, path_in_bucket = storage.path.split_bucket_name_from_gs_path(gs_path) + if cloud_path: + bucket_name, path_in_bucket = storage.path.split_bucket_name_from_gs_path(cloud_path) try: project_name = project_name or self._cloud_metadata["project_name"] diff --git a/octue/resources/dataset.py b/octue/resources/dataset.py index aa5e140bb..ef37bfbf2 100644 --- a/octue/resources/dataset.py +++ b/octue/resources/dataset.py @@ -55,18 +55,18 @@ def __len__(self): return len(self.files) @classmethod - def from_cloud(cls, project_name, gs_path=None, bucket_name=None, path_to_dataset_directory=None): + def from_cloud(cls, project_name, cloud_path=None, bucket_name=None, path_to_dataset_directory=None): """Instantiate a Dataset from Google Cloud storage. Either (`bucket_name` and `path_to_dataset_directory`) or - `gs_path` must be provided. + `cloud_path` must be provided. :param str project_name: - :param str|None gs_path: + :param str|None cloud_path: :param str|None bucket_name: :param str|None path_to_dataset_directory: path to dataset directory (directory containing dataset's files) :return Dataset: """ - if gs_path: - bucket_name, path_to_dataset_directory = storage.path.split_bucket_name_from_gs_path(gs_path) + if cloud_path: + bucket_name, path_to_dataset_directory = storage.path.split_bucket_name_from_gs_path(cloud_path) serialised_dataset = json.loads( GoogleCloudStorageClient(project_name=project_name).download_as_string( @@ -92,18 +92,18 @@ def from_cloud(cls, project_name, gs_path=None, bucket_name=None, path_to_datase files=datafiles, ) - def to_cloud(self, project_name, gs_path=None, bucket_name=None, output_directory=None): - """Upload a dataset to a cloud location. 
Either (`bucket_name` and `output_directory`) or `gs_path` must be
+    def to_cloud(self, project_name, cloud_path=None, bucket_name=None, output_directory=None):
+        """Upload a dataset to a cloud location. Either (`bucket_name` and `output_directory`) or `cloud_path` must be
         provided.
 
         :param str project_name:
-        :param str|None gs_path:
+        :param str|None cloud_path:
         :param str|None bucket_name:
         :param str|None output_directory:
         :return str: gs:// path for dataset
         """
-        if gs_path:
-            bucket_name, output_directory = storage.path.split_bucket_name_from_gs_path(gs_path)
+        if cloud_path:
+            bucket_name, output_directory = storage.path.split_bucket_name_from_gs_path(cloud_path)
 
         files = []
 
@@ -126,7 +126,7 @@ def to_cloud(self, project_name, gs_path=None, bucket_name=None, output_director
             path_in_bucket=storage.path.join(output_directory, self.name, definitions.DATASET_FILENAME),
         )
 
-        return gs_path or storage.path.generate_gs_path(bucket_name, output_directory, self.name)
+        return cloud_path or storage.path.generate_gs_path(bucket_name, output_directory, self.name)
 
     @property
     def name(self):
diff --git a/octue/resources/manifest.py b/octue/resources/manifest.py
index 051c7b4c9..8525fec30 100644
--- a/octue/resources/manifest.py
+++ b/octue/resources/manifest.py
@@ -45,18 +45,18 @@ def __init__(self, id=None, logger=None, path=None, datasets=None, keys=None, **
         vars(self).update(**kwargs)
 
     @classmethod
-    def from_cloud(cls, project_name, gs_path=None, bucket_name=None, path_to_manifest_file=None):
+    def from_cloud(cls, project_name, cloud_path=None, bucket_name=None, path_to_manifest_file=None):
         """Instantiate a Manifest from Google Cloud storage. Either (`bucket_name` and `path_to_manifest_file`) or
-        `gs_path` must be provided.
+        `cloud_path` must be provided.
 
         :param str project_name:
-        :param str|None gs_path:
+        :param str|None cloud_path:
         :param str|None bucket_name:
         :param str|None path_to_manifest_file:
         :return Dataset:
         """
-        if gs_path:
-            bucket_name, path_to_manifest_file = storage.path.split_bucket_name_from_gs_path(gs_path)
+        if cloud_path:
+            bucket_name, path_to_manifest_file = storage.path.split_bucket_name_from_gs_path(cloud_path)
 
         serialised_manifest = json.loads(
             GoogleCloudStorageClient(project_name=project_name).download_as_string(
@@ -82,19 +82,21 @@ def from_cloud(cls, project_name, gs_path=None, bucket_name=None, path_to_manife
             keys=serialised_manifest["keys"],
         )
 
-    def to_cloud(self, project_name, gs_path=None, bucket_name=None, path_to_manifest_file=None, store_datasets=True):
+    def to_cloud(
+        self, project_name, cloud_path=None, bucket_name=None, path_to_manifest_file=None, store_datasets=True
+    ):
         """Upload a manifest to a cloud location, optionally uploading its datasets into the same directory. Either
-        (`bucket_name` and `path_to_manifest_file`) or `gs_path` must be provided.
+        (`bucket_name` and `path_to_manifest_file`) or `cloud_path` must be provided.
 
:param str project_name: - :param str|None gs_path: + :param str|None cloud_path: :param str|None bucket_name: :param str|None path_to_manifest_file: :param bool store_datasets: if True, upload datasets to same directory as manifest file :return str: gs:// path for manifest file """ - if gs_path: - bucket_name, path_to_manifest_file = storage.path.split_bucket_name_from_gs_path(gs_path) + if cloud_path: + bucket_name, path_to_manifest_file = storage.path.split_bucket_name_from_gs_path(cloud_path) datasets = [] output_directory = storage.path.dirname(path_to_manifest_file) @@ -121,7 +123,7 @@ def to_cloud(self, project_name, gs_path=None, bucket_name=None, path_to_manifes path_in_bucket=path_to_manifest_file, ) - return gs_path or storage.path.generate_gs_path(bucket_name, path_to_manifest_file) + return cloud_path or storage.path.generate_gs_path(bucket_name, path_to_manifest_file) @property def all_datasets_are_in_cloud(self): diff --git a/tests/cloud/storage/test_client.py b/tests/cloud/storage/test_client.py index aa3b97158..f17ce11ed 100644 --- a/tests/cloud/storage/test_client.py +++ b/tests/cloud/storage/test_client.py @@ -176,7 +176,7 @@ def test_scandir_with_gs_path(self): path_in_bucket = storage.path.join(directory_path, self.FILENAME) gs_path = f"gs://{TEST_BUCKET_NAME}/{path_in_bucket}" - self.storage_client.upload_from_string(string=json.dumps({"height": 32}), gs_path=gs_path) + self.storage_client.upload_from_string(string=json.dumps({"height": 32}), cloud_path=gs_path) contents = list(self.storage_client.scandir(gs_path)) self.assertEqual(len(contents), 1) @@ -206,7 +206,7 @@ def test_get_metadata(self): def test_get_metadata_with_gs_path(self): """Test that file metadata can be retrieved when a GS path is used.""" gs_path = f"gs://{TEST_BUCKET_NAME}/{self.FILENAME}" - self.storage_client.upload_from_string(string=json.dumps({"height": 32}), gs_path=gs_path) + self.storage_client.upload_from_string(string=json.dumps({"height": 32}), cloud_path=gs_path) metadata = self.storage_client.get_metadata(gs_path) self.assertTrue(len(metadata) > 0) diff --git a/tests/resources/test_datafile.py b/tests/resources/test_datafile.py index fa3e4b0f4..8426f156f 100644 --- a/tests/resources/test_datafile.py +++ b/tests/resources/test_datafile.py @@ -204,7 +204,7 @@ def test_from_cloud_with_datafile(self): tags={"blah:shah:nah", "blib", "glib"}, ) gs_path = f"gs://{TEST_BUCKET_NAME}/{path_in_bucket}" - downloaded_datafile = Datafile.from_cloud(project_name, gs_path=gs_path) + downloaded_datafile = Datafile.from_cloud(project_name, cloud_path=gs_path) self.assertEqual(downloaded_datafile.path, gs_path) self.assertEqual(downloaded_datafile.id, datafile.id) diff --git a/tests/resources/test_dataset.py b/tests/resources/test_dataset.py index 89164f819..c7fffdabf 100644 --- a/tests/resources/test_dataset.py +++ b/tests/resources/test_dataset.py @@ -329,8 +329,12 @@ def test_from_cloud(self): gs_path = f"gs://{bucket_name}/{path_to_dataset_directory}" for location_parameters in ( - {"bucket_name": bucket_name, "path_to_dataset_directory": path_to_dataset_directory, "gs_path": None}, - {"bucket_name": None, "path_to_dataset_directory": None, "gs_path": gs_path}, + { + "bucket_name": bucket_name, + "path_to_dataset_directory": path_to_dataset_directory, + "cloud_path": None, + }, + {"bucket_name": None, "path_to_dataset_directory": None, "cloud_path": gs_path}, ): persisted_dataset = Dataset.from_cloud( @@ -375,8 +379,8 @@ def test_to_cloud(self): gs_path = storage.path.generate_gs_path(bucket_name, 
output_directory) for location_parameters in ( - {"bucket_name": bucket_name, "output_directory": output_directory, "gs_path": None}, - {"bucket_name": None, "output_directory": None, "gs_path": gs_path}, + {"bucket_name": bucket_name, "output_directory": output_directory, "cloud_path": None}, + {"bucket_name": None, "output_directory": None, "cloud_path": gs_path}, ): dataset.to_cloud(project_name, **location_parameters) diff --git a/tests/resources/test_manifest.py b/tests/resources/test_manifest.py index 79cd00e27..b423ac5f6 100644 --- a/tests/resources/test_manifest.py +++ b/tests/resources/test_manifest.py @@ -84,8 +84,8 @@ def test_to_cloud(self): gs_path = storage.path.generate_gs_path(bucket_name, path_to_manifest_file) for location_parameters in ( - {"bucket_name": bucket_name, "path_to_manifest_file": path_to_manifest_file, "gs_path": None}, - {"bucket_name": None, "path_to_manifest_file": None, "gs_path": gs_path}, + {"bucket_name": bucket_name, "path_to_manifest_file": path_to_manifest_file, "cloud_path": None}, + {"bucket_name": None, "path_to_manifest_file": None, "cloud_path": gs_path}, ): manifest.to_cloud(self.TEST_PROJECT_NAME, **location_parameters) @@ -173,8 +173,8 @@ def test_from_cloud(self): gs_path = storage.path.generate_gs_path(bucket_name, path_to_manifest_file) for location_parameters in ( - {"bucket_name": bucket_name, "path_to_manifest_file": path_to_manifest_file, "gs_path": None}, - {"bucket_name": None, "path_to_manifest_file": None, "gs_path": gs_path}, + {"bucket_name": bucket_name, "path_to_manifest_file": path_to_manifest_file, "cloud_path": None}, + {"bucket_name": None, "path_to_manifest_file": None, "cloud_path": gs_path}, ): persisted_manifest = Manifest.from_cloud(project_name=self.TEST_PROJECT_NAME, **location_parameters) From 87d1a53f423cbf7c890f1668c4d6932f593c70e6 Mon Sep 17 00:00:00 2001 From: cortadocodes Date: Wed, 2 Jun 2021 12:20:44 +0100 Subject: [PATCH 7/8] DOC: Add parameter descriptions to cloud methods --- octue/resources/datafile.py | 27 ++++++++++++--------------- octue/resources/dataset.py | 18 +++++++++--------- octue/resources/manifest.py | 16 ++++++++-------- 3 files changed, 29 insertions(+), 32 deletions(-) diff --git a/octue/resources/datafile.py b/octue/resources/datafile.py index 35f529731..4f9f08c84 100644 --- a/octue/resources/datafile.py +++ b/octue/resources/datafile.py @@ -177,16 +177,13 @@ def from_cloud( Either (`bucket_name` and `datafile_path`) or `cloud_path` must be provided. - :param str project_name: - :param str|None cloud_path: - :param str|None bucket_name: - :param str|None datafile_path: path to file represented by datafile - :param bool allow_overwrite: if `True`, allow attributes of the datafile to be overwritten by values given in - kwargs - :param str mode: if using as a context manager, open the datafile for reading/editing in this mode (the mode - options are the same as for the builtin open function) - :param bool update_cloud_metadata: if using as a context manager and this is True, update the cloud metadata of - the datafile when the context is exited + :param str project_name: name of Google Cloud project datafile is stored in + :param str|None cloud_path: full path to datafile in cloud storage (e.g. `gs://bucket_name/path/to/file.csv`) + :param str|None bucket_name: name of bucket datafile is stored in + :param str|None datafile_path: cloud storage path of datafile (e.g. 
`path/to/file.csv`) + :param bool allow_overwrite: if `True`, allow attributes of the datafile to be overwritten by values given in kwargs + :param str mode: if using as a context manager, open the datafile for reading/editing in this mode (the mode options are the same as for the builtin open function) + :param bool update_cloud_metadata: if using as a context manager and this is True, update the cloud metadata of the datafile when the context is exited :return Datafile: """ if not cloud_path: @@ -223,11 +220,11 @@ def to_cloud( """Upload a datafile to Google Cloud Storage. Either (`bucket_name` and `path_in_bucket`) or `cloud_path` must be provided. - :param str|None project_name: - :param str|None cloud_path: - :param str|None bucket_name: - :param str|None path_in_bucket: - :param bool update_cloud_metadata: + :param str|None project_name: name of Google Cloud project to store datafile in + :param str|None cloud_path: full path to cloud storage location to store datafile at (e.g. `gs://bucket_name/path/to/file.csv`) + :param str|None bucket_name: name of bucket to store datafile in + :param str|None path_in_bucket: cloud storage path to store datafile at (e.g. `path/to/file.csv`) + :param bool update_cloud_metadata: if `True`, update the metadata of the datafile in the cloud at upload time :return str: gs:// path for datafile """ project_name, bucket_name, path_in_bucket = self._get_cloud_location( diff --git a/octue/resources/dataset.py b/octue/resources/dataset.py index ef37bfbf2..edc753eb9 100644 --- a/octue/resources/dataset.py +++ b/octue/resources/dataset.py @@ -59,10 +59,10 @@ def from_cloud(cls, project_name, cloud_path=None, bucket_name=None, path_to_dat """Instantiate a Dataset from Google Cloud storage. Either (`bucket_name` and `path_to_dataset_directory`) or `cloud_path` must be provided. - :param str project_name: - :param str|None cloud_path: - :param str|None bucket_name: - :param str|None path_to_dataset_directory: path to dataset directory (directory containing dataset's files) + :param str project_name: name of Google Cloud project dataset is stored in + :param str|None cloud_path: full path to dataset in cloud storage (e.g. `gs://bucket_name/path/to/dataset`) + :param str|None bucket_name: name of bucket dataset is stored in + :param str|None path_to_dataset_directory: path to dataset directory (containing dataset's files) in cloud (e.g. `path/to/dataset`) :return Dataset: """ if cloud_path: @@ -96,11 +96,11 @@ def to_cloud(self, project_name, cloud_path=None, bucket_name=None, output_direc """Upload a dataset to a cloud location. Either (`bucket_name` and `output_directory`) or `cloud_path` must be provided. - :param str project_name: - :param str|None cloud_path: - :param str|None bucket_name: - :param str|None output_directory: - :return str: gs:// path for dataset + :param str project_name: name of Google Cloud project to store dataset in + :param str|None cloud_path: full cloud storage path to store dataset at (e.g. `gs://bucket_name/path/to/dataset`) + :param str|None bucket_name: name of bucket to store dataset in + :param str|None output_directory: path to output directory in cloud storage (e.g. 
`path/to/dataset`)
+        :return str: cloud path for dataset
         """
         if cloud_path:
diff --git a/octue/resources/manifest.py b/octue/resources/manifest.py
index 8525fec30..fd84e1815 100644
--- a/octue/resources/manifest.py
+++ b/octue/resources/manifest.py
@@ -49,10 +49,10 @@ def from_cloud(cls, project_name, cloud_path=None, bucket_name=None, path_to_man
         """Instantiate a Manifest from Google Cloud storage. Either (`bucket_name` and `path_to_manifest_file`) or
         `cloud_path` must be provided.
 
-        :param str project_name:
-        :param str|None cloud_path:
-        :param str|None bucket_name:
-        :param str|None path_to_manifest_file:
+        :param str project_name: name of Google Cloud project manifest is stored in
+        :param str|None cloud_path: full path to manifest in cloud storage (e.g. `gs://bucket_name/path/to/manifest.json`)
+        :param str|None bucket_name: name of bucket manifest is stored in
+        :param str|None path_to_manifest_file: path to manifest in cloud storage e.g. `path/to/manifest.json`
-        :return Dataset:
+        :return Manifest:
         """
         if cloud_path:
@@ -88,10 +88,10 @@ def to_cloud(
         """Upload a manifest to a cloud location, optionally uploading its datasets into the same directory. Either
         (`bucket_name` and `path_to_manifest_file`) or `cloud_path` must be provided.
 
-        :param str project_name:
-        :param str|None cloud_path:
-        :param str|None bucket_name:
-        :param str|None path_to_manifest_file:
+        :param str project_name: name of Google Cloud project to store manifest in
+        :param str|None cloud_path: full path to cloud storage location to store manifest at (e.g. `gs://bucket_name/path/to/manifest.json`)
+        :param str|None bucket_name: name of bucket to store manifest in
+        :param str|None path_to_manifest_file: cloud storage path to store manifest at e.g. `path/to/manifest.json`
         :param bool store_datasets: if True, upload datasets to same directory as manifest file
         :return str: gs:// path for manifest file
         """

From 6400f95c537359b1ce9630d57141e6ad5a73067e Mon Sep 17 00:00:00 2001
From: cortadocodes
Date: Wed, 2 Jun 2021 13:06:46 +0100
Subject: [PATCH 8/8] IMP: Serialise datetimes in ISO format

---
 octue/utils/encoders.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/octue/utils/encoders.py b/octue/utils/encoders.py
index 94b39d19d..59604bf27 100644
--- a/octue/utils/encoders.py
+++ b/octue/utils/encoders.py
@@ -13,7 +13,7 @@ def default(self, obj):
             return obj.serialise()
 
         if isinstance(obj, datetime.datetime):
-            return str(obj)
+            return obj.isoformat()
 
         # Otherwise let the base class default method raise the TypeError
         return TwinedEncoder.default(self, obj)
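
Series usage note: a minimal sketch of the API after all eight patches are applied. It
is illustrative only — it assumes valid Google Cloud credentials and an existing bucket,
`my-project` and `gs://my-bucket` are hypothetical names, and the import locations are
inferred from the files touched above.

    import datetime
    import json

    from octue.cloud.storage.client import GoogleCloudStorageClient
    from octue.utils.encoders import OctueJSONEncoder

    # Patches 5/8 and 8/8: datetimes are encoded as ISO 8601 strings.
    serialised = json.dumps(
        {"created": datetime.datetime(2021, 6, 2, 13, 6, 46)},
        cls=OctueJSONEncoder,
    )  # -> '{"created": "2021-06-02T13:06:46"}'

    # Patches 1/8 and 6/8: a full gs:// path can be passed as `cloud_path` in place of
    # a separate bucket name and path-in-bucket pair.
    client = GoogleCloudStorageClient(project_name="my-project")
    client.upload_from_string(string=serialised, cloud_path="gs://my-bucket/example/data.json")
    assert client.download_as_string(cloud_path="gs://my-bucket/example/data.json") == serialised

The same `cloud_path` keyword applies to the resource-level methods renamed in patch 6,
e.g. `Datafile.from_cloud(project_name, cloud_path="gs://my-bucket/example/data.json")`
and the `Dataset.to_cloud`/`Manifest.to_cloud` counterparts.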