diff --git a/opteryx/__version__.py b/opteryx/__version__.py index 13b65c90..7b474b29 100644 --- a/opteryx/__version__.py +++ b/opteryx/__version__.py @@ -1,4 +1,4 @@ -__build__ = 238 +__build__ = 240 # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. diff --git a/opteryx/connectors/aws_s3_connector.py b/opteryx/connectors/aws_s3_connector.py index 6154456d..329b83c1 100644 --- a/opteryx/connectors/aws_s3_connector.py +++ b/opteryx/connectors/aws_s3_connector.py @@ -71,7 +71,9 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]: bucket + "/" + blob.object_name for blob in blobs if not blob.object_name.endswith("/") ) - return [blob for blob in blobs if ("." + blob.split(".")[-1].lower()) in VALID_EXTENSIONS] + return sorted( + blob for blob in blobs if ("." + blob.split(".")[-1].lower()) in VALID_EXTENSIONS + ) def read_dataset(self, columns: list = None, **kwargs) -> pyarrow.Table: blob_names = self.partition_scheme.get_blobs_in_partition( diff --git a/opteryx/connectors/disk_connector.py b/opteryx/connectors/disk_connector.py index f93f9a29..575ce78a 100644 --- a/opteryx/connectors/disk_connector.py +++ b/opteryx/connectors/disk_connector.py @@ -101,12 +101,12 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]: Returns: A list of blob filenames. """ - return [ + return sorted( os.path.join(root, file) for root, _, files in os.walk(prefix) for file in files if os.path.splitext(file)[1] in VALID_EXTENSIONS - ] + ) def read_dataset( self, columns: list = None, predicates: list = None, just_schema: bool = False, **kwargs diff --git a/opteryx/connectors/gcp_cloudstorage_connector.py b/opteryx/connectors/gcp_cloudstorage_connector.py index 2df9f5ba..7f4d3afd 100644 --- a/opteryx/connectors/gcp_cloudstorage_connector.py +++ b/opteryx/connectors/gcp_cloudstorage_connector.py @@ -135,12 +135,12 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]: raise Exception(f"Error fetching blob list: {response.text}") blob_data = response.json() - blob_names = [ + blob_names = sorted( f"{bucket}/{blob['name']}" for blob in blob_data.get("items", []) if not blob["name"].endswith("/") and any(blob["name"].endswith(ext) for ext in VALID_EXTENSIONS) - ] + ) return blob_names