Merge pull request #1403 from mabel-dev/#1402

#1402
mabel-dev · Jan 26, 2024 · c7e06d7
2 parents 8844a48 + 72d8e55
commit c7e06d7
Show file tree

Hide file tree

Showing 4 changed files with 8 additions and 6 deletions.
diff --git a/opteryx/__version__.py b/opteryx/__version__.py
@@ -1,4 +1,4 @@
-__build__ = 238
+__build__ = 240
 
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.

diff --git a/opteryx/connectors/aws_s3_connector.py b/opteryx/connectors/aws_s3_connector.py
@@ -71,7 +71,9 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
             bucket + "/" + blob.object_name for blob in blobs if not blob.object_name.endswith("/")
         )
 
-        return [blob for blob in blobs if ("." + blob.split(".")[-1].lower()) in VALID_EXTENSIONS]
+        return sorted(
+            blob for blob in blobs if ("." + blob.split(".")[-1].lower()) in VALID_EXTENSIONS
+        )
 
     def read_dataset(self, columns: list = None, **kwargs) -> pyarrow.Table:
         blob_names = self.partition_scheme.get_blobs_in_partition(

diff --git a/opteryx/connectors/disk_connector.py b/opteryx/connectors/disk_connector.py
@@ -101,12 +101,12 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
         Returns:
             A list of blob filenames.
         """
-        return [
+        return sorted(
             os.path.join(root, file)
             for root, _, files in os.walk(prefix)
             for file in files
             if os.path.splitext(file)[1] in VALID_EXTENSIONS
-        ]
+        )
 
     def read_dataset(
         self, columns: list = None, predicates: list = None, just_schema: bool = False, **kwargs

diff --git a/opteryx/connectors/gcp_cloudstorage_connector.py b/opteryx/connectors/gcp_cloudstorage_connector.py
@@ -135,12 +135,12 @@ def get_list_of_blob_names(self, *, prefix: str) -> List[str]:
             raise Exception(f"Error fetching blob list: {response.text}")
 
         blob_data = response.json()
-        blob_names = [
+        blob_names = sorted(
             f"{bucket}/{blob['name']}"
             for blob in blob_data.get("items", [])
             if not blob["name"].endswith("/")
             and any(blob["name"].endswith(ext) for ext in VALID_EXTENSIONS)
-        ]
+        )
 
         return blob_names