-
Notifications
You must be signed in to change notification settings - Fork 651
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
TEST-#6830: Use local s3 server instead of public s3 buckets #6863
Changes from all commits
8f4c182
d5cf2ee
10ab1f9
2a0f94c
9139768
8b7fd48
a63401a
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -47,7 +47,9 @@ def _read(cls, path_or_buf, **kwargs): | |
path_or_buf = stringify_path(path_or_buf) | ||
path_or_buf = cls.get_path_or_buffer(path_or_buf) | ||
if isinstance(path_or_buf, str): | ||
if not cls.file_exists(path_or_buf): | ||
if not cls.file_exists( | ||
path_or_buf, storage_options=kwargs.get("storage_options") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use of `storage_options` parameter was missed |
||
): | ||
return cls.single_worker_read( | ||
path_or_buf, reason=cls._file_not_found_msg(path_or_buf), **kwargs | ||
) | ||
|
@@ -60,12 +62,21 @@ def _read(cls, path_or_buf, **kwargs): | |
return cls.single_worker_read( | ||
path_or_buf, reason="`lines` argument not supported", **kwargs | ||
) | ||
with OpenFile(path_or_buf, "rb") as f: | ||
with OpenFile( | ||
path_or_buf, | ||
"rb", | ||
**(kwargs.get("storage_options", None) or {}), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use of `storage_options` parameter was missed |
||
) as f: | ||
columns = pandas.read_json(BytesIO(b"" + f.readline()), lines=True).columns | ||
kwargs["columns"] = columns | ||
empty_pd_df = pandas.DataFrame(columns=columns) | ||
|
||
with OpenFile(path_or_buf, "rb", kwargs.get("compression", "infer")) as f: | ||
with OpenFile( | ||
path_or_buf, | ||
"rb", | ||
kwargs.get("compression", "infer"), | ||
**(kwargs.get("storage_options", None) or {}), | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use of `storage_options` parameter was missed |
||
) as f: | ||
column_widths, num_splits = cls._define_metadata(empty_pd_df, columns) | ||
args = {"fname": path_or_buf, "num_splits": num_splits, **kwargs} | ||
splits, _ = cls.partitioned_file( | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -68,7 +68,9 @@ def _read(cls, filepath_or_buffer, **kwargs): | |
reason=cls._file_not_found_msg(filepath_or_buffer), | ||
**kwargs, | ||
) | ||
filepath_or_buffer = cls.get_path(filepath_or_buffer) | ||
filepath_or_buffer = cls.get_path( | ||
filepath_or_buffer, kwargs.get("storage_options") | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use of `storage_options` parameter was missed |
||
) | ||
elif not cls.pathlib_or_pypath(filepath_or_buffer): | ||
return cls.single_worker_read( | ||
filepath_or_buffer, | ||
|
@@ -314,14 +316,16 @@ def file_exists(cls, file_path: str, storage_options=None) -> bool: | |
return exists or len(fs.glob(file_path)) > 0 | ||
|
||
@classmethod | ||
def get_path(cls, file_path: str) -> list: | ||
def get_path(cls, file_path: str, storage_options=None) -> list: | ||
""" | ||
Return the path of the file(s). | ||
|
||
Parameters | ||
---------- | ||
file_path : str | ||
String representing a path. | ||
storage_options : dict, optional | ||
Keyword from `read_*` functions. | ||
|
||
Returns | ||
------- | ||
|
@@ -363,11 +367,17 @@ def get_file_path(fs_handle) -> List[str]: | |
fs_addresses = [fs_handle.unstrip_protocol(path) for path in file_paths] | ||
return fs_addresses | ||
|
||
fs, _ = fsspec.core.url_to_fs(file_path) | ||
if storage_options is not None: | ||
new_storage_options = dict(storage_options) | ||
new_storage_options.pop("anon", None) | ||
else: | ||
new_storage_options = {} | ||
|
||
fs, _ = fsspec.core.url_to_fs(file_path, **new_storage_options) | ||
Comment on lines +370 to +376
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Made by analogy with other functions. |
||
try: | ||
return get_file_path(fs) | ||
except credential_error_type: | ||
fs, _ = fsspec.core.url_to_fs(file_path, anon=True) | ||
fs, _ = fsspec.core.url_to_fs(file_path, anon=True, **new_storage_options) | ||
return get_file_path(fs) | ||
|
||
@classmethod | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -180,11 +180,15 @@ def test_read_single_csv_with_parse_dates(self, parse_dates): | |
@pytest.mark.parametrize( | ||
"path", | ||
[ | ||
"s3://modin-datasets/testing/multiple_csv/test_data*.csv", | ||
"s3://modin-test/modin-bugs/multiple_csv/test_data*.csv", | ||
"gs://modin-testing/testing/multiple_csv/test_data*.csv", | ||
], | ||
) | ||
def test_read_multiple_csv_cloud_store(path): | ||
def test_read_multiple_csv_cloud_store(path, s3_resource, s3_storage_options): | ||
storage_options_new = {"anon": True} | ||
if path.startswith("s3"): | ||
storage_options_new = s3_storage_options | ||
|
||
def _pandas_read_csv_glob(path, storage_options): | ||
pandas_dfs = [ | ||
pandas.read_csv( | ||
|
@@ -200,7 +204,7 @@ def _pandas_read_csv_glob(path, storage_options): | |
lambda module, **kwargs: pd.read_csv_glob(path, **kwargs).reset_index(drop=True) | ||
if hasattr(module, "read_csv_glob") | ||
else _pandas_read_csv_glob(path, **kwargs), | ||
storage_options={"anon": True}, | ||
storage_options=storage_options_new, | ||
) | ||
|
||
|
||
|
@@ -212,17 +216,19 @@ def _pandas_read_csv_glob(path, storage_options): | |
reason=f"{Engine.get()} does not have experimental API", | ||
) | ||
@pytest.mark.parametrize( | ||
"storage_options", | ||
[{"anon": False}, {"anon": True}, {"key": "123", "secret": "123"}, None], | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. We can't use There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. What if we want to add a test case for There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. At least, do not use our own s3 server, since in this case we always need to explicitly specify endpoint. |
||
"storage_options_extra", | ||
[{"anon": False}, {"anon": True}, {"key": "123", "secret": "123"}], | ||
) | ||
def test_read_multiple_csv_s3_storage_opts(storage_options): | ||
path = "s3://modin-datasets/testing/multiple_csv/" | ||
def test_read_multiple_csv_s3_storage_opts( | ||
s3_resource, s3_storage_options, storage_options_extra | ||
): | ||
s3_path = "s3://modin-test/modin-bugs/multiple_csv/" | ||
|
||
def _pandas_read_csv_glob(path, storage_options): | ||
pandas_df = pandas.concat( | ||
[ | ||
pandas.read_csv( | ||
f"{path}test_data{i}.csv", | ||
f"{s3_path}test_data{i}.csv", | ||
storage_options=storage_options, | ||
) | ||
for i in range(2) | ||
|
@@ -233,10 +239,10 @@ def _pandas_read_csv_glob(path, storage_options): | |
eval_general( | ||
pd, | ||
pandas, | ||
lambda module, **kwargs: pd.read_csv_glob(path, **kwargs) | ||
lambda module, **kwargs: pd.read_csv_glob(s3_path, **kwargs) | ||
if hasattr(module, "read_csv_glob") | ||
else _pandas_read_csv_glob(path, **kwargs), | ||
storage_options=storage_options, | ||
else _pandas_read_csv_glob(s3_path, **kwargs), | ||
storage_options=s3_storage_options | storage_options_extra, | ||
) | ||
|
||
|
||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
a,b,c | ||
0,True,x | ||
1,False,y | ||
2,True,z | ||
3,False,w |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,5 @@ | ||
a,b,c | ||
4,True,m | ||
5,False,n | ||
6,True,t | ||
7,True,l |
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,6 @@ | ||
{"Duration":60,"Pulse":110,"Maxpulse":130,"Calories":409} | ||
{"Duration":60,"Pulse":117,"Maxpulse":145,"Calories":479} | ||
{"Duration":60,"Pulse":103,"Maxpulse":135,"Calories":340} | ||
{"Duration":45,"Pulse":109,"Maxpulse":175,"Calories":282} | ||
{"Duration":45,"Pulse":117,"Maxpulse":148,"Calories":406} | ||
{"Duration":60,"Pulse":102,"Maxpulse":127,"Calories":300} |
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -44,6 +44,8 @@ def test_line_endings(): | |
if any(i in subdir for i in [".git", ".idea", "__pycache__"]): | ||
continue | ||
for file in files: | ||
if file.endswith(".parquet"): | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. In binary format there is no need to check these characters ( There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Why? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. This sequence of characters may have a different meaning for binary formats. If, for example, you try to replace |
||
continue | ||
filepath = os.path.join(subdir, file) | ||
with open(filepath, "rb+") as f: | ||
file_contents = f.read() | ||
|
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Use of `storage_options` parameter was missed.