Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TEST-#6996: Update tests in test_io.py #6997

Merged
merged 3 commits into from
Mar 5, 2024
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
154 changes: 81 additions & 73 deletions modin/pandas/test/test_io.py
Original file line number Diff line number Diff line change
Expand Up @@ -259,28 +259,36 @@ def _make_parquet_dir(
@pytest.mark.filterwarnings(default_to_pandas_ignore_string)
class TestCsv:
# delimiter tests
@pytest.mark.parametrize("sep", [None, "_", ",", ".", "\n"])
@pytest.mark.parametrize("delimiter", ["_", ",", ".", "\n"])
Comment on lines -262 to -263
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These parameters mean the same thing; using both at the same time raises an exception. I moved that case into a separate test.

@pytest.mark.parametrize("sep", ["_", ",", "."])
@pytest.mark.parametrize("decimal", [".", "_"])
@pytest.mark.parametrize("thousands", [None, ",", "_", " "])
def test_read_csv_delimiters(
self, make_csv_file, sep, delimiter, decimal, thousands
):
def test_read_csv_seps(self, make_csv_file, sep, decimal, thousands):
unique_filename = make_csv_file(
delimiter=delimiter,
delimiter=sep,
thousands_separator=thousands,
decimal_separator=decimal,
)
eval_io(
fn_name="read_csv",
# read_csv kwargs
filepath_or_buffer=unique_filename,
delimiter=delimiter,
sep=sep,
decimal=decimal,
thousands=thousands,
)

@pytest.mark.parametrize("sep", [None, "_"])
@pytest.mark.parametrize("delimiter", [".", "_"])
def test_read_csv_seps_except(self, make_csv_file, sep, delimiter):
    """Passing both `sep` and `delimiter` must fail identically in Modin and pandas.

    `sep` and `delimiter` are aliases in `read_csv`; supplying both raises an
    error, and `eval_io` checks that Modin raises the same exception pandas does.
    """
    csv_path = make_csv_file(delimiter=delimiter)
    read_kwargs = {
        "filepath_or_buffer": csv_path,
        "delimiter": delimiter,
        "sep": sep,
    }
    eval_io(fn_name="read_csv", **read_kwargs)

@pytest.mark.parametrize(
"dtype_backend", [lib.no_default, "numpy_nullable", "pyarrow"]
)
Expand All @@ -303,7 +311,7 @@ def comparator(df1, df2):
@pytest.mark.parametrize("header", ["infer", None, 0])
@pytest.mark.parametrize("index_col", [None, "col1"])
@pytest.mark.parametrize(
"names", [lib.no_default, ["col1"], ["c1", "c2", "c3", "c4", "c5", "c6", "c7"]]
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The last column name is missing; this test reproduces the same error as in pandas. It is redundant to exercise this situation against every combination of the other parameters in this test.

"names", [lib.no_default, ["col1"], ["c1", "c2", "c3", "c4", "c5", "c6"]]
)
@pytest.mark.parametrize(
"usecols", [None, ["col1"], ["col1", "col2", "col6"], [0, 1, 5]]
Expand Down Expand Up @@ -794,75 +802,64 @@ def test_read_csv_error_handling(self, on_bad_lines):
on_bad_lines=on_bad_lines,
)

@pytest.mark.parametrize("float_precision", [None, "high", "legacy", "round_trip"])
def test_python_engine_float_precision_except(self, float_precision):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you elaborate on why python engine only?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already tested python engine in test_read_csv_internal, but I moved it to a separate test. Reason: #6997 (comment)

eval_io(
fn_name="read_csv",
# read_csv kwargs
filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
engine="python",
float_precision=float_precision,
)

@pytest.mark.parametrize("low_memory", [False, True])
def test_python_engine_low_memory_except(self, low_memory):
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

same

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

We already tested python engine in test_read_csv_internal, but I moved it to a separate test. Reason: #6997 (comment)

eval_io(
fn_name="read_csv",
# read_csv kwargs
filepath_or_buffer=pytest.csvs_names["test_read_csv_regular"],
engine="python",
low_memory=low_memory,
)

@pytest.mark.parametrize("delim_whitespace", [True, False])
def test_delim_whitespace(self, delim_whitespace, tmp_path):
    """`delim_whitespace` must parse a whitespace-separated file the same way pandas does."""
    # Known failure on the HDK storage format for the whitespace-delimited case.
    if StorageFormat.get() == "Hdk" and delim_whitespace:
        pytest.xfail(reason="https://github.com/modin-project/modin/issues/6999")
    csv_text = "col1 col2 col3 col4\n5 6 7 8\n9 10 11 12\n"
    target_path = get_unique_filename(data_dir=tmp_path)
    # eval_io_from_str writes csv_text to target_path, then compares
    # Modin's and pandas' read_csv results on it.
    eval_io_from_str(csv_text, target_path, delim_whitespace=delim_whitespace)

# Internal parameters tests
@pytest.mark.parametrize("use_str_data", [True, False])
@pytest.mark.parametrize("engine", [None, "python", "c"])
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Many parameters are not implemented for the python engine (and exceptions are thrown), moved to separate tests above.

@pytest.mark.parametrize("engine", ["c"])
@pytest.mark.parametrize("delimiter", [",", " "])
@pytest.mark.parametrize("delim_whitespace", [True, False])
@pytest.mark.parametrize("low_memory", [True, False])
@pytest.mark.parametrize("memory_map", [True, False])
@pytest.mark.parametrize("float_precision", [None, "high", "round_trip"])
def test_read_csv_internal(
self,
make_csv_file,
use_str_data,
engine,
delimiter,
delim_whitespace,
low_memory,
memory_map,
float_precision,
tmp_path,
):
# In this case raised TypeError: cannot use a string pattern on a bytes-like object,
# so TypeError should be excluded from raising_exceptions list in order to check, that
# the same exceptions are raised by Pandas and Modin
case_with_TypeError_exc = (
engine == "python"
and delimiter == ","
and delim_whitespace
and low_memory
and memory_map
and float_precision is None
unique_filename = make_csv_file(delimiter=delimiter)
eval_io(
filepath_or_buffer=unique_filename,
fn_name="read_csv",
engine=engine,
delimiter=delimiter,
low_memory=low_memory,
memory_map=memory_map,
float_precision=float_precision,
)

raising_exceptions = io_ops_bad_exc # default value
if case_with_TypeError_exc:
raising_exceptions = list(io_ops_bad_exc)
raising_exceptions.remove(TypeError)

kwargs = {
"engine": engine,
"delimiter": delimiter,
"delim_whitespace": delim_whitespace,
"low_memory": low_memory,
"memory_map": memory_map,
"float_precision": float_precision,
}

if use_str_data:
str_delim_whitespaces = (
"col1 col2 col3 col4\n5 6 7 8\n9 10 11 12\n"
)
unique_filename = get_unique_filename(data_dir=tmp_path)
eval_io_from_str(
str_delim_whitespaces,
unique_filename,
raising_exceptions=raising_exceptions,
**kwargs,
)
else:
unique_filename = make_csv_file(
delimiter=delimiter,
)

eval_io(
filepath_or_buffer=unique_filename,
fn_name="read_csv",
raising_exceptions=raising_exceptions,
**kwargs,
)

# Issue related, specific or corner cases
@pytest.mark.parametrize("nrows", [2, None])
def test_read_csv_bad_quotes(self, nrows):
Expand Down Expand Up @@ -2650,6 +2647,9 @@ def test_to_sql(self, tmp_path, make_sql_connection, index, conn_type):
assert df_modin_sql.sort_index().equals(df_pandas_sql.sort_index())


@pytest.mark.skipif(
StorageFormat.get() == "Hdk", reason="Missing optional dependency 'lxml'."
)
@pytest.mark.filterwarnings(default_to_pandas_ignore_string)
class TestHtml:
def test_read_html(self, make_html_file):
Expand Down Expand Up @@ -2756,20 +2756,19 @@ def test_fwf_file_usecols(self, make_fwf_file, usecols):
"dtype_backend", [lib.no_default, "numpy_nullable", "pyarrow"]
)
def test_read_fwf_dtype_backend(self, make_fwf_file, dtype_backend):
with ensure_clean(".fwf") as unique_filename:
make_fwf_file(filename=unique_filename)
Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

make_fwf_file fixture doesn't actually take a filename as a parameter.

unique_filename = make_fwf_file()

def comparator(df1, df2):
df_equals(df1, df2)
df_equals(df1.dtypes, df2.dtypes)
def comparator(df1, df2):
df_equals(df1, df2)
df_equals(df1.dtypes, df2.dtypes)

eval_io(
fn_name="read_fwf",
# read_csv kwargs
filepath_or_buffer=unique_filename,
dtype_backend=dtype_backend,
comparator=comparator,
)
eval_io(
fn_name="read_fwf",
# read_csv kwargs
filepath_or_buffer=unique_filename,
dtype_backend=dtype_backend,
comparator=comparator,
)

def test_fwf_file_chunksize(self, make_fwf_file):
unique_filename = make_fwf_file()
Expand Down Expand Up @@ -3079,6 +3078,9 @@ def test_to_pickle(self, tmp_path):
df_equals(modin_df, recreated_modin_df)


@pytest.mark.skipif(
StorageFormat.get() == "Hdk", reason="Missing optional dependency 'lxml'."
)
@pytest.mark.filterwarnings(default_to_pandas_ignore_string)
class TestXml:
def test_read_xml(self):
Expand All @@ -3095,6 +3097,12 @@ def test_read_xml(self):
<degrees>360</degrees>
<sides/>
</row>
<row>
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why is this added?

Copy link
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The example was outdated; I took the updated one from https://pandas.pydata.org/docs/reference/api/pandas.read_xml.html#pandas-read-xml.

<shape>triangle</shape>
<degrees>180</degrees>
<sides>3.0</sides>
</row>
</data>
"""
eval_io("read_xml", path_or_buffer=data)

Expand Down