diff --git a/pandas/core/algorithms.py b/pandas/core/algorithms.py index 23ed053521baf..9d0e2145567bf 100644 --- a/pandas/core/algorithms.py +++ b/pandas/core/algorithms.py @@ -450,6 +450,9 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None): unique1d = unique +_MINIMUM_COMP_ARR_LEN = 1_000_000 + + def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: """ Compute the isin boolean array. @@ -518,7 +521,7 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]: # Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array), # in1d is faster for small sizes if ( - len(comps_array) > 1_000_000 + len(comps_array) > _MINIMUM_COMP_ARR_LEN and len(values) <= 26 and comps_array.dtype != object ): diff --git a/pandas/io/formats/csvs.py b/pandas/io/formats/csvs.py index 672f7c1f71b15..3b759010d1abb 100644 --- a/pandas/io/formats/csvs.py +++ b/pandas/io/formats/csvs.py @@ -45,6 +45,9 @@ from pandas.io.formats.format import DataFrameFormatter +_DEFAULT_CHUNKSIZE_CELLS = 100_000 + + class CSVFormatter: cols: np.ndarray @@ -163,7 +166,7 @@ def _initialize_columns(self, cols: Sequence[Hashable] | None) -> np.ndarray: def _initialize_chunksize(self, chunksize: int | None) -> int: if chunksize is None: - return (100000 // (len(self.cols) or 1)) or 1 + return (_DEFAULT_CHUNKSIZE_CELLS // (len(self.cols) or 1)) or 1 return int(chunksize) @property diff --git a/pandas/tests/frame/methods/test_to_csv.py b/pandas/tests/frame/methods/test_to_csv.py index 5671a569c8ac8..ee9c4f05991a0 100644 --- a/pandas/tests/frame/methods/test_to_csv.py +++ b/pandas/tests/frame/methods/test_to_csv.py @@ -752,13 +752,16 @@ def test_to_csv_chunking(self, chunksize): tm.assert_frame_equal(rs, aa) @pytest.mark.slow - def test_to_csv_wide_frame_formatting(self): + def test_to_csv_wide_frame_formatting(self, monkeypatch): # Issue #8621 - df = DataFrame(np.random.randn(1, 100010), columns=None, index=None) + chunksize = 100 + df = DataFrame(np.random.randn(1, chunksize + 10), columns=None, index=None) with tm.ensure_clean() as filename: - df.to_csv(filename, header=False, index=False) + with monkeypatch.context() as m: + m.setattr("pandas.io.formats.csvs._DEFAULT_CHUNKSIZE_CELLS", chunksize) + df.to_csv(filename, header=False, index=False) rs = read_csv(filename, header=None) - tm.assert_frame_equal(rs, df) + tm.assert_frame_equal(rs, df) def test_to_csv_bug(self): f1 = StringIO("a,1.0\nb,2.0") diff --git a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py index 932457eebcd8e..e0868745a480a 100644 --- a/pandas/tests/indexing/multiindex/test_chaining_and_caching.py +++ b/pandas/tests/indexing/multiindex/test_chaining_and_caching.py @@ -70,13 +70,11 @@ def test_indexer_caching(): # GH5727 # make sure that indexers are in the _internal_names_set n = 1000001 - arrays = (range(n), range(n)) - index = MultiIndex.from_tuples(zip(*arrays)) + index = MultiIndex.from_arrays([np.arange(n), np.arange(n)]) s = Series(np.zeros(n), index=index) str(s) # setitem expected = Series(np.ones(n), index=index) - s = Series(np.zeros(n), index=index) s[s == 0] = 1 tm.assert_series_equal(s, expected) diff --git a/pandas/tests/io/parser/test_index_col.py b/pandas/tests/io/parser/test_index_col.py index cd12ecc4fc7cc..4b0567d6265ad 100644 --- a/pandas/tests/io/parser/test_index_col.py +++ b/pandas/tests/io/parser/test_index_col.py @@ -228,16 +228,18 @@ def test_header_with_index_col(all_parsers): @pytest.mark.slow -def test_index_col_large_csv(all_parsers): +def test_index_col_large_csv(all_parsers, monkeypatch): # https://github.com/pandas-dev/pandas/issues/37094 parser = all_parsers - N = 1_000_001 - df = DataFrame({"a": range(N), "b": np.random.randn(N)}) + ARR_LEN = 100 + df = DataFrame({"a": range(ARR_LEN + 1), "b": np.random.randn(ARR_LEN + 1)}) with tm.ensure_clean() as path: df.to_csv(path, index=False) - result = parser.read_csv(path, index_col=[0]) + with monkeypatch.context() as m: + m.setattr("pandas.core.algorithms._MINIMUM_COMP_ARR_LEN", ARR_LEN) + result = parser.read_csv(path, index_col=[0]) tm.assert_frame_equal(result, df.set_index("a")) diff --git a/pandas/tests/plotting/frame/test_frame.py b/pandas/tests/plotting/frame/test_frame.py index 01762e39c36c1..52fbfc23ef66c 100644 --- a/pandas/tests/plotting/frame/test_frame.py +++ b/pandas/tests/plotting/frame/test_frame.py @@ -423,44 +423,60 @@ def test_line_area_stacked(self, kind): df2 = df.set_index(df.index + 1) _check_plot_works(df2.plot, kind=kind, logx=True, stacked=True) - def test_line_area_nan_df(self): + @pytest.mark.parametrize( + "idx", [range(4), date_range("2023-01-1", freq="D", periods=4)] + ) + def test_line_area_nan_df(self, idx): values1 = [1, 2, np.nan, 3] values2 = [3, np.nan, 2, 1] - df = DataFrame({"a": values1, "b": values2}) - tdf = DataFrame({"a": values1, "b": values2}, index=tm.makeDateIndex(k=4)) - - for d in [df, tdf]: - ax = _check_plot_works(d.plot) - masked1 = ax.lines[0].get_ydata() - masked2 = ax.lines[1].get_ydata() - # remove nan for comparison purpose - - exp = np.array([1, 2, 3], dtype=np.float64) - tm.assert_numpy_array_equal(np.delete(masked1.data, 2), exp) - - exp = np.array([3, 2, 1], dtype=np.float64) - tm.assert_numpy_array_equal(np.delete(masked2.data, 1), exp) - tm.assert_numpy_array_equal( - masked1.mask, np.array([False, False, True, False]) - ) - tm.assert_numpy_array_equal( - masked2.mask, np.array([False, True, False, False]) - ) + df = DataFrame({"a": values1, "b": values2}, index=idx) - expected1 = np.array([1, 2, 0, 3], dtype=np.float64) - expected2 = np.array([3, 0, 2, 1], dtype=np.float64) + ax = _check_plot_works(df.plot) + masked1 = ax.lines[0].get_ydata() + masked2 = ax.lines[1].get_ydata() + # remove nan for comparison purpose - ax = _check_plot_works(d.plot, stacked=True) - tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) - tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) + exp = np.array([1, 2, 3], dtype=np.float64) + tm.assert_numpy_array_equal(np.delete(masked1.data, 2), exp) - ax = _check_plot_works(d.plot.area) - tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) - tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) + exp = np.array([3, 2, 1], dtype=np.float64) + tm.assert_numpy_array_equal(np.delete(masked2.data, 1), exp) + tm.assert_numpy_array_equal(masked1.mask, np.array([False, False, True, False])) + tm.assert_numpy_array_equal(masked2.mask, np.array([False, True, False, False])) + + @pytest.mark.parametrize( + "idx", [range(4), date_range("2023-01-1", freq="D", periods=4)] + ) + def test_line_area_nan_df_stacked(self, idx): + values1 = [1, 2, np.nan, 3] + values2 = [3, np.nan, 2, 1] + df = DataFrame({"a": values1, "b": values2}, index=idx) + + expected1 = np.array([1, 2, 0, 3], dtype=np.float64) + expected2 = np.array([3, 0, 2, 1], dtype=np.float64) + + ax = _check_plot_works(df.plot, stacked=True) + tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) + tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) + + @pytest.mark.parametrize( + "idx", [range(4), date_range("2023-01-1", freq="D", periods=4)] + ) + @pytest.mark.parametrize("kwargs", [{}, {"stacked": False}]) + def test_line_area_nan_df_stacked_area(self, idx, kwargs): + values1 = [1, 2, np.nan, 3] + values2 = [3, np.nan, 2, 1] + df = DataFrame({"a": values1, "b": values2}, index=idx) + + expected1 = np.array([1, 2, 0, 3], dtype=np.float64) + expected2 = np.array([3, 0, 2, 1], dtype=np.float64) - ax = _check_plot_works(d.plot.area, stacked=False) - tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) + ax = _check_plot_works(df.plot.area, **kwargs) + tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1) + if kwargs: tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected2) + else: + tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2) def test_line_lim(self): df = DataFrame(np.random.rand(6, 3), columns=["x", "y", "z"]) @@ -1537,27 +1553,31 @@ def test_errorbar_with_integer_column_names(self): _check_has_errorbars(ax, xerr=0, yerr=1) @pytest.mark.slow - def test_errorbar_with_partial_columns(self): + @pytest.mark.parametrize("kind", ["line", "bar"]) + def test_errorbar_with_partial_columns_kind(self, kind): df = DataFrame(np.abs(np.random.randn(10, 3))) df_err = DataFrame(np.abs(np.random.randn(10, 2)), columns=[0, 2]) - kinds = ["line", "bar"] - for kind in kinds: - ax = _check_plot_works(df.plot, yerr=df_err, kind=kind) - _check_has_errorbars(ax, xerr=0, yerr=2) + ax = _check_plot_works(df.plot, yerr=df_err, kind=kind) + _check_has_errorbars(ax, xerr=0, yerr=2) + @pytest.mark.slow + def test_errorbar_with_partial_columns_dti(self): + df = DataFrame(np.abs(np.random.randn(10, 3))) + df_err = DataFrame(np.abs(np.random.randn(10, 2)), columns=[0, 2]) ix = date_range("1/1/2000", periods=10, freq="M") df.set_index(ix, inplace=True) df_err.set_index(ix, inplace=True) ax = _check_plot_works(df.plot, yerr=df_err, kind="line") _check_has_errorbars(ax, xerr=0, yerr=2) + @pytest.mark.slow + @pytest.mark.parametrize("err_box", [lambda x: x, DataFrame]) + def test_errorbar_with_partial_columns_box(self, err_box): d = {"x": np.arange(12), "y": np.arange(12, 0, -1)} df = DataFrame(d) - d_err = {"x": np.ones(12) * 0.2, "z": np.ones(12) * 0.4} - df_err = DataFrame(d_err) - for err in [d_err, df_err]: - ax = _check_plot_works(df.plot, yerr=err) - _check_has_errorbars(ax, xerr=0, yerr=1) + err = err_box({"x": np.ones(12) * 0.2, "z": np.ones(12) * 0.4}) + ax = _check_plot_works(df.plot, yerr=err) + _check_has_errorbars(ax, xerr=0, yerr=1) @pytest.mark.parametrize("kind", ["line", "bar", "barh"]) def test_errorbar_timeseries(self, kind): diff --git a/pandas/tests/plotting/frame/test_frame_subplots.py b/pandas/tests/plotting/frame/test_frame_subplots.py index 336fed6293070..9546731b0d8fa 100644 --- a/pandas/tests/plotting/frame/test_frame_subplots.py +++ b/pandas/tests/plotting/frame/test_frame_subplots.py @@ -245,15 +245,11 @@ def test_subplots_layout_single_column( assert axes.shape == expected_shape @pytest.mark.slow - def test_subplots_warnings(self): + @pytest.mark.parametrize("idx", [range(5), date_range("1/1/2000", periods=5)]) + def test_subplots_warnings(self, idx): # GH 9464 with tm.assert_produces_warning(None): - df = DataFrame(np.random.randn(100, 4)) - df.plot(subplots=True, layout=(3, 2)) - - df = DataFrame( - np.random.randn(100, 4), index=date_range("1/1/2000", periods=100) - ) + df = DataFrame(np.random.randn(5, 4), index=idx) df.plot(subplots=True, layout=(3, 2)) def test_subplots_multiple_axes(self): diff --git a/pandas/tests/plotting/test_misc.py b/pandas/tests/plotting/test_misc.py index f6b50aeb3139d..9c5de2918b9a4 100644 --- a/pandas/tests/plotting/test_misc.py +++ b/pandas/tests/plotting/test_misc.py @@ -218,28 +218,27 @@ def test_andrews_curves_handle(self): _check_colors(handles, linecolors=colors) @pytest.mark.slow - def test_parallel_coordinates(self, iris): - from matplotlib import cm - + @pytest.mark.parametrize( + "color", + [("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]], + ) + def test_parallel_coordinates_colors(self, iris, color): from pandas.plotting import parallel_coordinates df = iris - ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") - nlines = len(ax.get_lines()) - nxticks = len(ax.xaxis.get_ticklabels()) - - rgba = ("#556270", "#4ECDC4", "#C7F464") ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", color=rgba + parallel_coordinates, frame=df, class_column="Name", color=color ) - _check_colors(ax.get_lines()[:10], linecolors=rgba, mapping=df["Name"][:10]) + _check_colors(ax.get_lines()[:10], linecolors=color, mapping=df["Name"][:10]) - cnames = ["dodgerblue", "aquamarine", "seagreen"] - ax = _check_plot_works( - parallel_coordinates, frame=df, class_column="Name", color=cnames - ) - _check_colors(ax.get_lines()[:10], linecolors=cnames, mapping=df["Name"][:10]) + @pytest.mark.slow + def test_parallel_coordinates_cmap(self, iris): + from matplotlib import cm + + from pandas.plotting import parallel_coordinates + + df = iris ax = _check_plot_works( parallel_coordinates, frame=df, class_column="Name", colormap=cm.jet @@ -247,15 +246,30 @@ def test_parallel_coordinates(self, iris): cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())] _check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10]) + @pytest.mark.slow + def test_parallel_coordinates_line_diff(self, iris): + from pandas.plotting import parallel_coordinates + + df = iris + + ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name") + nlines = len(ax.get_lines()) + nxticks = len(ax.xaxis.get_ticklabels()) + ax = _check_plot_works( parallel_coordinates, frame=df, class_column="Name", axvlines=False ) assert len(ax.get_lines()) == (nlines - nxticks) + @pytest.mark.slow + def test_parallel_coordinates_handles(self, iris): + from pandas.plotting import parallel_coordinates + + df = iris colors = ["b", "g", "r"] df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors}) ax = parallel_coordinates(df, "Name", color=colors) - handles, labels = ax.get_legend_handles_labels() + handles, _ = ax.get_legend_handles_labels() _check_colors(handles, linecolors=colors) # not sure if this is indicative of a problem