Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion pandas/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -450,6 +450,9 @@ def unique_with_mask(values, mask: npt.NDArray[np.bool_] | None = None):
unique1d = unique


_MINIMUM_COMP_ARR_LEN = 1_000_000


def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
"""
Compute the isin boolean array.
Expand Down Expand Up @@ -518,7 +521,7 @@ def isin(comps: ListLike, values: ListLike) -> npt.NDArray[np.bool_]:
# Albeit hashmap has O(1) look-up (vs. O(logn) in sorted array),
# in1d is faster for small sizes
if (
len(comps_array) > 1_000_000
len(comps_array) > _MINIMUM_COMP_ARR_LEN
and len(values) <= 26
and comps_array.dtype != object
):
Expand Down
5 changes: 4 additions & 1 deletion pandas/io/formats/csvs.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,6 +45,9 @@
from pandas.io.formats.format import DataFrameFormatter


_DEFAULT_CHUNKSIZE_CELLS = 100_000


class CSVFormatter:
cols: np.ndarray

Expand Down Expand Up @@ -163,7 +166,7 @@ def _initialize_columns(self, cols: Sequence[Hashable] | None) -> np.ndarray:

def _initialize_chunksize(self, chunksize: int | None) -> int:
if chunksize is None:
return (100000 // (len(self.cols) or 1)) or 1
return (_DEFAULT_CHUNKSIZE_CELLS // (len(self.cols) or 1)) or 1
return int(chunksize)

@property
Expand Down
11 changes: 7 additions & 4 deletions pandas/tests/frame/methods/test_to_csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -752,13 +752,16 @@ def test_to_csv_chunking(self, chunksize):
tm.assert_frame_equal(rs, aa)

@pytest.mark.slow
def test_to_csv_wide_frame_formatting(self):
def test_to_csv_wide_frame_formatting(self, monkeypatch):
# Issue #8621
df = DataFrame(np.random.randn(1, 100010), columns=None, index=None)
chunksize = 100
df = DataFrame(np.random.randn(1, chunksize + 10), columns=None, index=None)
with tm.ensure_clean() as filename:
df.to_csv(filename, header=False, index=False)
with monkeypatch.context() as m:
m.setattr("pandas.io.formats.csvs._DEFAULT_CHUNKSIZE_CELLS", chunksize)
df.to_csv(filename, header=False, index=False)
rs = read_csv(filename, header=None)
tm.assert_frame_equal(rs, df)
tm.assert_frame_equal(rs, df)

def test_to_csv_bug(self):
f1 = StringIO("a,1.0\nb,2.0")
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -70,13 +70,11 @@ def test_indexer_caching():
# GH5727
# make sure that indexers are in the _internal_names_set
n = 1000001
arrays = (range(n), range(n))
index = MultiIndex.from_tuples(zip(*arrays))
index = MultiIndex.from_arrays([np.arange(n), np.arange(n)])
s = Series(np.zeros(n), index=index)
str(s)

# setitem
expected = Series(np.ones(n), index=index)
s = Series(np.zeros(n), index=index)
s[s == 0] = 1
tm.assert_series_equal(s, expected)
10 changes: 6 additions & 4 deletions pandas/tests/io/parser/test_index_col.py
Original file line number Diff line number Diff line change
Expand Up @@ -228,16 +228,18 @@ def test_header_with_index_col(all_parsers):


@pytest.mark.slow
def test_index_col_large_csv(all_parsers):
def test_index_col_large_csv(all_parsers, monkeypatch):
# https://github.com/pandas-dev/pandas/issues/37094
parser = all_parsers

N = 1_000_001
df = DataFrame({"a": range(N), "b": np.random.randn(N)})
ARR_LEN = 100
df = DataFrame({"a": range(ARR_LEN + 1), "b": np.random.randn(ARR_LEN + 1)})

with tm.ensure_clean() as path:
df.to_csv(path, index=False)
result = parser.read_csv(path, index_col=[0])
with monkeypatch.context() as m:
m.setattr("pandas.core.algorithms._MINIMUM_COMP_ARR_LEN", ARR_LEN)
result = parser.read_csv(path, index_col=[0])

tm.assert_frame_equal(result, df.set_index("a"))

Expand Down
102 changes: 61 additions & 41 deletions pandas/tests/plotting/frame/test_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -423,44 +423,60 @@ def test_line_area_stacked(self, kind):
df2 = df.set_index(df.index + 1)
_check_plot_works(df2.plot, kind=kind, logx=True, stacked=True)

def test_line_area_nan_df(self):
@pytest.mark.parametrize(
"idx", [range(4), date_range("2023-01-1", freq="D", periods=4)]
)
def test_line_area_nan_df(self, idx):
values1 = [1, 2, np.nan, 3]
values2 = [3, np.nan, 2, 1]
df = DataFrame({"a": values1, "b": values2})
tdf = DataFrame({"a": values1, "b": values2}, index=tm.makeDateIndex(k=4))

for d in [df, tdf]:
ax = _check_plot_works(d.plot)
masked1 = ax.lines[0].get_ydata()
masked2 = ax.lines[1].get_ydata()
# remove nan for comparison purpose

exp = np.array([1, 2, 3], dtype=np.float64)
tm.assert_numpy_array_equal(np.delete(masked1.data, 2), exp)

exp = np.array([3, 2, 1], dtype=np.float64)
tm.assert_numpy_array_equal(np.delete(masked2.data, 1), exp)
tm.assert_numpy_array_equal(
masked1.mask, np.array([False, False, True, False])
)
tm.assert_numpy_array_equal(
masked2.mask, np.array([False, True, False, False])
)
df = DataFrame({"a": values1, "b": values2}, index=idx)

expected1 = np.array([1, 2, 0, 3], dtype=np.float64)
expected2 = np.array([3, 0, 2, 1], dtype=np.float64)
ax = _check_plot_works(df.plot)
masked1 = ax.lines[0].get_ydata()
masked2 = ax.lines[1].get_ydata()
# remove nan for comparison purpose

ax = _check_plot_works(d.plot, stacked=True)
tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2)
exp = np.array([1, 2, 3], dtype=np.float64)
tm.assert_numpy_array_equal(np.delete(masked1.data, 2), exp)

ax = _check_plot_works(d.plot.area)
tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2)
exp = np.array([3, 2, 1], dtype=np.float64)
tm.assert_numpy_array_equal(np.delete(masked2.data, 1), exp)
tm.assert_numpy_array_equal(masked1.mask, np.array([False, False, True, False]))
tm.assert_numpy_array_equal(masked2.mask, np.array([False, True, False, False]))

@pytest.mark.parametrize(
"idx", [range(4), date_range("2023-01-1", freq="D", periods=4)]
)
def test_line_area_nan_df_stacked(self, idx):
values1 = [1, 2, np.nan, 3]
values2 = [3, np.nan, 2, 1]
df = DataFrame({"a": values1, "b": values2}, index=idx)

expected1 = np.array([1, 2, 0, 3], dtype=np.float64)
expected2 = np.array([3, 0, 2, 1], dtype=np.float64)

ax = _check_plot_works(df.plot, stacked=True)
tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2)

@pytest.mark.parametrize(
"idx", [range(4), date_range("2023-01-1", freq="D", periods=4)]
)
@pytest.mark.parametrize("kwargs", [{}, {"stacked": False}])
def test_line_area_nan_df_stacked_area(self, idx, kwargs):
values1 = [1, 2, np.nan, 3]
values2 = [3, np.nan, 2, 1]
df = DataFrame({"a": values1, "b": values2}, index=idx)

expected1 = np.array([1, 2, 0, 3], dtype=np.float64)
expected2 = np.array([3, 0, 2, 1], dtype=np.float64)

ax = _check_plot_works(d.plot.area, stacked=False)
tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
ax = _check_plot_works(df.plot.area, **kwargs)
tm.assert_numpy_array_equal(ax.lines[0].get_ydata(), expected1)
if kwargs:
tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected2)
else:
tm.assert_numpy_array_equal(ax.lines[1].get_ydata(), expected1 + expected2)

def test_line_lim(self):
df = DataFrame(np.random.rand(6, 3), columns=["x", "y", "z"])
Expand Down Expand Up @@ -1537,27 +1553,31 @@ def test_errorbar_with_integer_column_names(self):
_check_has_errorbars(ax, xerr=0, yerr=1)

@pytest.mark.slow
def test_errorbar_with_partial_columns(self):
@pytest.mark.parametrize("kind", ["line", "bar"])
def test_errorbar_with_partial_columns_kind(self, kind):
df = DataFrame(np.abs(np.random.randn(10, 3)))
df_err = DataFrame(np.abs(np.random.randn(10, 2)), columns=[0, 2])
kinds = ["line", "bar"]
for kind in kinds:
ax = _check_plot_works(df.plot, yerr=df_err, kind=kind)
_check_has_errorbars(ax, xerr=0, yerr=2)
ax = _check_plot_works(df.plot, yerr=df_err, kind=kind)
_check_has_errorbars(ax, xerr=0, yerr=2)

@pytest.mark.slow
def test_errorbar_with_partial_columns_dti(self):
df = DataFrame(np.abs(np.random.randn(10, 3)))
df_err = DataFrame(np.abs(np.random.randn(10, 2)), columns=[0, 2])
ix = date_range("1/1/2000", periods=10, freq="M")
df.set_index(ix, inplace=True)
df_err.set_index(ix, inplace=True)
ax = _check_plot_works(df.plot, yerr=df_err, kind="line")
_check_has_errorbars(ax, xerr=0, yerr=2)

@pytest.mark.slow
@pytest.mark.parametrize("err_box", [lambda x: x, DataFrame])
def test_errorbar_with_partial_columns_box(self, err_box):
d = {"x": np.arange(12), "y": np.arange(12, 0, -1)}
df = DataFrame(d)
d_err = {"x": np.ones(12) * 0.2, "z": np.ones(12) * 0.4}
df_err = DataFrame(d_err)
for err in [d_err, df_err]:
ax = _check_plot_works(df.plot, yerr=err)
_check_has_errorbars(ax, xerr=0, yerr=1)
err = err_box({"x": np.ones(12) * 0.2, "z": np.ones(12) * 0.4})
ax = _check_plot_works(df.plot, yerr=err)
_check_has_errorbars(ax, xerr=0, yerr=1)

@pytest.mark.parametrize("kind", ["line", "bar", "barh"])
def test_errorbar_timeseries(self, kind):
Expand Down
10 changes: 3 additions & 7 deletions pandas/tests/plotting/frame/test_frame_subplots.py
Original file line number Diff line number Diff line change
Expand Up @@ -245,15 +245,11 @@ def test_subplots_layout_single_column(
assert axes.shape == expected_shape

@pytest.mark.slow
def test_subplots_warnings(self):
@pytest.mark.parametrize("idx", [range(5), date_range("1/1/2000", periods=5)])
def test_subplots_warnings(self, idx):
# GH 9464
with tm.assert_produces_warning(None):
df = DataFrame(np.random.randn(100, 4))
df.plot(subplots=True, layout=(3, 2))

df = DataFrame(
np.random.randn(100, 4), index=date_range("1/1/2000", periods=100)
)
df = DataFrame(np.random.randn(5, 4), index=idx)
df.plot(subplots=True, layout=(3, 2))

def test_subplots_multiple_axes(self):
Expand Down
46 changes: 30 additions & 16 deletions pandas/tests/plotting/test_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,44 +218,58 @@ def test_andrews_curves_handle(self):
_check_colors(handles, linecolors=colors)

@pytest.mark.slow
def test_parallel_coordinates(self, iris):
from matplotlib import cm

@pytest.mark.parametrize(
"color",
[("#556270", "#4ECDC4", "#C7F464"), ["dodgerblue", "aquamarine", "seagreen"]],
)
def test_parallel_coordinates_colors(self, iris, color):
from pandas.plotting import parallel_coordinates

df = iris

ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name")
nlines = len(ax.get_lines())
nxticks = len(ax.xaxis.get_ticklabels())

rgba = ("#556270", "#4ECDC4", "#C7F464")
ax = _check_plot_works(
parallel_coordinates, frame=df, class_column="Name", color=rgba
parallel_coordinates, frame=df, class_column="Name", color=color
)
_check_colors(ax.get_lines()[:10], linecolors=rgba, mapping=df["Name"][:10])
_check_colors(ax.get_lines()[:10], linecolors=color, mapping=df["Name"][:10])

cnames = ["dodgerblue", "aquamarine", "seagreen"]
ax = _check_plot_works(
parallel_coordinates, frame=df, class_column="Name", color=cnames
)
_check_colors(ax.get_lines()[:10], linecolors=cnames, mapping=df["Name"][:10])
@pytest.mark.slow
def test_parallel_coordinates_cmap(self, iris):
from matplotlib import cm

from pandas.plotting import parallel_coordinates

df = iris

ax = _check_plot_works(
parallel_coordinates, frame=df, class_column="Name", colormap=cm.jet
)
cmaps = [cm.jet(n) for n in np.linspace(0, 1, df["Name"].nunique())]
_check_colors(ax.get_lines()[:10], linecolors=cmaps, mapping=df["Name"][:10])

@pytest.mark.slow
def test_parallel_coordinates_line_diff(self, iris):
from pandas.plotting import parallel_coordinates

df = iris

ax = _check_plot_works(parallel_coordinates, frame=df, class_column="Name")
nlines = len(ax.get_lines())
nxticks = len(ax.xaxis.get_ticklabels())

ax = _check_plot_works(
parallel_coordinates, frame=df, class_column="Name", axvlines=False
)
assert len(ax.get_lines()) == (nlines - nxticks)

@pytest.mark.slow
def test_parallel_coordinates_handles(self, iris):
from pandas.plotting import parallel_coordinates

df = iris
colors = ["b", "g", "r"]
df = DataFrame({"A": [1, 2, 3], "B": [1, 2, 3], "C": [1, 2, 3], "Name": colors})
ax = parallel_coordinates(df, "Name", color=colors)
handles, labels = ax.get_legend_handles_labels()
handles, _ = ax.get_legend_handles_labels()
_check_colors(handles, linecolors=colors)

# not sure if this is indicative of a problem
Expand Down