Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PERF: Allow np.integer Series/Index to convert to RangeIndex #58016

Merged
10 commits merged on Apr 1, 2024
15 changes: 10 additions & 5 deletions pandas/core/indexes/base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7157,17 +7157,22 @@ def maybe_sequence_to_range(sequence) -> Any | range:
-------
Any : input or range
"""
if isinstance(sequence, (ABCSeries, Index, range, ExtensionArray)):
if isinstance(sequence, (range, ExtensionArray)):
return sequence
elif len(sequence) == 1 or lib.infer_dtype(sequence, skipna=False) != "integer":
return sequence
elif len(sequence) == 0:
elif isinstance(sequence, (ABCSeries, Index)) and not (
isinstance(sequence.dtype, np.dtype) and sequence.dtype.kind == "i"
):
return sequence
if len(sequence) == 0:
return range(0)
diff = sequence[1] - sequence[0]
np_sequence = np.asarray(sequence, dtype=np.int64)
diff = np_sequence[1] - np_sequence[0]
if diff == 0:
return sequence
elif len(sequence) == 2 or lib.is_sequence_range(np.asarray(sequence), diff):
return range(sequence[0], sequence[-1] + diff, diff)
elif len(sequence) == 2 or lib.is_sequence_range(np_sequence, diff):
return range(np_sequence[0], np_sequence[-1] + diff, diff)
else:
return sequence

Expand Down
2 changes: 1 addition & 1 deletion pandas/tests/frame/methods/test_set_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -148,7 +148,7 @@ def test_set_index_dst(self):

def test_set_index(self, float_string_frame):
df = float_string_frame
idx = Index(np.arange(len(df))[::-1])
idx = Index(np.arange(len(df) - 1, -1, -1, dtype=np.int64))

df = df.set_index(idx)
tm.assert_index_equal(df.index, idx)
Expand Down
22 changes: 6 additions & 16 deletions pandas/tests/io/test_stata.py
Original file line number Diff line number Diff line change
Expand Up @@ -513,7 +513,6 @@ def test_read_write_reread_dta14(self, file, parsed_114, version, datapath):
written_and_read_again = self.read_dta(path)

expected = parsed_114.copy()
expected.index = expected.index.astype(np.int32)
tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)

@pytest.mark.parametrize(
Expand Down Expand Up @@ -576,7 +575,6 @@ def test_numeric_column_names(self):
written_and_read_again.columns = map(convert_col_name, columns)

expected = original
expected.index = expected.index.astype(np.int32)
tm.assert_frame_equal(expected, written_and_read_again)

@pytest.mark.parametrize("version", [114, 117, 118, 119, None])
Expand All @@ -594,7 +592,6 @@ def test_nan_to_missing_value(self, version):

written_and_read_again = written_and_read_again.set_index("index")
expected = original
expected.index = expected.index.astype(np.int32)
tm.assert_frame_equal(written_and_read_again, expected)

def test_no_index(self):
Expand All @@ -617,7 +614,6 @@ def test_string_no_dates(self):
written_and_read_again = self.read_dta(path)

expected = original
expected.index = expected.index.astype(np.int32)
tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)

def test_large_value_conversion(self):
Expand All @@ -637,7 +633,6 @@ def test_large_value_conversion(self):
modified["s1"] = Series(modified["s1"], dtype=np.int16)
modified["s2"] = Series(modified["s2"], dtype=np.int32)
modified["s3"] = Series(modified["s3"], dtype=np.float64)
modified.index = original.index.astype(np.int32)
tm.assert_frame_equal(written_and_read_again.set_index("index"), modified)

def test_dates_invalid_column(self):
Expand Down Expand Up @@ -713,7 +708,7 @@ def test_write_missing_strings(self):

expected = DataFrame(
[["1"], [""]],
index=pd.Index([0, 1], dtype=np.int32, name="index"),
index=pd.RangeIndex(2, name="index"),
columns=["foo"],
)

Expand Down Expand Up @@ -746,7 +741,6 @@ def test_bool_uint(self, byteorder, version):
written_and_read_again = written_and_read_again.set_index("index")

expected = original
expected.index = expected.index.astype(np.int32)
expected_types = (
np.int8,
np.int8,
Expand Down Expand Up @@ -1030,7 +1024,7 @@ def test_categorical_writing(self, version):
res = written_and_read_again.set_index("index")

expected = original
expected.index = expected.index.set_names("index").astype(np.int32)
expected.index = expected.index.set_names("index")

expected["incompletely_labeled"] = expected["incompletely_labeled"].apply(str)
expected["unlabeled"] = expected["unlabeled"].apply(str)
Expand Down Expand Up @@ -1094,7 +1088,6 @@ def test_categorical_with_stata_missing_values(self, version):
new_cats = cat.remove_unused_categories().categories
cat = cat.set_categories(new_cats, ordered=True)
expected[col] = cat
expected.index = expected.index.astype(np.int32)
tm.assert_frame_equal(res, expected)

@pytest.mark.parametrize("file", ["stata10_115", "stata10_117"])
Expand Down Expand Up @@ -1544,7 +1537,6 @@ def test_out_of_range_float(self):

original["ColumnTooBig"] = original["ColumnTooBig"].astype(np.float64)
expected = original
expected.index = expected.index.astype(np.int32)
tm.assert_frame_equal(reread.set_index("index"), expected)

@pytest.mark.parametrize("infval", [np.inf, -np.inf])
Expand Down Expand Up @@ -1669,7 +1661,6 @@ def test_writer_117(self):
original["int32"] = original["int32"].astype(np.int32)
original["float32"] = Series(original["float32"], dtype=np.float32)
original.index.name = "index"
original.index = original.index.astype(np.int32)
copy = original.copy()
with tm.ensure_clean() as path:
original.to_stata(
Expand Down Expand Up @@ -1962,7 +1953,7 @@ def test_read_write_ea_dtypes(self, dtype_backend):
# stata stores with ms unit, so unit does not round-trip exactly
"e": pd.date_range("2020-12-31", periods=3, freq="D", unit="ms"),
},
index=pd.Index([0, 1, 2], name="index", dtype=np.int32),
index=pd.RangeIndex(range(3), name="index"),
)

tm.assert_frame_equal(written_and_read_again.set_index("index"), expected)
Expand Down Expand Up @@ -2049,7 +2040,6 @@ def test_compression(compression, version, use_dict, infer, compression_to_exten
reread = read_stata(fp, index_col="index")

expected = df
expected.index = expected.index.astype(np.int32)
tm.assert_frame_equal(reread, expected)


Expand All @@ -2075,7 +2065,6 @@ def test_compression_dict(method, file_ext):
reread = read_stata(fp, index_col="index")

expected = df
expected.index = expected.index.astype(np.int32)
tm.assert_frame_equal(reread, expected)


Expand All @@ -2085,7 +2074,6 @@ def test_chunked_categorical(version):
df.index.name = "index"

expected = df.copy()
expected.index = expected.index.astype(np.int32)

with tm.ensure_clean() as path:
df.to_stata(path, version=version)
Expand All @@ -2094,7 +2082,9 @@ def test_chunked_categorical(version):
block = block.set_index("index")
assert "cats" in block
tm.assert_series_equal(
block.cats, expected.cats.iloc[2 * i : 2 * (i + 1)]
block.cats,
expected.cats.iloc[2 * i : 2 * (i + 1)],
check_index_type=len(block) > 1,
)


Expand Down
19 changes: 12 additions & 7 deletions pandas/tests/reshape/merge/test_merge.py
Original file line number Diff line number Diff line change
Expand Up @@ -2192,23 +2192,28 @@ def test_merge_on_indexes(self, how, sort, expected):

@pytest.mark.parametrize(
"index",
[Index([1, 2], dtype=dtyp, name="index_col") for dtyp in tm.ALL_REAL_NUMPY_DTYPES]
[
Index([1, 2, 4], dtype=dtyp, name="index_col")
for dtyp in tm.ALL_REAL_NUMPY_DTYPES
]
+ [
CategoricalIndex(["A", "B"], categories=["A", "B"], name="index_col"),
RangeIndex(start=0, stop=2, name="index_col"),
DatetimeIndex(["2018-01-01", "2018-01-02"], name="index_col"),
CategoricalIndex(["A", "B", "C"], categories=["A", "B", "C"], name="index_col"),
RangeIndex(start=0, stop=3, name="index_col"),
DatetimeIndex(["2018-01-01", "2018-01-02", "2018-01-03"], name="index_col"),
],
ids=lambda x: f"{type(x).__name__}[{x.dtype}]",
)
def test_merge_index_types(index):
# gh-20777
# assert key access is consistent across index types
left = DataFrame({"left_data": [1, 2]}, index=index)
right = DataFrame({"right_data": [1.0, 2.0]}, index=index)
left = DataFrame({"left_data": [1, 2, 3]}, index=index)
right = DataFrame({"right_data": [1.0, 2.0, 3.0]}, index=index)

result = left.merge(right, on=["index_col"])

expected = DataFrame({"left_data": [1, 2], "right_data": [1.0, 2.0]}, index=index)
expected = DataFrame(
{"left_data": [1, 2, 3], "right_data": [1.0, 2.0, 3.0]}, index=index
)
tm.assert_frame_equal(result, expected)


Expand Down