Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

TST/CLN: Remove makeStringIndex #56155

Merged
13 commits merged on Nov 27, 2023
15 changes: 9 additions & 6 deletions asv_bench/benchmarks/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,8 +4,6 @@

import pandas as pd

from .pandas_vb_common import tm

for imp in ["pandas.util", "pandas.tools.hashing"]:
try:
hashing = import_module(imp)
Expand Down Expand Up @@ -47,9 +45,12 @@ def setup(self, unique, sort, dtype):
elif dtype == "datetime64[ns, tz]":
data = pd.date_range("2011-01-01", freq="h", periods=N, tz="Asia/Tokyo")
elif dtype == "object_str":
data = tm.makeStringIndex(N)
data = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
elif dtype == "string[pyarrow]":
data = pd.array(tm.makeStringIndex(N), dtype="string[pyarrow]")
data = pd.array(
pd.Index([f"i-{i}" for i in range(N)], dtype=object),
dtype="string[pyarrow]",
)
else:
raise NotImplementedError

Expand Down Expand Up @@ -88,7 +89,7 @@ def setup(self, unique, keep, dtype):
elif dtype == "float64":
data = pd.Index(np.random.randn(N), dtype="float64")
elif dtype == "string":
data = tm.makeStringIndex(N)
data = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
elif dtype == "datetime64[ns]":
data = pd.date_range("2011-01-01", freq="h", periods=N)
elif dtype == "datetime64[ns, tz]":
Expand Down Expand Up @@ -136,7 +137,9 @@ def setup_cache(self):
df = pd.DataFrame(
{
"strings": pd.Series(
tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=N))
pd.Index([f"i-{i}" for i in range(10000)], dtype=object).take(
np.random.randint(0, 10000, size=N)
)
),
"floats": np.random.randn(N),
"ints": np.arange(N),
Expand Down
6 changes: 3 additions & 3 deletions asv_bench/benchmarks/algos/isin.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,6 @@
date_range,
)

from ..pandas_vb_common import tm


class IsIn:
params = [
Expand Down Expand Up @@ -60,7 +58,9 @@ def setup(self, dtype):

elif dtype in ["str", "string[python]", "string[pyarrow]"]:
try:
self.series = Series(tm.makeStringIndex(N), dtype=dtype)
self.series = Series(
Index([f"i-{i}" for i in range(N)], dtype=object), dtype=dtype
)
except ImportError:
raise NotImplementedError
self.values = list(self.series[:2])
Expand Down
4 changes: 1 addition & 3 deletions asv_bench/benchmarks/ctors.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,6 @@
date_range,
)

from .pandas_vb_common import tm


def no_change(arr):
return arr
Expand Down Expand Up @@ -115,7 +113,7 @@ def time_dtindex_from_index_with_series(self):
class MultiIndexConstructor:
def setup(self):
N = 10**4
self.iterables = [tm.makeStringIndex(N), range(20)]
self.iterables = [Index([f"i-{i}" for i in range(N)], dtype=object), range(20)]

def time_multiindex_from_iterables(self):
MultiIndex.from_product(self.iterables)
Expand Down
9 changes: 6 additions & 3 deletions asv_bench/benchmarks/dtypes.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,10 @@
import numpy as np

import pandas as pd
from pandas import DataFrame
from pandas import (
DataFrame,
Index,
)
import pandas._testing as tm
from pandas.api.types import (
is_extension_array_dtype,
Expand Down Expand Up @@ -73,8 +76,8 @@ class SelectDtypes:

def setup(self, dtype):
N, K = 5000, 50
self.index = tm.makeStringIndex(N)
self.columns = tm.makeStringIndex(K)
self.index = Index([f"i-{i}" for i in range(N)], dtype=object)
self.columns = Index([f"i-{i}" for i in range(K)], dtype=object)

def create_df(data):
return DataFrame(data, index=self.index, columns=self.columns)
Expand Down
6 changes: 2 additions & 4 deletions asv_bench/benchmarks/frame_ctor.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
date_range,
)

from .pandas_vb_common import tm

try:
from pandas.tseries.offsets import (
Hour,
Expand All @@ -30,8 +28,8 @@
class FromDicts:
def setup(self):
N, K = 5000, 50
self.index = tm.makeStringIndex(N)
self.columns = tm.makeStringIndex(K)
self.index = pd.Index([f"i-{i}" for i in range(N)], dtype=object)
self.columns = pd.Index([f"i-{i}" for i in range(K)], dtype=object)
frame = DataFrame(np.random.randn(N, K), index=self.index, columns=self.columns)
self.data = frame.to_dict()
self.dict_list = frame.to_dict(orient="records")
Expand Down
11 changes: 7 additions & 4 deletions asv_bench/benchmarks/frame_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@

from pandas import (
DataFrame,
Index,
MultiIndex,
NaT,
Series,
Expand All @@ -14,8 +15,6 @@
timedelta_range,
)

from .pandas_vb_common import tm


class AsType:
params = [
Expand Down Expand Up @@ -703,8 +702,12 @@ def setup(self, monotonic):
K = 10
df = DataFrame(
{
"key1": tm.makeStringIndex(N).values.repeat(K),
"key2": tm.makeStringIndex(N).values.repeat(K),
"key1": Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(
K
),
"key2": Index([f"i-{i}" for i in range(N)], dtype=object).values.repeat(
K
),
"value": np.random.randn(N * K),
}
)
Expand Down
6 changes: 2 additions & 4 deletions asv_bench/benchmarks/gil.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,15 +5,14 @@

from pandas import (
DataFrame,
Index,
Series,
date_range,
factorize,
read_csv,
)
from pandas.core.algorithms import take_nd

from .pandas_vb_common import tm

try:
from pandas import (
rolling_kurt,
Expand All @@ -34,7 +33,6 @@
except ImportError:
from pandas import algos


from .pandas_vb_common import BaseIO # isort:skip


Expand Down Expand Up @@ -305,7 +303,7 @@ class ParallelFactorize:
param_names = ["threads"]

def setup(self, threads):
strings = tm.makeStringIndex(100000)
strings = Index([f"i-{i}" for i in range(100000)], dtype=object)

@test_parallel(num_threads=threads)
def parallel():
Expand Down
12 changes: 7 additions & 5 deletions asv_bench/benchmarks/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,8 +17,6 @@
to_timedelta,
)

from .pandas_vb_common import tm

method_blocklist = {
"object": {
"diff",
Expand Down Expand Up @@ -167,10 +165,14 @@ def setup_cache(self):
"int64_small": Series(np.random.randint(0, 100, size=size)),
"int64_large": Series(np.random.randint(0, 10000, size=size)),
"object_small": Series(
tm.makeStringIndex(100).take(np.random.randint(0, 100, size=size))
Index([f"i-{i}" for i in range(100)], dtype=object).take(
np.random.randint(0, 100, size=size)
)
),
"object_large": Series(
tm.makeStringIndex(10000).take(np.random.randint(0, 10000, size=size))
Index([f"i-{i}" for i in range(10000)], dtype=object).take(
np.random.randint(0, 10000, size=size)
)
),
}
return data
Expand Down Expand Up @@ -912,7 +914,7 @@ def setup(self):
n1 = 400
n2 = 250
index = MultiIndex(
levels=[np.arange(n1), tm.makeStringIndex(n2)],
levels=[np.arange(n1), Index([f"i-{i}" for i in range(n2)], dtype=object)],
codes=[np.repeat(range(n1), n2).tolist(), list(range(n2)) * n1],
names=["lev1", "lev2"],
)
Expand Down
11 changes: 7 additions & 4 deletions asv_bench/benchmarks/index_object.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,8 +12,6 @@
date_range,
)

from .pandas_vb_common import tm


class SetOperations:
params = (
Expand All @@ -30,7 +28,7 @@ def setup(self, index_structure, dtype, method):
date_str_left = Index(dates_left.strftime(fmt))
int_left = Index(np.arange(N))
ea_int_left = Index(np.arange(N), dtype="Int64")
str_left = tm.makeStringIndex(N)
str_left = Index([f"i-{i}" for i in range(N)], dtype=object)

data = {
"datetime": dates_left,
Expand Down Expand Up @@ -155,7 +153,12 @@ class Indexing:

def setup(self, dtype):
N = 10**6
self.idx = getattr(tm, f"make{dtype}Index")(N)
if dtype == "String":
self.idx = Index([f"i-{i}" for i in range(N)], dtype=object)
elif dtype == "Float":
self.idx = Index(np.arange(N), dtype=np.float64)
elif dtype == "Int":
self.idx = Index(np.arange(N), dtype=np.int64)
self.array_mask = (np.arange(N) % 3) == 0
self.series_mask = Series(self.array_mask)
self.sorted = self.idx.sort_values()
Expand Down
8 changes: 3 additions & 5 deletions asv_bench/benchmarks/indexing.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,8 +22,6 @@
period_range,
)

from .pandas_vb_common import tm


class NumericSeriesIndexing:
params = [
Expand Down Expand Up @@ -124,7 +122,7 @@ class NonNumericSeriesIndexing:
def setup(self, index, index_structure):
N = 10**6
if index == "string":
index = tm.makeStringIndex(N)
index = Index([f"i-{i}" for i in range(N)], dtype=object)
elif index == "datetime":
index = date_range("1900", periods=N, freq="s")
elif index == "period":
Expand Down Expand Up @@ -156,8 +154,8 @@ def time_getitem_list_like(self, index, index_structure):

class DataFrameStringIndexing:
def setup(self):
index = tm.makeStringIndex(1000)
columns = tm.makeStringIndex(30)
index = Index([f"i-{i}" for i in range(1000)], dtype=object)
columns = Index([f"i-{i}" for i in range(30)], dtype=object)
with warnings.catch_warnings(record=True):
self.df = DataFrame(np.random.randn(1000, 30), index=index, columns=columns)
self.idx_scalar = index[100]
Expand Down
8 changes: 3 additions & 5 deletions asv_bench/benchmarks/inference.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,6 +9,7 @@
import numpy as np

from pandas import (
Index,
NaT,
Series,
date_range,
Expand All @@ -17,10 +18,7 @@
to_timedelta,
)

from .pandas_vb_common import (
lib,
tm,
)
from .pandas_vb_common import lib


class ToNumeric:
Expand All @@ -31,7 +29,7 @@ def setup(self, errors):
N = 10000
self.float = Series(np.random.randn(N))
self.numstr = self.float.astype("str")
self.str = Series(tm.makeStringIndex(N))
self.str = Series(Index([f"i-{i}" for i in range(N)], dtype=object))

def time_from_float(self, errors):
to_numeric(self.float, errors=errors)
Expand Down
8 changes: 3 additions & 5 deletions asv_bench/benchmarks/io/csv.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,17 +10,15 @@
from pandas import (
Categorical,
DataFrame,
Index,
concat,
date_range,
period_range,
read_csv,
to_datetime,
)

from ..pandas_vb_common import (
BaseIO,
tm,
)
from ..pandas_vb_common import BaseIO


class ToCSV(BaseIO):
Expand Down Expand Up @@ -288,7 +286,7 @@ class ReadCSVSkipRows(BaseIO):

def setup(self, skiprows, engine):
N = 20000
index = tm.makeStringIndex(N)
index = Index([f"i-{i}" for i in range(N)], dtype=object)
df = DataFrame(
{
"float1": np.random.randn(N),
Expand Down
5 changes: 2 additions & 3 deletions asv_bench/benchmarks/io/excel.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,12 +12,11 @@
from pandas import (
DataFrame,
ExcelWriter,
Index,
date_range,
read_excel,
)

from ..pandas_vb_common import tm


def _generate_dataframe():
N = 2000
Expand All @@ -27,7 +26,7 @@ def _generate_dataframe():
columns=[f"float{i}" for i in range(C)],
index=date_range("20000101", periods=N, freq="h"),
)
df["object"] = tm.makeStringIndex(N)
df["object"] = Index([f"i-{i}" for i in range(N)], dtype=object)
return df


Expand Down